diff --git a/docs/chinese-llm-setup.md b/docs/chinese-llm-setup.md
index 1fb0ce2a1..6638dbba1 100644
--- a/docs/chinese-llm-setup.md
+++ b/docs/chinese-llm-setup.md
@@ -24,7 +24,7 @@ SurfSense 现已支持以下国产 LLM:
1. 登录 SurfSense Dashboard
2. 进入 **Settings** → **API Keys** (或 **LLM Configurations**)
-3. 点击 **Add LLM Model**
+3. 点击 **Add Model**
4. 从 **Provider** 下拉菜单中选择你的国产 LLM 提供商
5. 填写必填字段(见下方各提供商详细配置)
6. 点击 **Save**
diff --git a/package-lock.json b/package-lock.json
new file mode 100644
index 000000000..9703ac09f
--- /dev/null
+++ b/package-lock.json
@@ -0,0 +1,6 @@
+{
+ "name": "SurfSense",
+ "lockfileVersion": 3,
+ "requires": true,
+ "packages": {}
+}
diff --git a/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py b/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py
new file mode 100644
index 000000000..1fef9fbcb
--- /dev/null
+++ b/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py
@@ -0,0 +1,149 @@
+"""Add LOCAL_FOLDER_FILE document type, folder metadata, and document_versions table
+
+Revision ID: 118
+Revises: 117
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "118"
+down_revision: str | None = "117"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+PUBLICATION_NAME = "zero_publication"
+
+
+def upgrade() -> None:
+ conn = op.get_bind()
+
+ # Add LOCAL_FOLDER_FILE to documenttype enum
+ op.execute(
+ """
+ DO $$
+ BEGIN
+ IF NOT EXISTS (
+ SELECT 1 FROM pg_type t
+ JOIN pg_enum e ON t.oid = e.enumtypid
+ WHERE t.typname = 'documenttype' AND e.enumlabel = 'LOCAL_FOLDER_FILE'
+ ) THEN
+ ALTER TYPE documenttype ADD VALUE 'LOCAL_FOLDER_FILE';
+ END IF;
+ END
+ $$;
+ """
+ )
+
+ # Add JSONB metadata column to folders table
+ col_exists = conn.execute(
+ sa.text(
+ "SELECT 1 FROM information_schema.columns "
+ "WHERE table_name = 'folders' AND column_name = 'metadata'"
+ )
+ ).fetchone()
+ if not col_exists:
+ op.add_column(
+ "folders",
+ sa.Column("metadata", sa.dialects.postgresql.JSONB, nullable=True),
+ )
+
+ # Create document_versions table
+ table_exists = conn.execute(
+ sa.text(
+ "SELECT 1 FROM information_schema.tables WHERE table_name = 'document_versions'"
+ )
+ ).fetchone()
+ if not table_exists:
+ op.create_table(
+ "document_versions",
+ sa.Column("id", sa.Integer(), nullable=False, autoincrement=True),
+ sa.Column("document_id", sa.Integer(), nullable=False),
+ sa.Column("version_number", sa.Integer(), nullable=False),
+ sa.Column("source_markdown", sa.Text(), nullable=True),
+ sa.Column("content_hash", sa.String(), nullable=False),
+ sa.Column("title", sa.String(), nullable=True),
+ sa.Column(
+ "created_at",
+ sa.TIMESTAMP(timezone=True),
+ server_default=sa.text("now()"),
+ nullable=False,
+ ),
+ sa.ForeignKeyConstraint(
+ ["document_id"],
+ ["documents.id"],
+ ondelete="CASCADE",
+ ),
+ sa.PrimaryKeyConstraint("id"),
+ sa.UniqueConstraint(
+ "document_id",
+ "version_number",
+ name="uq_document_version",
+ ),
+ )
+
+ op.execute(
+ "CREATE INDEX IF NOT EXISTS ix_document_versions_document_id "
+ "ON document_versions (document_id)"
+ )
+ op.execute(
+ "CREATE INDEX IF NOT EXISTS ix_document_versions_created_at "
+ "ON document_versions (created_at)"
+ )
+
+ # Add document_versions to Zero publication
+ pub_exists = conn.execute(
+ sa.text("SELECT 1 FROM pg_publication WHERE pubname = :name"),
+ {"name": PUBLICATION_NAME},
+ ).fetchone()
+ if pub_exists:
+ already_in_pub = conn.execute(
+ sa.text(
+ "SELECT 1 FROM pg_publication_tables "
+ "WHERE pubname = :name AND tablename = 'document_versions'"
+ ),
+ {"name": PUBLICATION_NAME},
+ ).fetchone()
+ if not already_in_pub:
+ op.execute(
+ f"ALTER PUBLICATION {PUBLICATION_NAME} ADD TABLE document_versions"
+ )
+
+
+def downgrade() -> None:
+ conn = op.get_bind()
+
+ # Remove from publication
+ pub_exists = conn.execute(
+ sa.text("SELECT 1 FROM pg_publication WHERE pubname = :name"),
+ {"name": PUBLICATION_NAME},
+ ).fetchone()
+ if pub_exists:
+ already_in_pub = conn.execute(
+ sa.text(
+ "SELECT 1 FROM pg_publication_tables "
+ "WHERE pubname = :name AND tablename = 'document_versions'"
+ ),
+ {"name": PUBLICATION_NAME},
+ ).fetchone()
+ if already_in_pub:
+ op.execute(
+ f"ALTER PUBLICATION {PUBLICATION_NAME} DROP TABLE document_versions"
+ )
+
+ op.execute("DROP INDEX IF EXISTS ix_document_versions_created_at")
+ op.execute("DROP INDEX IF EXISTS ix_document_versions_document_id")
+ op.execute("DROP TABLE IF EXISTS document_versions")
+
+ # Drop metadata column from folders
+ col_exists = conn.execute(
+ sa.text(
+ "SELECT 1 FROM information_schema.columns "
+ "WHERE table_name = 'folders' AND column_name = 'metadata'"
+ )
+ ).fetchone()
+ if col_exists:
+ op.drop_column("folders", "metadata")
diff --git a/surfsense_backend/alembic/versions/51_add_new_llm_config_table.py b/surfsense_backend/alembic/versions/51_add_new_llm_config_table.py
index 89a5c1246..7d90f4b13 100644
--- a/surfsense_backend/alembic/versions/51_add_new_llm_config_table.py
+++ b/surfsense_backend/alembic/versions/51_add_new_llm_config_table.py
@@ -17,10 +17,10 @@ depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
"""
- Add the new_llm_configs table that combines LLM model settings with prompt configuration.
+ Add the new_llm_configs table that combines model settings with prompt configuration.
This table includes:
- - LLM model configuration (provider, model_name, api_key, etc.)
+ - Model configuration (provider, model_name, api_key, etc.)
- Configurable system instructions
- Citation toggle
"""
@@ -41,7 +41,7 @@ def upgrade() -> None:
name VARCHAR(100) NOT NULL,
description VARCHAR(500),
- -- LLM Model Configuration (same as llm_configs, excluding language)
+ -- Model Configuration (same as llm_configs, excluding language)
provider litellmprovider NOT NULL,
custom_provider VARCHAR(100),
model_name VARCHAR(100) NOT NULL,
diff --git a/surfsense_backend/app/config/global_llm_config.example.yaml b/surfsense_backend/app/config/global_llm_config.example.yaml
index 6ca3e95e3..49a8d0295 100644
--- a/surfsense_backend/app/config/global_llm_config.example.yaml
+++ b/surfsense_backend/app/config/global_llm_config.example.yaml
@@ -17,7 +17,7 @@
# - Configure router_settings below to customize the load balancing behavior
#
# Structure matches NewLLMConfig:
-# - LLM model configuration (provider, model_name, api_key, etc.)
+# - Model configuration (provider, model_name, api_key, etc.)
# - Prompt configuration (system_instructions, citations_enabled)
# Router Settings for Auto Mode
diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py
index 90630cc83..077b7daa6 100644
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@@ -64,6 +64,7 @@ class DocumentType(StrEnum):
COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"
COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR"
COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"
+ LOCAL_FOLDER_FILE = "LOCAL_FOLDER_FILE"
# Native Google document types → their legacy Composio equivalents.
@@ -955,6 +956,7 @@ class Folder(BaseModel, TimestampMixin):
onupdate=lambda: datetime.now(UTC),
index=True,
)
+ folder_metadata = Column("metadata", JSONB, nullable=True)
parent = relationship("Folder", remote_side="Folder.id", backref="children")
search_space = relationship("SearchSpace", back_populates="folders")
@@ -1039,6 +1041,26 @@ class Document(BaseModel, TimestampMixin):
)
+class DocumentVersion(BaseModel, TimestampMixin):
+ __tablename__ = "document_versions"
+ __table_args__ = (
+ UniqueConstraint("document_id", "version_number", name="uq_document_version"),
+ )
+
+ document_id = Column(
+ Integer,
+ ForeignKey("documents.id", ondelete="CASCADE"),
+ nullable=False,
+ index=True,
+ )
+ version_number = Column(Integer, nullable=False)
+ source_markdown = Column(Text, nullable=True)
+ content_hash = Column(String, nullable=False)
+ title = Column(String, nullable=True)
+
+ document = relationship("Document", backref="versions")
+
+
class Chunk(BaseModel, TimestampMixin):
__tablename__ = "chunks"
diff --git a/surfsense_backend/app/indexing_pipeline/exceptions.py b/surfsense_backend/app/indexing_pipeline/exceptions.py
index 9155e9baa..666fa4b9f 100644
--- a/surfsense_backend/app/indexing_pipeline/exceptions.py
+++ b/surfsense_backend/app/indexing_pipeline/exceptions.py
@@ -59,7 +59,7 @@ class PipelineMessages:
LLM_AUTH = "LLM authentication failed. Check your API key."
LLM_PERMISSION = "LLM request denied. Check your account permissions."
- LLM_NOT_FOUND = "LLM model not found. Check your model configuration."
+ LLM_NOT_FOUND = "Model not found. Check your model configuration."
LLM_BAD_REQUEST = "LLM rejected the request. Document content may be invalid."
LLM_UNPROCESSABLE = (
"Document exceeds the LLM context window even after optimization."
@@ -67,7 +67,7 @@ class PipelineMessages:
LLM_RESPONSE = "LLM returned an invalid response."
LLM_AUTH = "LLM authentication failed. Check your API key."
LLM_PERMISSION = "LLM request denied. Check your account permissions."
- LLM_NOT_FOUND = "LLM model not found. Check your model configuration."
+ LLM_NOT_FOUND = "Model not found. Check your model configuration."
LLM_BAD_REQUEST = "LLM rejected the request. Document content may be invalid."
LLM_UNPROCESSABLE = (
"Document exceeds the LLM context window even after optimization."
diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py
index 1937f11cb..efa0ff2f6 100644
--- a/surfsense_backend/app/routes/__init__.py
+++ b/surfsense_backend/app/routes/__init__.py
@@ -84,7 +84,7 @@ router.include_router(confluence_add_connector_router)
router.include_router(clickup_add_connector_router)
router.include_router(dropbox_add_connector_router)
router.include_router(new_llm_config_router) # LLM configs with prompt configuration
-router.include_router(model_list_router) # Dynamic LLM model catalogue from OpenRouter
+router.include_router(model_list_router) # Dynamic model catalogue from OpenRouter
router.include_router(logs_router)
router.include_router(circleback_webhook_router) # Circleback meeting webhooks
router.include_router(surfsense_docs_router) # Surfsense documentation for citations
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index f53c81bb6..5008b1a10 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -2,6 +2,7 @@
import asyncio
from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
+from pydantic import BaseModel as PydanticBaseModel
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
@@ -10,6 +11,8 @@ from app.db import (
Chunk,
Document,
DocumentType,
+ DocumentVersion,
+ Folder,
Permission,
SearchSpace,
SearchSpaceMembership,
@@ -27,6 +30,7 @@ from app.schemas import (
DocumentTitleSearchResponse,
DocumentUpdate,
DocumentWithChunksRead,
+ FolderRead,
PaginatedResponse,
)
from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher
@@ -957,6 +961,39 @@ async def get_document_by_chunk_id(
) from e
+@router.get("/documents/watched-folders", response_model=list[FolderRead])
+async def get_watched_folders(
+ search_space_id: int,
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """Return root folders that are marked as watched (metadata->>'watched' = 'true')."""
+ await check_permission(
+ session,
+ user,
+ search_space_id,
+ Permission.DOCUMENTS_READ.value,
+ "You don't have permission to read documents in this search space",
+ )
+
+ folders = (
+ (
+ await session.execute(
+ select(Folder).where(
+ Folder.search_space_id == search_space_id,
+ Folder.parent_id.is_(None),
+ Folder.folder_metadata.isnot(None),
+ Folder.folder_metadata["watched"].astext == "true",
+ )
+ )
+ )
+ .scalars()
+ .all()
+ )
+
+ return folders
+
+
@router.get(
"/documents/{document_id}/chunks",
response_model=PaginatedResponse[ChunkRead],
@@ -1212,3 +1249,297 @@ async def delete_document(
raise HTTPException(
status_code=500, detail=f"Failed to delete document: {e!s}"
) from e
+
+
+# ====================================================================
+# Version History Endpoints
+# ====================================================================
+
+
+@router.get("/documents/{document_id}/versions")
+async def list_document_versions(
+ document_id: int,
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """List all versions for a document, ordered by version_number descending."""
+ document = (
+ await session.execute(select(Document).where(Document.id == document_id))
+ ).scalar_one_or_none()
+ if not document:
+ raise HTTPException(status_code=404, detail="Document not found")
+
+ await check_permission(
+ session, user, document.search_space_id, Permission.DOCUMENTS_READ.value
+ )
+
+ versions = (
+ (
+ await session.execute(
+ select(DocumentVersion)
+ .where(DocumentVersion.document_id == document_id)
+ .order_by(DocumentVersion.version_number.desc())
+ )
+ )
+ .scalars()
+ .all()
+ )
+
+ return [
+ {
+ "version_number": v.version_number,
+ "title": v.title,
+ "content_hash": v.content_hash,
+ "created_at": v.created_at.isoformat() if v.created_at else None,
+ }
+ for v in versions
+ ]
+
+
+@router.get("/documents/{document_id}/versions/{version_number}")
+async def get_document_version(
+ document_id: int,
+ version_number: int,
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """Get full version content including source_markdown."""
+ document = (
+ await session.execute(select(Document).where(Document.id == document_id))
+ ).scalar_one_or_none()
+ if not document:
+ raise HTTPException(status_code=404, detail="Document not found")
+
+ await check_permission(
+ session, user, document.search_space_id, Permission.DOCUMENTS_READ.value
+ )
+
+ version = (
+ await session.execute(
+ select(DocumentVersion).where(
+ DocumentVersion.document_id == document_id,
+ DocumentVersion.version_number == version_number,
+ )
+ )
+ ).scalar_one_or_none()
+ if not version:
+ raise HTTPException(status_code=404, detail="Version not found")
+
+ return {
+ "version_number": version.version_number,
+ "title": version.title,
+ "content_hash": version.content_hash,
+ "source_markdown": version.source_markdown,
+ "created_at": version.created_at.isoformat() if version.created_at else None,
+ }
+
+
+@router.post("/documents/{document_id}/versions/{version_number}/restore")
+async def restore_document_version(
+ document_id: int,
+ version_number: int,
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """Restore a previous version: snapshot current state, then overwrite document content."""
+ document = (
+ await session.execute(select(Document).where(Document.id == document_id))
+ ).scalar_one_or_none()
+ if not document:
+ raise HTTPException(status_code=404, detail="Document not found")
+
+ await check_permission(
+ session, user, document.search_space_id, Permission.DOCUMENTS_UPDATE.value
+ )
+
+ version = (
+ await session.execute(
+ select(DocumentVersion).where(
+ DocumentVersion.document_id == document_id,
+ DocumentVersion.version_number == version_number,
+ )
+ )
+ ).scalar_one_or_none()
+ if not version:
+ raise HTTPException(status_code=404, detail="Version not found")
+
+ # Snapshot current state before restoring
+ from app.utils.document_versioning import create_version_snapshot
+
+ await create_version_snapshot(session, document)
+
+ # Restore the version's content onto the document
+ document.source_markdown = version.source_markdown
+ document.title = version.title or document.title
+ document.content_needs_reindexing = True
+ await session.commit()
+
+ from app.tasks.celery_tasks.document_reindex_tasks import reindex_document_task
+
+ reindex_document_task.delay(document_id, str(user.id))
+
+ return {
+ "message": f"Restored version {version_number}",
+ "document_id": document_id,
+ "restored_version": version_number,
+ }
+
+
+# ===== Local folder indexing endpoints =====
+
+
+class FolderIndexRequest(PydanticBaseModel):
+ folder_path: str
+ folder_name: str
+ search_space_id: int
+ exclude_patterns: list[str] | None = None
+ file_extensions: list[str] | None = None
+ root_folder_id: int | None = None
+ enable_summary: bool = False
+
+
+class FolderIndexFilesRequest(PydanticBaseModel):
+ folder_path: str
+ folder_name: str
+ search_space_id: int
+ target_file_paths: list[str]
+ root_folder_id: int | None = None
+ enable_summary: bool = False
+
+
+@router.post("/documents/folder-index")
+async def folder_index(
+ request: FolderIndexRequest,
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """Full-scan index of a local folder. Creates the root Folder row synchronously
+ and dispatches the heavy indexing work to a Celery task.
+    Returns the root_folder_id so the desktop app can persist it.
+ """
+ from app.config import config as app_config
+
+ if not app_config.is_self_hosted():
+ raise HTTPException(
+ status_code=400,
+ detail="Local folder indexing is only available in self-hosted mode",
+ )
+
+ await check_permission(
+ session,
+ user,
+ request.search_space_id,
+ Permission.DOCUMENTS_CREATE.value,
+ "You don't have permission to create documents in this search space",
+ )
+
+ watched_metadata = {
+ "watched": True,
+ "folder_path": request.folder_path,
+ "exclude_patterns": request.exclude_patterns,
+ "file_extensions": request.file_extensions,
+ }
+
+ root_folder_id = request.root_folder_id
+ if root_folder_id:
+ existing = (
+ await session.execute(select(Folder).where(Folder.id == root_folder_id))
+ ).scalar_one_or_none()
+ if not existing:
+ root_folder_id = None
+ else:
+ existing.folder_metadata = watched_metadata
+ await session.commit()
+
+ if not root_folder_id:
+ root_folder = Folder(
+ name=request.folder_name,
+ search_space_id=request.search_space_id,
+ created_by_id=str(user.id),
+ position="a0",
+ folder_metadata=watched_metadata,
+ )
+ session.add(root_folder)
+ await session.flush()
+ root_folder_id = root_folder.id
+ await session.commit()
+
+ from app.tasks.celery_tasks.document_tasks import index_local_folder_task
+
+ index_local_folder_task.delay(
+ search_space_id=request.search_space_id,
+ user_id=str(user.id),
+ folder_path=request.folder_path,
+ folder_name=request.folder_name,
+ exclude_patterns=request.exclude_patterns,
+ file_extensions=request.file_extensions,
+ root_folder_id=root_folder_id,
+ enable_summary=request.enable_summary,
+ )
+
+ return {
+ "message": "Folder indexing started",
+ "status": "processing",
+ "root_folder_id": root_folder_id,
+ }
+
+
+@router.post("/documents/folder-index-files")
+async def folder_index_files(
+ request: FolderIndexFilesRequest,
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """Index multiple files within a watched folder (batched chokidar trigger).
+ Validates that all target_file_paths are under folder_path.
+ Dispatches a single Celery task that processes them in parallel.
+ """
+ from app.config import config as app_config
+
+ if not app_config.is_self_hosted():
+ raise HTTPException(
+ status_code=400,
+ detail="Local folder indexing is only available in self-hosted mode",
+ )
+
+ if not request.target_file_paths:
+ raise HTTPException(
+ status_code=400, detail="target_file_paths must not be empty"
+ )
+
+ await check_permission(
+ session,
+ user,
+ request.search_space_id,
+ Permission.DOCUMENTS_CREATE.value,
+ "You don't have permission to create documents in this search space",
+ )
+
+ from pathlib import Path
+
+ for fp in request.target_file_paths:
+ try:
+ Path(fp).relative_to(request.folder_path)
+ except ValueError as err:
+ raise HTTPException(
+ status_code=400,
+ detail=f"target_file_path {fp} must be inside folder_path",
+ ) from err
+
+ from app.tasks.celery_tasks.document_tasks import index_local_folder_task
+
+ index_local_folder_task.delay(
+ search_space_id=request.search_space_id,
+ user_id=str(user.id),
+ folder_path=request.folder_path,
+ folder_name=request.folder_name,
+ target_file_paths=request.target_file_paths,
+ root_folder_id=request.root_folder_id,
+ enable_summary=request.enable_summary,
+ )
+
+ return {
+ "message": f"Batch indexing started for {len(request.target_file_paths)} file(s)",
+ "status": "processing",
+ "file_count": len(request.target_file_paths),
+ }
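
Review note: the intended desktop flow is to call /documents/folder-index once, persist the returned root_folder_id, and then send chokidar batches to /documents/folder-index-files with that id. A rough client sketch using httpx; the base URL, route prefix, and bearer auth are placeholders, not confirmed by this diff:

```python
import httpx

BASE_URL = "http://localhost:8000/api/v1"      # placeholder
HEADERS = {"Authorization": "Bearer <token>"}   # placeholder auth scheme


def start_watching(folder_path: str, folder_name: str, search_space_id: int) -> int:
    """Kick off a full scan and return the root_folder_id to persist locally."""
    resp = httpx.post(
        f"{BASE_URL}/documents/folder-index",
        headers=HEADERS,
        json={
            "folder_path": folder_path,
            "folder_name": folder_name,
            "search_space_id": search_space_id,
        },
    )
    resp.raise_for_status()
    return resp.json()["root_folder_id"]


def sync_changed_files(
    folder_path: str,
    folder_name: str,
    search_space_id: int,
    root_folder_id: int,
    changed: list[str],
) -> None:
    """Send a chokidar batch; paths outside folder_path are rejected with 400."""
    resp = httpx.post(
        f"{BASE_URL}/documents/folder-index-files",
        headers=HEADERS,
        json={
            "folder_path": folder_path,
            "folder_name": folder_name,
            "search_space_id": search_space_id,
            "root_folder_id": root_folder_id,
            "target_file_paths": changed,
        },
    )
    resp.raise_for_status()
```
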
diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py
index 09a35c619..829b2cf69 100644
--- a/surfsense_backend/app/routes/editor_routes.py
+++ b/surfsense_backend/app/routes/editor_routes.py
@@ -128,9 +128,20 @@ async def get_editor_content(
chunk_contents = chunk_contents_result.scalars().all()
if not chunk_contents:
+ doc_status = document.status or {}
+ state = (
+ doc_status.get("state", "ready")
+ if isinstance(doc_status, dict)
+ else "ready"
+ )
+ if state in ("pending", "processing"):
+ raise HTTPException(
+ status_code=409,
+ detail="This document is still being processed. Please wait a moment and try again.",
+ )
raise HTTPException(
status_code=400,
- detail="This document has no content and cannot be edited. Please re-upload to enable editing.",
+ detail="This document has no viewable content yet. It may still be syncing. Try again in a few seconds, or re-upload if the issue persists.",
)
markdown_content = "\n\n".join(chunk_contents)
@@ -138,7 +149,7 @@ async def get_editor_content(
if not markdown_content.strip():
raise HTTPException(
status_code=400,
- detail="This document has empty content and cannot be edited.",
+ detail="This document appears to be empty. Try re-uploading or editing it to add content.",
)
document.source_markdown = markdown_content
diff --git a/surfsense_backend/app/routes/folders_routes.py b/surfsense_backend/app/routes/folders_routes.py
index d688e692a..2dc9bceac 100644
--- a/surfsense_backend/app/routes/folders_routes.py
+++ b/surfsense_backend/app/routes/folders_routes.py
@@ -192,6 +192,33 @@ async def get_folder_breadcrumb(
) from e
+@router.patch("/folders/{folder_id}/watched")
+async def stop_watching_folder(
+ folder_id: int,
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """Clear the watched flag from a folder's metadata."""
+ folder = await session.get(Folder, folder_id)
+ if not folder:
+ raise HTTPException(status_code=404, detail="Folder not found")
+
+ await check_permission(
+ session,
+ user,
+ folder.search_space_id,
+ Permission.DOCUMENTS_UPDATE.value,
+ "You don't have permission to update folders in this search space",
+ )
+
+ if folder.folder_metadata and isinstance(folder.folder_metadata, dict):
+ updated = {**folder.folder_metadata, "watched": False}
+ folder.folder_metadata = updated
+ await session.commit()
+
+ return {"message": "Folder watch status updated"}
+
+
@router.put("/folders/{folder_id}", response_model=FolderRead)
async def update_folder(
folder_id: int,
@@ -340,7 +367,7 @@ async def delete_folder(
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
- """Delete a folder and cascade-delete subfolders. Documents are async-deleted via Celery."""
+ """Mark documents for deletion and dispatch Celery to delete docs first, then folders."""
try:
folder = await session.get(Folder, folder_id)
if not folder:
@@ -372,30 +399,29 @@ async def delete_folder(
)
await session.commit()
- await session.execute(Folder.__table__.delete().where(Folder.id == folder_id))
- await session.commit()
+ try:
+ from app.tasks.celery_tasks.document_tasks import (
+ delete_folder_documents_task,
+ )
- if document_ids:
- try:
- from app.tasks.celery_tasks.document_tasks import (
- delete_folder_documents_task,
- )
-
- delete_folder_documents_task.delay(document_ids)
- except Exception as err:
+ delete_folder_documents_task.delay(
+ document_ids, folder_subtree_ids=list(subtree_ids)
+ )
+ except Exception as err:
+ if document_ids:
await session.execute(
Document.__table__.update()
.where(Document.id.in_(document_ids))
.values(status={"state": "ready"})
)
await session.commit()
- raise HTTPException(
- status_code=503,
- detail="Folder deleted but document cleanup could not be queued. Documents have been restored.",
- ) from err
+ raise HTTPException(
+ status_code=503,
+ detail="Could not queue folder deletion. Documents have been restored.",
+ ) from err
return {
- "message": "Folder deleted successfully",
+ "message": "Folder deletion started",
"documents_queued_for_deletion": len(document_ids),
}
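
Review note: stopping a watch is a metadata update rather than a delete, so the folder tree and its indexed documents stay in place. A matching client call, with the same placeholder base URL and auth as the sketch above:

```python
import httpx

BASE_URL = "http://localhost:8000/api/v1"      # placeholder
HEADERS = {"Authorization": "Bearer <token>"}   # placeholder auth scheme


def stop_watching(folder_id: int) -> None:
    """Flip metadata['watched'] to False; indexed content is left untouched."""
    resp = httpx.patch(f"{BASE_URL}/folders/{folder_id}/watched", headers=HEADERS)
    resp.raise_for_status()
```
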
diff --git a/surfsense_backend/app/routes/model_list_routes.py b/surfsense_backend/app/routes/model_list_routes.py
index ef6e30514..79ae7221f 100644
--- a/surfsense_backend/app/routes/model_list_routes.py
+++ b/surfsense_backend/app/routes/model_list_routes.py
@@ -1,5 +1,5 @@
"""
-API route for fetching the available LLM models catalogue.
+API route for fetching the available models catalogue.
Serves a dynamically-updated list sourced from the OpenRouter public API,
with a local JSON fallback when the API is unreachable.
@@ -30,7 +30,7 @@ async def list_available_models(
user: User = Depends(current_active_user),
):
"""
- Return all available LLM models grouped by provider.
+ Return all available models grouped by provider.
The list is sourced from the OpenRouter public API and cached for 1 hour.
If the API is unreachable, a local fallback file is used instead.
diff --git a/surfsense_backend/app/routes/new_llm_config_routes.py b/surfsense_backend/app/routes/new_llm_config_routes.py
index f784bd273..78907c719 100644
--- a/surfsense_backend/app/routes/new_llm_config_routes.py
+++ b/surfsense_backend/app/routes/new_llm_config_routes.py
@@ -1,7 +1,7 @@
"""
API routes for NewLLMConfig CRUD operations.
-NewLLMConfig combines LLM model settings with prompt configuration:
+NewLLMConfig combines model settings with prompt configuration:
- LLM provider, model, API key, etc.
- Configurable system instructions
- Citation toggle
diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py
index b73b8c789..d208ff910 100644
--- a/surfsense_backend/app/routes/search_source_connectors_routes.py
+++ b/surfsense_backend/app/routes/search_source_connectors_routes.py
@@ -55,23 +55,12 @@ from app.schemas import (
)
from app.services.composio_service import ComposioService, get_composio_service
from app.services.notification_service import NotificationService
-from app.tasks.connector_indexers import (
- index_airtable_records,
- index_clickup_tasks,
- index_confluence_pages,
- index_crawled_urls,
- index_discord_messages,
- index_elasticsearch_documents,
- index_github_repos,
- index_google_calendar_events,
- index_google_gmail_messages,
- index_jira_issues,
- index_linear_issues,
- index_luma_events,
- index_notion_pages,
- index_slack_messages,
-)
from app.users import current_active_user
+
+# NOTE: connector indexer functions are imported lazily inside each
+# ``run_*_indexing`` helper to break a circular import cycle:
+# connector_indexers.__init__ → airtable_indexer → airtable_history
+# → app.routes.__init__ → this file → connector_indexers (not ready yet)
from app.utils.connector_naming import ensure_unique_connector_name
from app.utils.indexing_locks import (
acquire_connector_indexing_lock,
@@ -1378,6 +1367,8 @@ async def run_slack_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_slack_messages
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -1824,6 +1815,8 @@ async def run_notion_indexing_with_new_session(
Create a new session and run the Notion indexing task.
This prevents session leaks by creating a dedicated session for the background task.
"""
+ from app.tasks.connector_indexers import index_notion_pages
+
async with async_session_maker() as session:
await _run_indexing_with_notifications(
session=session,
@@ -1858,6 +1851,8 @@ async def run_notion_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_notion_pages
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -1910,6 +1905,8 @@ async def run_github_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_github_repos
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -1961,6 +1958,8 @@ async def run_linear_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_linear_issues
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -2011,6 +2010,8 @@ async def run_discord_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_discord_messages
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -2113,6 +2114,8 @@ async def run_jira_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_jira_issues
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -2166,6 +2169,8 @@ async def run_confluence_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_confluence_pages
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -2217,6 +2222,8 @@ async def run_clickup_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_clickup_tasks
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -2268,6 +2275,8 @@ async def run_airtable_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_airtable_records
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -2321,6 +2330,8 @@ async def run_google_calendar_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_google_calendar_events
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -2370,6 +2381,7 @@ async def run_google_gmail_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_google_gmail_messages
# Create a wrapper function that calls index_google_gmail_messages with max_messages
async def gmail_indexing_wrapper(
@@ -2836,6 +2848,8 @@ async def run_luma_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_luma_events
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -2888,6 +2902,8 @@ async def run_elasticsearch_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_elasticsearch_documents
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
@@ -2938,6 +2954,8 @@ async def run_web_page_indexing(
start_date: Start date for indexing
end_date: End date for indexing
"""
+ from app.tasks.connector_indexers import index_crawled_urls
+
await _run_indexing_with_notifications(
session=session,
connector_id=connector_id,
diff --git a/surfsense_backend/app/schemas/folders.py b/surfsense_backend/app/schemas/folders.py
index 263817182..a7e065144 100644
--- a/surfsense_backend/app/schemas/folders.py
+++ b/surfsense_backend/app/schemas/folders.py
@@ -1,6 +1,7 @@
"""Pydantic schemas for folder CRUD, move, and reorder operations."""
from datetime import datetime
+from typing import Any
from uuid import UUID
from pydantic import BaseModel, ConfigDict, Field
@@ -34,6 +35,9 @@ class FolderRead(BaseModel):
created_by_id: UUID | None
created_at: datetime
updated_at: datetime
+ metadata: dict[str, Any] | None = Field(
+ default=None, validation_alias="folder_metadata"
+ )
model_config = ConfigDict(from_attributes=True)
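
Review note: validation_alias is what bridges the ORM attribute (folder_metadata, mapped to the DB column named metadata) and the API field metadata. A small sketch of how that resolves under from_attributes; the stand-in row below only spells out fields visible in this hunk plus the obvious Folder columns, so the exact field list is an assumption:

```python
from datetime import UTC, datetime
from types import SimpleNamespace

from app.schemas import FolderRead

# Stand-in for a Folder ORM row (field names beyond this hunk are assumptions).
row = SimpleNamespace(
    id=1,
    name="Notes",
    parent_id=None,
    search_space_id=7,
    position="a0",
    created_by_id=None,
    created_at=datetime.now(UTC),
    updated_at=datetime.now(UTC),
    folder_metadata={"watched": True, "folder_path": "/home/me/notes"},
)

# from_attributes reads `folder_metadata` off the object (via validation_alias)
# and exposes it as `metadata` in API responses.
read = FolderRead.model_validate(row)
print(read.metadata)  # {'watched': True, 'folder_path': '/home/me/notes'}
```
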
diff --git a/surfsense_backend/app/schemas/new_llm_config.py b/surfsense_backend/app/schemas/new_llm_config.py
index 9863665b6..15ed4ce67 100644
--- a/surfsense_backend/app/schemas/new_llm_config.py
+++ b/surfsense_backend/app/schemas/new_llm_config.py
@@ -1,7 +1,7 @@
"""
Pydantic schemas for the NewLLMConfig API.
-NewLLMConfig combines LLM model settings with prompt configuration:
+NewLLMConfig combines model settings with prompt configuration:
- LLM provider, model, API key, etc.
- Configurable system instructions
- Citation toggle
@@ -26,7 +26,7 @@ class NewLLMConfigBase(BaseModel):
None, max_length=500, description="Optional description"
)
- # LLM Model Configuration
+ # Model Configuration
provider: LiteLLMProvider = Field(..., description="LiteLLM provider type")
custom_provider: str | None = Field(
None, max_length=100, description="Custom provider name when provider is CUSTOM"
@@ -71,7 +71,7 @@ class NewLLMConfigUpdate(BaseModel):
name: str | None = Field(None, max_length=100)
description: str | None = Field(None, max_length=500)
- # LLM Model Configuration
+ # Model Configuration
provider: LiteLLMProvider | None = None
custom_provider: str | None = Field(None, max_length=100)
model_name: str | None = Field(None, max_length=100)
@@ -106,7 +106,7 @@ class NewLLMConfigPublic(BaseModel):
name: str
description: str | None = None
- # LLM Model Configuration (no api_key)
+ # Model Configuration (no api_key)
provider: LiteLLMProvider
custom_provider: str | None = None
model_name: str
@@ -149,7 +149,7 @@ class GlobalNewLLMConfigRead(BaseModel):
name: str
description: str | None = None
- # LLM Model Configuration (no api_key)
+ # Model Configuration (no api_key)
provider: str # String because YAML doesn't enforce enum, "AUTO" for Auto mode
custom_provider: str | None = None
model_name: str
diff --git a/surfsense_backend/app/services/model_list_service.py b/surfsense_backend/app/services/model_list_service.py
index ebc0e0d7c..2a81c2d52 100644
--- a/surfsense_backend/app/services/model_list_service.py
+++ b/surfsense_backend/app/services/model_list_service.py
@@ -1,5 +1,5 @@
"""
-Service for fetching and caching the available LLM model list.
+Service for fetching and caching the available model list.
Uses the OpenRouter public API as the primary source, with a local
fallback JSON file when the API is unreachable.
diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
index 662b41f2a..4e9249d34 100644
--- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
+++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py
@@ -1,6 +1,7 @@
"""Celery tasks for document processing."""
import asyncio
+import contextlib
import logging
import os
from uuid import UUID
@@ -10,6 +11,7 @@ from app.config import config
from app.services.notification_service import NotificationService
from app.services.task_logging_service import TaskLoggingService
from app.tasks.celery_tasks import get_celery_session_maker
+from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
from app.tasks.document_processors import (
add_extension_received_document,
add_youtube_video_document,
@@ -141,21 +143,30 @@ async def _delete_document_background(document_id: int) -> None:
retry_backoff_max=300,
max_retries=5,
)
-def delete_folder_documents_task(self, document_ids: list[int]):
- """Celery task to batch-delete documents orphaned by folder deletion."""
+def delete_folder_documents_task(
+ self,
+ document_ids: list[int],
+ folder_subtree_ids: list[int] | None = None,
+):
+ """Celery task to delete documents first, then the folder rows."""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
- loop.run_until_complete(_delete_folder_documents(document_ids))
+ loop.run_until_complete(
+ _delete_folder_documents(document_ids, folder_subtree_ids)
+ )
finally:
loop.close()
-async def _delete_folder_documents(document_ids: list[int]) -> None:
- """Delete chunks in batches, then document rows for each orphaned document."""
+async def _delete_folder_documents(
+ document_ids: list[int],
+ folder_subtree_ids: list[int] | None = None,
+) -> None:
+ """Delete chunks in batches, then document rows, then folder rows."""
from sqlalchemy import delete as sa_delete, select
- from app.db import Chunk, Document
+ from app.db import Chunk, Document, Folder
async with get_celery_session_maker()() as session:
batch_size = 500
@@ -177,6 +188,12 @@ async def _delete_folder_documents(document_ids: list[int]) -> None:
await session.delete(doc)
await session.commit()
+ if folder_subtree_ids:
+ await session.execute(
+ sa_delete(Folder).where(Folder.id.in_(folder_subtree_ids))
+ )
+ await session.commit()
+
@celery_app.task(
name="delete_search_space_background",
@@ -1243,3 +1260,154 @@ async def _process_circleback_meeting(
heartbeat_task.cancel()
if notification:
_stop_heartbeat(notification.id)
+
+
+# ===== Local folder indexing task =====
+
+
+@celery_app.task(name="index_local_folder", bind=True)
+def index_local_folder_task(
+ self,
+ search_space_id: int,
+ user_id: str,
+ folder_path: str,
+ folder_name: str,
+ exclude_patterns: list[str] | None = None,
+ file_extensions: list[str] | None = None,
+ root_folder_id: int | None = None,
+ enable_summary: bool = False,
+ target_file_paths: list[str] | None = None,
+):
+ """Celery task to index a local folder. Config is passed directly — no connector row."""
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
+
+ try:
+ loop.run_until_complete(
+ _index_local_folder_async(
+ search_space_id=search_space_id,
+ user_id=user_id,
+ folder_path=folder_path,
+ folder_name=folder_name,
+ exclude_patterns=exclude_patterns,
+ file_extensions=file_extensions,
+ root_folder_id=root_folder_id,
+ enable_summary=enable_summary,
+ target_file_paths=target_file_paths,
+ )
+ )
+ finally:
+ loop.close()
+
+
+async def _index_local_folder_async(
+ search_space_id: int,
+ user_id: str,
+ folder_path: str,
+ folder_name: str,
+ exclude_patterns: list[str] | None = None,
+ file_extensions: list[str] | None = None,
+ root_folder_id: int | None = None,
+ enable_summary: bool = False,
+ target_file_paths: list[str] | None = None,
+):
+ """Run local folder indexing with notification + heartbeat."""
+ is_batch = bool(target_file_paths)
+ is_full_scan = not target_file_paths
+ file_count = len(target_file_paths) if target_file_paths else None
+
+ if is_batch:
+ doc_name = f"{folder_name} ({file_count} file{'s' if file_count != 1 else ''})"
+ else:
+ doc_name = folder_name
+
+ notification = None
+ notification_id: int | None = None
+ heartbeat_task = None
+
+ async with get_celery_session_maker()() as session:
+ try:
+ notification = (
+ await NotificationService.document_processing.notify_processing_started(
+ session=session,
+ user_id=UUID(user_id),
+ document_type="LOCAL_FOLDER_FILE",
+ document_name=doc_name,
+ search_space_id=search_space_id,
+ )
+ )
+ notification_id = notification.id
+ _start_heartbeat(notification_id)
+ heartbeat_task = asyncio.create_task(_run_heartbeat_loop(notification_id))
+ except Exception:
+ logger.warning(
+ "Failed to create notification for local folder indexing",
+ exc_info=True,
+ )
+
+ async def _heartbeat_progress(completed_count: int) -> None:
+ """Refresh heartbeat and optionally update notification progress."""
+ if notification:
+ with contextlib.suppress(Exception):
+ await NotificationService.document_processing.notify_processing_progress(
+ session=session,
+ notification=notification,
+ stage="indexing",
+ stage_message=f"Syncing files ({completed_count}/{file_count or '?'})",
+ )
+
+ try:
+ _indexed, _skipped_or_failed, _rfid, err = await index_local_folder(
+ session=session,
+ search_space_id=search_space_id,
+ user_id=user_id,
+ folder_path=folder_path,
+ folder_name=folder_name,
+ exclude_patterns=exclude_patterns,
+ file_extensions=file_extensions,
+ root_folder_id=root_folder_id,
+ enable_summary=enable_summary,
+ target_file_paths=target_file_paths,
+ on_heartbeat_callback=_heartbeat_progress
+ if (is_batch or is_full_scan)
+ else None,
+ )
+
+ if notification:
+ try:
+ await session.refresh(notification)
+ if err:
+ await NotificationService.document_processing.notify_processing_completed(
+ session=session,
+ notification=notification,
+ error_message=err,
+ )
+ else:
+ await NotificationService.document_processing.notify_processing_completed(
+ session=session,
+ notification=notification,
+ )
+ except Exception:
+ logger.warning(
+ "Failed to update notification after local folder indexing",
+ exc_info=True,
+ )
+
+ except Exception as e:
+ logger.exception(f"Local folder indexing failed: {e}")
+ if notification:
+ try:
+ await session.refresh(notification)
+ await NotificationService.document_processing.notify_processing_completed(
+ session=session,
+ notification=notification,
+ error_message=str(e)[:200],
+ )
+ except Exception:
+ pass
+ raise
+ finally:
+ if heartbeat_task:
+ heartbeat_task.cancel()
+ if notification_id is not None:
+ _stop_heartbeat(notification_id)
diff --git a/surfsense_backend/app/tasks/connector_indexers/__init__.py b/surfsense_backend/app/tasks/connector_indexers/__init__.py
index 9a1d17fd5..1b032d54a 100644
--- a/surfsense_backend/app/tasks/connector_indexers/__init__.py
+++ b/surfsense_backend/app/tasks/connector_indexers/__init__.py
@@ -42,9 +42,9 @@ from .jira_indexer import index_jira_issues
# Issue tracking and project management
from .linear_indexer import index_linear_issues
-from .luma_indexer import index_luma_events
# Documentation and knowledge management
+from .luma_indexer import index_luma_events
from .notion_indexer import index_notion_pages
from .obsidian_indexer import index_obsidian_vault
from .slack_indexer import index_slack_messages
diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
new file mode 100644
index 000000000..acfbce0bf
--- /dev/null
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -0,0 +1,1247 @@
+"""
+Local folder indexer.
+
+Indexes files from a local folder on disk. Supports:
+- Full-scan mode (startup reconciliation / manual trigger)
+- Batch mode (chokidar real-time trigger, 1..N files)
+- Filesystem folder structure mirroring into DB Folder rows
+- Document versioning via create_version_snapshot
+- ETL-based file parsing for binary formats (PDF, DOCX, images, audio, etc.)
+
+Desktop-only: all change detection is driven by chokidar in the desktop app.
+Config (folder_path, exclude_patterns, etc.) is passed in from the caller —
+no connector row is read.
+"""
+
+import asyncio
+import os
+from collections.abc import Awaitable, Callable
+from datetime import UTC, datetime
+from pathlib import Path
+
+from sqlalchemy import select
+from sqlalchemy.exc import IntegrityError, SQLAlchemyError
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.config import config
+from app.db import (
+ Document,
+ DocumentStatus,
+ DocumentType,
+ Folder,
+)
+from app.indexing_pipeline.connector_document import ConnectorDocument
+from app.indexing_pipeline.document_hashing import compute_identifier_hash
+from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
+from app.services.llm_service import get_user_long_context_llm
+from app.services.page_limit_service import PageLimitExceededError, PageLimitService
+from app.services.task_logging_service import TaskLoggingService
+from app.tasks.celery_tasks import get_celery_session_maker
+from app.utils.document_versioning import create_version_snapshot
+
+from .base import (
+ check_document_by_unique_identifier,
+ logger,
+)
+
+PLAINTEXT_EXTENSIONS = frozenset(
+ {
+ ".md",
+ ".markdown",
+ ".txt",
+ ".text",
+ ".json",
+ ".jsonl",
+ ".yaml",
+ ".yml",
+ ".toml",
+ ".ini",
+ ".cfg",
+ ".conf",
+ ".xml",
+ ".css",
+ ".scss",
+ ".less",
+ ".sass",
+ ".py",
+ ".pyw",
+ ".pyi",
+ ".pyx",
+ ".js",
+ ".jsx",
+ ".ts",
+ ".tsx",
+ ".mjs",
+ ".cjs",
+ ".java",
+ ".kt",
+ ".kts",
+ ".scala",
+ ".groovy",
+ ".c",
+ ".h",
+ ".cpp",
+ ".cxx",
+ ".cc",
+ ".hpp",
+ ".hxx",
+ ".cs",
+ ".fs",
+ ".fsx",
+ ".go",
+ ".rs",
+ ".rb",
+ ".php",
+ ".pl",
+ ".pm",
+ ".lua",
+ ".swift",
+ ".m",
+ ".mm",
+ ".r",
+ ".R",
+ ".jl",
+ ".sh",
+ ".bash",
+ ".zsh",
+ ".fish",
+ ".bat",
+ ".cmd",
+ ".ps1",
+ ".sql",
+ ".graphql",
+ ".gql",
+ ".env",
+ ".gitignore",
+ ".dockerignore",
+ ".editorconfig",
+ ".makefile",
+ ".cmake",
+ ".log",
+ ".rst",
+ ".tex",
+ ".bib",
+ ".org",
+ ".adoc",
+ ".asciidoc",
+ ".vue",
+ ".svelte",
+ ".astro",
+ ".tf",
+ ".hcl",
+ ".proto",
+ }
+)
+
+AUDIO_EXTENSIONS = frozenset(
+ {
+ ".mp3",
+ ".mp4",
+ ".mpeg",
+ ".mpga",
+ ".m4a",
+ ".wav",
+ ".webm",
+ }
+)
+
+
+DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"})
+
+
+def _is_plaintext_file(filename: str) -> bool:
+ return Path(filename).suffix.lower() in PLAINTEXT_EXTENSIONS
+
+
+def _is_audio_file(filename: str) -> bool:
+ return Path(filename).suffix.lower() in AUDIO_EXTENSIONS
+
+
+def _is_direct_convert_file(filename: str) -> bool:
+ return Path(filename).suffix.lower() in DIRECT_CONVERT_EXTENSIONS
+
+
+def _needs_etl(filename: str) -> bool:
+ """File is not plaintext, not audio, and not direct-convert — requires ETL."""
+ return (
+ not _is_plaintext_file(filename)
+ and not _is_audio_file(filename)
+ and not _is_direct_convert_file(filename)
+ )
+
+
+HeartbeatCallbackType = Callable[[int], Awaitable[None]]
+
+
+def _estimate_pages_safe(page_limit_service: PageLimitService, file_path: str) -> int:
+ """Estimate page count with a file-size fallback."""
+ try:
+ return page_limit_service.estimate_pages_before_processing(file_path)
+ except Exception:
+ file_size = os.path.getsize(file_path)
+ return max(1, file_size // (80 * 1024))
+
+
+async def _check_page_limit_or_skip(
+ page_limit_service: PageLimitService,
+ user_id: str,
+ file_path: str,
+) -> int:
+ """Estimate pages and check the limit; raises PageLimitExceededError if over quota.
+
+ Returns the estimated page count on success.
+ """
+ estimated = _estimate_pages_safe(page_limit_service, file_path)
+ await page_limit_service.check_page_limit(user_id, estimated)
+ return estimated
+
+
+def _compute_final_pages(
+ page_limit_service: PageLimitService,
+ estimated_pages: int,
+ content_length: int,
+) -> int:
+ """Return the final page count as max(estimated, actual)."""
+ actual = page_limit_service.estimate_pages_from_content_length(content_length)
+ return max(estimated_pages, actual)
+
+DEFAULT_EXCLUDE_PATTERNS = [
+ ".git",
+ "node_modules",
+ "__pycache__",
+ ".DS_Store",
+ ".obsidian",
+ ".trash",
+]
+
+
+def scan_folder(
+ folder_path: str,
+ file_extensions: list[str] | None = None,
+ exclude_patterns: list[str] | None = None,
+) -> list[dict]:
+ """Walk a directory and return a list of file entries.
+
+ Args:
+ folder_path: Absolute path to the folder to scan.
+ file_extensions: If provided, only include files with these extensions
+ (e.g. [".md", ".txt"]). ``None`` means include all files.
+ exclude_patterns: Directory/file names to exclude. Any path component
+ matching one of these strings is skipped.
+
+ Returns:
+ List of dicts with keys: path, relative_path, name, modified_at, size.
+ """
+ root = Path(folder_path)
+ if not root.exists():
+ raise ValueError(f"Folder path does not exist: {folder_path}")
+
+ if exclude_patterns is None:
+ exclude_patterns = []
+
+ files: list[dict] = []
+ for dirpath, dirnames, filenames in os.walk(root):
+ rel_dir = Path(dirpath).relative_to(root)
+
+ dirnames[:] = [d for d in dirnames if d not in exclude_patterns]
+
+ if any(part in exclude_patterns for part in rel_dir.parts):
+ continue
+
+ for fname in filenames:
+ if fname in exclude_patterns:
+ continue
+
+ full = Path(dirpath) / fname
+
+ if (
+ file_extensions is not None
+ and full.suffix.lower() not in file_extensions
+ ):
+ continue
+
+ try:
+ stat = full.stat()
+ rel_path = full.relative_to(root)
+ files.append(
+ {
+ "path": str(full),
+ "relative_path": str(rel_path),
+ "name": full.name,
+ "modified_at": datetime.fromtimestamp(stat.st_mtime, tz=UTC),
+ "size": stat.st_size,
+ }
+ )
+ except OSError as e:
+ logger.warning(f"Could not stat file {full}: {e}")
+
+ return files
+
+
+def _read_plaintext_file(file_path: str) -> str:
+ """Read a plaintext/text-based file as UTF-8."""
+ with open(file_path, encoding="utf-8", errors="replace") as f:
+ content = f.read()
+ if "\x00" in content:
+ raise ValueError(
+ f"File contains null bytes — likely a binary file opened as text: {file_path}"
+ )
+ return content
+
+
+async def _read_file_content(file_path: str, filename: str) -> str:
+ """Read file content, using ETL for binary formats.
+
+ Plaintext files are read directly. Audio and document files (PDF, DOCX, etc.)
+ are routed through the configured ETL service (same as Google Drive / OneDrive).
+
+ Raises ValueError if the file cannot be parsed (e.g. no ETL service configured
+ for a binary file).
+ """
+ if _is_plaintext_file(filename):
+ return _read_plaintext_file(file_path)
+
+ if _is_direct_convert_file(filename):
+ from app.tasks.document_processors._direct_converters import (
+ convert_file_directly,
+ )
+
+ return convert_file_directly(file_path, filename)
+
+ if _is_audio_file(filename):
+        etl_service = getattr(config, "ETL_SERVICE", None)
+        stt_service_val = getattr(config, "STT_SERVICE", None)
+ if not stt_service_val and not etl_service:
+ raise ValueError(
+ f"No STT_SERVICE configured — cannot transcribe audio file: {filename}"
+ )
+
+ if _needs_etl(filename):
+ etl_service = getattr(config, "ETL_SERVICE", None)
+ if not etl_service:
+ raise ValueError(
+ f"No ETL_SERVICE configured — cannot parse binary file: {filename}. "
+ f"Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
+ )
+
+ from app.connectors.onedrive.content_extractor import (
+ _parse_file_to_markdown,
+ )
+
+ return await _parse_file_to_markdown(file_path, filename)
+
+
+def _content_hash(content: str, search_space_id: int) -> str:
+ """SHA-256 hash of content scoped to a search space.
+
+ Matches the format used by ``compute_content_hash`` in the unified
+ pipeline so that dedup checks are consistent.
+ """
+ import hashlib
+
+ return hashlib.sha256(f"{search_space_id}:{content}".encode()).hexdigest()
+
+
+async def _compute_file_content_hash(
+ file_path: str,
+ filename: str,
+ search_space_id: int,
+) -> tuple[str, str]:
+ """Read a file (via ETL if needed) and compute its content hash.
+
+ Returns (content_text, content_hash).
+ """
+ content = await _read_file_content(file_path, filename)
+ return content, _content_hash(content, search_space_id)
+
+
+async def _mirror_folder_structure(
+ session: AsyncSession,
+ folder_path: str,
+ folder_name: str,
+ search_space_id: int,
+ user_id: str,
+ root_folder_id: int | None = None,
+ exclude_patterns: list[str] | None = None,
+) -> tuple[dict[str, int], int]:
+ """Mirror the local filesystem directory structure into DB Folder rows.
+
+ Returns (mapping, root_folder_id) where mapping is
+ relative_dir_path -> folder_id. The empty string key maps to the root folder.
+ """
+ root = Path(folder_path)
+ if exclude_patterns is None:
+ exclude_patterns = []
+
+ subdirs: list[str] = []
+ for dirpath, dirnames, _ in os.walk(root):
+ dirnames[:] = [d for d in dirnames if d not in exclude_patterns]
+ rel = Path(dirpath).relative_to(root)
+ if any(part in exclude_patterns for part in rel.parts):
+ continue
+ rel_str = str(rel) if str(rel) != "." else ""
+ if rel_str:
+ subdirs.append(rel_str)
+
+ subdirs.sort(key=lambda p: p.count(os.sep))
+
+ mapping: dict[str, int] = {}
+
+ if root_folder_id:
+ existing = (
+ await session.execute(select(Folder).where(Folder.id == root_folder_id))
+ ).scalar_one_or_none()
+ if existing:
+ mapping[""] = existing.id
+ else:
+ root_folder_id = None
+
+ if not root_folder_id:
+ root_folder = Folder(
+ name=folder_name,
+ search_space_id=search_space_id,
+ created_by_id=user_id,
+ position="a0",
+ )
+ session.add(root_folder)
+ await session.flush()
+ mapping[""] = root_folder.id
+ root_folder_id = root_folder.id
+
+ for rel_dir in subdirs:
+ dir_parts = Path(rel_dir).parts
+ dir_name = dir_parts[-1]
+ parent_rel = str(Path(*dir_parts[:-1])) if len(dir_parts) > 1 else ""
+
+ parent_id = mapping.get(parent_rel, mapping[""])
+
+ existing_folder = (
+ await session.execute(
+ select(Folder).where(
+ Folder.name == dir_name,
+ Folder.parent_id == parent_id,
+ Folder.search_space_id == search_space_id,
+ )
+ )
+ ).scalar_one_or_none()
+
+ if existing_folder:
+ mapping[rel_dir] = existing_folder.id
+ else:
+ new_folder = Folder(
+ name=dir_name,
+ parent_id=parent_id,
+ search_space_id=search_space_id,
+ created_by_id=user_id,
+ position="a0",
+ )
+ session.add(new_folder)
+ await session.flush()
+ mapping[rel_dir] = new_folder.id
+
+ await session.flush()
+ return mapping, root_folder_id
+
+
+async def _resolve_folder_for_file(
+ session: AsyncSession,
+ rel_path: str,
+ root_folder_id: int,
+ search_space_id: int,
+ user_id: str,
+) -> int:
+ """Given a file's relative path, ensure all parent Folder rows exist and
+ return the folder_id for the file's immediate parent directory.
+
+ For a file at "notes/daily/today.md", this ensures Folder rows exist for
+ "notes" and "notes/daily", and returns the id of "notes/daily".
+ For a file at "readme.md" (root level), returns root_folder_id.
+ """
+ parent_dir = str(Path(rel_path).parent)
+ if parent_dir == ".":
+ return root_folder_id
+
+ parts = Path(parent_dir).parts
+ current_parent_id = root_folder_id
+
+ for part in parts:
+ existing = (
+ await session.execute(
+ select(Folder).where(
+ Folder.name == part,
+ Folder.parent_id == current_parent_id,
+ Folder.search_space_id == search_space_id,
+ )
+ )
+ ).scalar_one_or_none()
+
+ if existing:
+ current_parent_id = existing.id
+ else:
+ new_folder = Folder(
+ name=part,
+ parent_id=current_parent_id,
+ search_space_id=search_space_id,
+ created_by_id=user_id,
+ position="a0",
+ )
+ session.add(new_folder)
+ await session.flush()
+ current_parent_id = new_folder.id
+
+ return current_parent_id
+
+
+async def _cleanup_empty_folder_chain(
+ session: AsyncSession,
+ folder_id: int,
+ root_folder_id: int,
+) -> None:
+ """Walk up from folder_id toward root, deleting empty folders (no docs, no
+    children). Stops at root_folder_id, which is never deleted."""
+ current_id = folder_id
+ while current_id and current_id != root_folder_id:
+ has_doc = (
+ await session.execute(
+ select(Document.id).where(Document.folder_id == current_id).limit(1)
+ )
+ ).scalar_one_or_none()
+ if has_doc is not None:
+ break
+
+ has_child = (
+ await session.execute(
+ select(Folder.id).where(Folder.parent_id == current_id).limit(1)
+ )
+ ).scalar_one_or_none()
+ if has_child is not None:
+ break
+
+ folder = (
+ await session.execute(select(Folder).where(Folder.id == current_id))
+ ).scalar_one_or_none()
+ if not folder:
+ break
+
+ parent_id = folder.parent_id
+ await session.delete(folder)
+ await session.flush()
+ current_id = parent_id
+
+
+async def _cleanup_empty_folders(
+ session: AsyncSession,
+ root_folder_id: int,
+ search_space_id: int,
+ existing_dirs_on_disk: set[str],
+ folder_mapping: dict[str, int],
+) -> None:
+ """Delete Folder rows that are empty (no docs, no children) and no longer on disk."""
+ from sqlalchemy import delete as sa_delete
+
+ id_to_rel: dict[int, str] = {fid: rel for rel, fid in folder_mapping.items() if rel}
+
+ all_folders = (
+ (
+ await session.execute(
+ select(Folder).where(
+ Folder.search_space_id == search_space_id,
+ Folder.id != root_folder_id,
+ )
+ )
+ )
+ .scalars()
+ .all()
+ )
+
+ candidates: list[Folder] = []
+ for folder in all_folders:
+ rel = id_to_rel.get(folder.id)
+ if rel and rel in existing_dirs_on_disk:
+ continue
+ candidates.append(folder)
+
+ changed = True
+ while changed:
+ changed = False
+ remaining: list[Folder] = []
+ for folder in candidates:
+ doc_exists = (
+ await session.execute(
+ select(Document.id).where(Document.folder_id == folder.id).limit(1)
+ )
+ ).scalar_one_or_none()
+ if doc_exists is not None:
+ remaining.append(folder)
+ continue
+
+ child_exists = (
+ await session.execute(
+ select(Folder.id).where(Folder.parent_id == folder.id).limit(1)
+ )
+ ).scalar_one_or_none()
+ if child_exists is not None:
+ remaining.append(folder)
+ continue
+
+ await session.execute(sa_delete(Folder).where(Folder.id == folder.id))
+ changed = True
+ candidates = remaining
+
+
+def _build_connector_doc(
+ title: str,
+ content: str,
+ relative_path: str,
+ folder_name: str,
+ *,
+ search_space_id: int,
+ user_id: str,
+ enable_summary: bool,
+) -> ConnectorDocument:
+ """Build a ConnectorDocument from a local file's extracted content."""
+ unique_id = f"{folder_name}:{relative_path}"
+ metadata = {
+ "folder_name": folder_name,
+ "file_path": relative_path,
+ "document_type": "Local Folder File",
+ "connector_type": "Local Folder",
+ }
+ fallback_summary = f"File: {title}\n\n{content[:4000]}"
+
+ return ConnectorDocument(
+ title=title,
+ source_markdown=content,
+ unique_id=unique_id,
+ document_type=DocumentType.LOCAL_FOLDER_FILE,
+ search_space_id=search_space_id,
+ connector_id=None,
+ created_by_id=user_id,
+ should_summarize=enable_summary,
+ fallback_summary=fallback_summary,
+ metadata=metadata,
+ )
+
+
+async def index_local_folder(
+ session: AsyncSession,
+ search_space_id: int,
+ user_id: str,
+ folder_path: str,
+ folder_name: str,
+ exclude_patterns: list[str] | None = None,
+ file_extensions: list[str] | None = None,
+ root_folder_id: int | None = None,
+ enable_summary: bool = False,
+ target_file_paths: list[str] | None = None,
+ on_heartbeat_callback: HeartbeatCallbackType | None = None,
+) -> tuple[int, int, int | None, str | None]:
+ """Index files from a local folder.
+
+ Supports two modes:
+ - Batch (target_file_paths set): processes 1..N files.
+ Single-file uses the caller's session; multi-file fans out with per-file sessions.
+ - Full scan (no target paths): walks entire folder, handles new/changed/deleted files.
+
+    Returns (indexed_count, skipped_count, root_folder_id, error_or_warning_message);
+    in multi-file batch mode the second element is the failed count.
+ """
+ task_logger = TaskLoggingService(session, search_space_id)
+
+ log_entry = await task_logger.log_task_start(
+ task_name="local_folder_indexing",
+ source="local_folder_indexing_task",
+ message=f"Starting local folder indexing for {folder_name}",
+ metadata={
+ "folder_path": folder_path,
+ "user_id": str(user_id),
+ "target_file_paths_count": len(target_file_paths)
+ if target_file_paths
+ else None,
+ },
+ )
+
+ try:
+ if not folder_path or not os.path.exists(folder_path):
+ await task_logger.log_task_failure(
+ log_entry,
+ f"Folder path missing or does not exist: {folder_path}",
+ "Folder not found",
+ {},
+ )
+ return (
+ 0,
+ 0,
+ root_folder_id,
+ f"Folder path missing or does not exist: {folder_path}",
+ )
+
+ if exclude_patterns is None:
+ exclude_patterns = DEFAULT_EXCLUDE_PATTERNS
+
+ # ====================================================================
+ # BATCH MODE (1..N files)
+ # ====================================================================
+ if target_file_paths:
+ if len(target_file_paths) == 1:
+ indexed, skipped, err = await _index_single_file(
+ session=session,
+ search_space_id=search_space_id,
+ user_id=user_id,
+ folder_path=folder_path,
+ folder_name=folder_name,
+ target_file_path=target_file_paths[0],
+ enable_summary=enable_summary,
+ root_folder_id=root_folder_id,
+ task_logger=task_logger,
+ log_entry=log_entry,
+ )
+ return indexed, skipped, root_folder_id, err
+
+ indexed, failed, err = await _index_batch_files(
+ search_space_id=search_space_id,
+ user_id=user_id,
+ folder_path=folder_path,
+ folder_name=folder_name,
+ target_file_paths=target_file_paths,
+ enable_summary=enable_summary,
+ root_folder_id=root_folder_id,
+ on_progress_callback=on_heartbeat_callback,
+ )
+ if err:
+ await task_logger.log_task_success(
+ log_entry,
+ f"Batch indexing: {indexed} indexed, {failed} failed",
+ {"indexed": indexed, "failed": failed},
+ )
+ else:
+ await task_logger.log_task_success(
+ log_entry,
+ f"Batch indexing complete: {indexed} indexed",
+ {"indexed": indexed, "failed": failed},
+ )
+ return indexed, failed, root_folder_id, err
+
+ # ====================================================================
+ # FULL-SCAN MODE
+ # ====================================================================
+
+ await task_logger.log_task_progress(
+ log_entry, "Mirroring folder structure", {"stage": "folder_mirror"}
+ )
+
+ folder_mapping, root_folder_id = await _mirror_folder_structure(
+ session=session,
+ folder_path=folder_path,
+ folder_name=folder_name,
+ search_space_id=search_space_id,
+ user_id=user_id,
+ root_folder_id=root_folder_id,
+ exclude_patterns=exclude_patterns,
+ )
+ await session.flush()
+
+ try:
+ files = scan_folder(folder_path, file_extensions, exclude_patterns)
+ except Exception as e:
+ await task_logger.log_task_failure(
+ log_entry, f"Failed to scan folder: {e}", "Scan error", {}
+ )
+ return 0, 0, root_folder_id, f"Failed to scan folder: {e}"
+
+ logger.info(f"Found {len(files)} files in folder")
+
+ indexed_count = 0
+ skipped_count = 0
+ failed_count = 0
+
+ page_limit_service = PageLimitService(session)
+
+ # ================================================================
+ # PHASE 1: Pre-filter files (mtime / content-hash), version changed
+ # ================================================================
+ connector_docs: list[ConnectorDocument] = []
+ file_meta_map: dict[str, dict] = {}
+ seen_unique_hashes: set[str] = set()
+
+ for file_info in files:
+ try:
+ relative_path = file_info["relative_path"]
+ file_path_abs = file_info["path"]
+
+ unique_identifier = f"{folder_name}:{relative_path}"
+ unique_identifier_hash = compute_identifier_hash(
+ DocumentType.LOCAL_FOLDER_FILE.value,
+ unique_identifier,
+ search_space_id,
+ )
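+                # Every identifier hash seen on disk is recorded here; Phase 1.5
+                # deletes DB documents whose hash is missing from this set.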
+ seen_unique_hashes.add(unique_identifier_hash)
+
+ existing_document = await check_document_by_unique_identifier(
+ session, unique_identifier_hash
+ )
+
+ if existing_document:
+ stored_mtime = (existing_document.document_metadata or {}).get(
+ "mtime"
+ )
+ current_mtime = file_info["modified_at"].timestamp()
+
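+                    # Fast path: treat the file as unchanged when its mtime is
+                    # within 1 second of the stored value and skip hashing entirely.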
+ if stored_mtime and abs(current_mtime - stored_mtime) < 1.0:
+ if not DocumentStatus.is_state(
+ existing_document.status, DocumentStatus.READY
+ ):
+ existing_document.status = DocumentStatus.ready()
+ skipped_count += 1
+ continue
+
+ try:
+ estimated_pages = await _check_page_limit_or_skip(
+ page_limit_service, user_id, file_path_abs
+ )
+ except PageLimitExceededError:
+ logger.warning(
+ f"Page limit exceeded, skipping: {file_path_abs}"
+ )
+ failed_count += 1
+ continue
+
+ try:
+ content, content_hash = await _compute_file_content_hash(
+ file_path_abs, file_info["relative_path"], search_space_id
+ )
+ except Exception as read_err:
+ logger.warning(f"Could not read {file_path_abs}: {read_err}")
+ skipped_count += 1
+ continue
+
+ if existing_document.content_hash == content_hash:
+ meta = dict(existing_document.document_metadata or {})
+ meta["mtime"] = current_mtime
+ existing_document.document_metadata = meta
+ if not DocumentStatus.is_state(
+ existing_document.status, DocumentStatus.READY
+ ):
+ existing_document.status = DocumentStatus.ready()
+ skipped_count += 1
+ continue
+
+ await create_version_snapshot(session, existing_document)
+ else:
+ try:
+ estimated_pages = await _check_page_limit_or_skip(
+ page_limit_service, user_id, file_path_abs
+ )
+ except PageLimitExceededError:
+ logger.warning(
+ f"Page limit exceeded, skipping: {file_path_abs}"
+ )
+ failed_count += 1
+ continue
+
+ try:
+ content, content_hash = await _compute_file_content_hash(
+ file_path_abs, file_info["relative_path"], search_space_id
+ )
+ except Exception as read_err:
+ logger.warning(f"Could not read {file_path_abs}: {read_err}")
+ skipped_count += 1
+ continue
+
+ if not content.strip():
+ skipped_count += 1
+ continue
+
+ doc = _build_connector_doc(
+ title=file_info["name"],
+ content=content,
+ relative_path=relative_path,
+ folder_name=folder_name,
+ search_space_id=search_space_id,
+ user_id=user_id,
+ enable_summary=enable_summary,
+ )
+ connector_docs.append(doc)
+ file_meta_map[unique_identifier] = {
+ "relative_path": relative_path,
+ "mtime": file_info["modified_at"].timestamp(),
+ "estimated_pages": estimated_pages,
+ "content_length": len(content),
+ }
+
+ except Exception as e:
+ logger.exception(f"Phase 1 error for {file_info.get('path')}: {e}")
+ failed_count += 1
+
+ # ================================================================
+ # PHASE 1.5: Delete documents no longer on disk
+ # ================================================================
+ all_root_folder_ids = set(folder_mapping.values())
+ all_db_folders = (
+ (
+ await session.execute(
+ select(Folder.id).where(
+ Folder.search_space_id == search_space_id,
+ )
+ )
+ )
+ .scalars()
+ .all()
+ )
+ all_root_folder_ids.update(all_db_folders)
+
+ all_folder_docs = (
+ (
+ await session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == search_space_id,
+ Document.folder_id.in_(list(all_root_folder_ids)),
+ )
+ )
+ )
+ .scalars()
+ .all()
+ )
+
+ for doc in all_folder_docs:
+ if doc.unique_identifier_hash not in seen_unique_hashes:
+ await session.delete(doc)
+
+ await session.flush()
+
+ # ================================================================
+ # PHASE 2: Index via unified pipeline
+ # ================================================================
+ if connector_docs:
+ from app.indexing_pipeline.document_hashing import (
+ compute_unique_identifier_hash,
+ )
+
+ pipeline = IndexingPipelineService(session)
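+            # Map identifier hash -> ConnectorDocument so the DB rows returned by
+            # prepare_for_indexing() can be matched back to their source file info.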
+ doc_map = {compute_unique_identifier_hash(cd): cd for cd in connector_docs}
+ documents = await pipeline.prepare_for_indexing(connector_docs)
+
+ # Assign folder_id immediately so docs appear in the correct
+ # folder while still pending/processing (visible via Zero sync).
+ for document in documents:
+ cd = doc_map.get(document.unique_identifier_hash)
+ if cd is None:
+ continue
+ rel_path = (cd.metadata or {}).get("file_path", "")
+ parent_dir = str(Path(rel_path).parent) if rel_path else ""
+ if parent_dir == ".":
+ parent_dir = ""
+ document.folder_id = folder_mapping.get(
+ parent_dir, folder_mapping.get("")
+ )
+ try:
+ await session.commit()
+ except IntegrityError:
+ await session.rollback()
+ for document in documents:
+ await session.refresh(document)
+
+ llm = await get_user_long_context_llm(session, user_id, search_space_id)
+
+ for document in documents:
+ connector_doc = doc_map.get(document.unique_identifier_hash)
+ if connector_doc is None:
+ failed_count += 1
+ continue
+
+ result = await pipeline.index(document, connector_doc, llm)
+
+ if DocumentStatus.is_state(result.status, DocumentStatus.READY):
+ indexed_count += 1
+
+ unique_id = connector_doc.unique_id
+ mtime_info = file_meta_map.get(unique_id, {})
+
+ doc_meta = dict(result.document_metadata or {})
+ doc_meta["mtime"] = mtime_info.get("mtime")
+ result.document_metadata = doc_meta
+
+ est = mtime_info.get("estimated_pages", 1)
+ content_len = mtime_info.get("content_length", 0)
+ final_pages = _compute_final_pages(
+ page_limit_service, est, content_len
+ )
+ await page_limit_service.update_page_usage(
+ user_id, final_pages, allow_exceed=True
+ )
+ else:
+ failed_count += 1
+
+ if on_heartbeat_callback and indexed_count % 5 == 0:
+ await on_heartbeat_callback(indexed_count)
+
+ # Cleanup empty folders
+ existing_dirs = set()
+ for dirpath, dirnames, _ in os.walk(folder_path):
+ dirnames[:] = [d for d in dirnames if d not in exclude_patterns]
+ rel = str(Path(dirpath).relative_to(folder_path))
+ if rel == ".":
+ rel = ""
+ if rel and not any(part in exclude_patterns for part in Path(rel).parts):
+ existing_dirs.add(rel)
+
+ root_fid = folder_mapping.get("")
+ if root_fid:
+ await _cleanup_empty_folders(
+ session, root_fid, search_space_id, existing_dirs, folder_mapping
+ )
+
+ try:
+ await session.commit()
+ except Exception as e:
+ if "duplicate key value violates unique constraint" in str(e).lower():
+ logger.warning(f"Duplicate key during commit: {e}")
+ await session.rollback()
+ else:
+ raise
+
+ warning_parts = []
+ if failed_count > 0:
+ warning_parts.append(f"{failed_count} failed")
+ warning_message = ", ".join(warning_parts) if warning_parts else None
+
+ await task_logger.log_task_success(
+ log_entry,
+ f"Completed local folder indexing for {folder_name}",
+ {
+ "indexed": indexed_count,
+ "skipped": skipped_count,
+ "failed": failed_count,
+ },
+ )
+
+ return indexed_count, skipped_count, root_folder_id, warning_message
+
+ except SQLAlchemyError as e:
+ logger.exception(f"Database error during local folder indexing: {e}")
+ await session.rollback()
+ await task_logger.log_task_failure(
+ log_entry, f"DB error: {e}", "Database error", {}
+ )
+ return 0, 0, root_folder_id, f"Database error: {e}"
+
+ except Exception as e:
+ logger.exception(f"Error during local folder indexing: {e}")
+ await task_logger.log_task_failure(
+ log_entry, f"Error: {e}", "Unexpected error", {}
+ )
+ return 0, 0, root_folder_id, str(e)
+
+
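+# Bounded fan-out for batch mode: at most this many files are indexed
+# concurrently, each in its own DB session (see _index_batch_files).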
+BATCH_CONCURRENCY = 5
+
+
+async def _index_batch_files(
+ search_space_id: int,
+ user_id: str,
+ folder_path: str,
+ folder_name: str,
+ target_file_paths: list[str],
+ enable_summary: bool,
+ root_folder_id: int | None,
+ on_progress_callback: HeartbeatCallbackType | None = None,
+) -> tuple[int, int, str | None]:
+ """Process multiple files in parallel with bounded concurrency.
+
+ Each file gets its own DB session so they can run concurrently.
+ Returns (indexed_count, failed_count, error_summary_or_none).
+ """
+ semaphore = asyncio.Semaphore(BATCH_CONCURRENCY)
+ indexed = 0
+ failed = 0
+ errors: list[str] = []
+ lock = asyncio.Lock()
+ completed = 0
+
+ async def process_one(file_path: str) -> None:
+ nonlocal indexed, failed, completed
+ async with semaphore:
+ try:
+ async with get_celery_session_maker()() as file_session:
+ task_logger = TaskLoggingService(file_session, search_space_id)
+ log_entry = await task_logger.log_task_start(
+ task_name="local_folder_indexing",
+ source="local_folder_batch_indexing",
+ message=f"Batch: indexing {Path(file_path).name}",
+ metadata={"file_path": file_path},
+ )
+ ix, _sk, err = await _index_single_file(
+ session=file_session,
+ search_space_id=search_space_id,
+ user_id=user_id,
+ folder_path=folder_path,
+ folder_name=folder_name,
+ target_file_path=file_path,
+ enable_summary=enable_summary,
+ root_folder_id=root_folder_id,
+ task_logger=task_logger,
+ log_entry=log_entry,
+ )
+ async with lock:
+ indexed += ix
+ if err:
+ failed += 1
+ errors.append(f"{Path(file_path).name}: {err}")
+ completed += 1
+ if on_progress_callback and completed % BATCH_CONCURRENCY == 0:
+ await on_progress_callback(completed)
+ except Exception as exc:
+ logger.exception(f"Batch: error processing {file_path}: {exc}")
+ async with lock:
+ failed += 1
+ completed += 1
+ errors.append(f"{Path(file_path).name}: {exc}")
+
+ await asyncio.gather(*[process_one(fp) for fp in target_file_paths])
+
+ if on_progress_callback:
+ await on_progress_callback(completed)
+
+ error_summary = None
+ if errors:
+ error_summary = f"{failed} file(s) failed: " + "; ".join(errors[:5])
+ if len(errors) > 5:
+ error_summary += f" ... and {len(errors) - 5} more"
+
+ return indexed, failed, error_summary
+
+
+async def _index_single_file(
+ session: AsyncSession,
+ search_space_id: int,
+ user_id: str,
+ folder_path: str,
+ folder_name: str,
+ target_file_path: str,
+ enable_summary: bool,
+ root_folder_id: int | None,
+ task_logger,
+ log_entry,
+) -> tuple[int, int, str | None]:
+ """Process a single file (chokidar real-time trigger)."""
+ try:
+ full_path = Path(target_file_path)
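+        # A missing file means the caller (e.g. the file watcher) reported a
+        # deletion: remove the matching document, if any, and prune empty folders.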
+ if not full_path.exists():
+ rel = str(full_path.relative_to(folder_path))
+ unique_id = f"{folder_name}:{rel}"
+ uid_hash = compute_identifier_hash(
+ DocumentType.LOCAL_FOLDER_FILE.value, unique_id, search_space_id
+ )
+ existing = await check_document_by_unique_identifier(session, uid_hash)
+ if existing:
+ deleted_folder_id = existing.folder_id
+ await session.delete(existing)
+ await session.flush()
+ if deleted_folder_id and root_folder_id:
+ await _cleanup_empty_folder_chain(
+ session, deleted_folder_id, root_folder_id
+ )
+ await session.commit()
+ return 0, 0, None
+ return 0, 0, None
+
+ rel_path = str(full_path.relative_to(folder_path))
+
+ unique_id = f"{folder_name}:{rel_path}"
+ uid_hash = compute_identifier_hash(
+ DocumentType.LOCAL_FOLDER_FILE.value, unique_id, search_space_id
+ )
+
+ page_limit_service = PageLimitService(session)
+ try:
+ estimated_pages = await _check_page_limit_or_skip(
+ page_limit_service, user_id, str(full_path)
+ )
+ except PageLimitExceededError as e:
+ return 0, 1, f"Page limit exceeded: {e}"
+
+ try:
+ content, content_hash = await _compute_file_content_hash(
+ str(full_path), full_path.name, search_space_id
+ )
+ except Exception as e:
+ return 0, 1, f"Could not read file: {e}"
+
+ if not content.strip():
+ return 0, 1, None
+
+ existing = await check_document_by_unique_identifier(session, uid_hash)
+
+ if existing:
+ if existing.content_hash == content_hash:
+ mtime = full_path.stat().st_mtime
+ meta = dict(existing.document_metadata or {})
+ meta["mtime"] = mtime
+ existing.document_metadata = meta
+ await session.commit()
+ return 0, 1, None
+
+ await create_version_snapshot(session, existing)
+
+ mtime = full_path.stat().st_mtime
+
+ connector_doc = _build_connector_doc(
+ title=full_path.name,
+ content=content,
+ relative_path=rel_path,
+ folder_name=folder_name,
+ search_space_id=search_space_id,
+ user_id=user_id,
+ enable_summary=enable_summary,
+ )
+
+ pipeline = IndexingPipelineService(session)
+ llm = await get_user_long_context_llm(session, user_id, search_space_id)
+ documents = await pipeline.prepare_for_indexing([connector_doc])
+
+ if not documents:
+ return 0, 1, None
+
+ db_doc = documents[0]
+
+ if root_folder_id:
+ try:
+ db_doc.folder_id = await _resolve_folder_for_file(
+ session, rel_path, root_folder_id, search_space_id, user_id
+ )
+ await session.commit()
+ except IntegrityError:
+ await session.rollback()
+ await session.refresh(db_doc)
+
+ await pipeline.index(db_doc, connector_doc, llm)
+
+ await session.refresh(db_doc)
+ doc_meta = dict(db_doc.document_metadata or {})
+ doc_meta["mtime"] = mtime
+ db_doc.document_metadata = doc_meta
+ await session.commit()
+
+ indexed = (
+ 1 if DocumentStatus.is_state(db_doc.status, DocumentStatus.READY) else 0
+ )
+ failed_msg = None if indexed else "Indexing failed"
+
+ if indexed:
+ final_pages = _compute_final_pages(
+ page_limit_service, estimated_pages, len(content)
+ )
+ await page_limit_service.update_page_usage(
+ user_id, final_pages, allow_exceed=True
+ )
+ await task_logger.log_task_success(
+ log_entry,
+ f"Single file indexed: {rel_path}",
+ {"file": rel_path, "pages_processed": final_pages},
+ )
+ return indexed, 0 if indexed else 1, failed_msg
+
+ except Exception as e:
+ logger.exception(f"Error indexing single file {target_file_path}: {e}")
+ await session.rollback()
+ return 0, 0, str(e)
diff --git a/surfsense_backend/app/utils/document_versioning.py b/surfsense_backend/app/utils/document_versioning.py
new file mode 100644
index 000000000..e6ad1fb06
--- /dev/null
+++ b/surfsense_backend/app/utils/document_versioning.py
@@ -0,0 +1,107 @@
+"""Document versioning: snapshot creation and cleanup.
+
+Rules:
+- 30-minute debounce window: if the latest version was created < 30 min ago,
+ overwrite it instead of creating a new row.
+- Maximum 20 versions per document.
+- Versions older than 90 days are cleaned up.
+"""
+
+from datetime import UTC, datetime, timedelta
+
+from sqlalchemy import delete, func, select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Document, DocumentVersion
+
+MAX_VERSIONS_PER_DOCUMENT = 20
+DEBOUNCE_MINUTES = 30
+RETENTION_DAYS = 90
+
+
+def _now() -> datetime:
+ return datetime.now(UTC)
+
+
+async def create_version_snapshot(
+ session: AsyncSession,
+ document: Document,
+) -> DocumentVersion | None:
+ """Snapshot the document's current state into a DocumentVersion row.
+
+ Returns the created/updated DocumentVersion, or None if nothing was done.
+ """
+ now = _now()
+
+ latest = (
+ await session.execute(
+ select(DocumentVersion)
+ .where(DocumentVersion.document_id == document.id)
+ .order_by(DocumentVersion.version_number.desc())
+ .limit(1)
+ )
+ ).scalar_one_or_none()
+
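+    # Debounce: if the latest snapshot is younger than DEBOUNCE_MINUTES, update
+    # it in place instead of adding a new version row.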
+ if latest is not None:
+ age = now - latest.created_at.replace(tzinfo=UTC)
+ if age < timedelta(minutes=DEBOUNCE_MINUTES):
+ latest.source_markdown = document.source_markdown
+ latest.content_hash = document.content_hash
+ latest.title = document.title
+ latest.created_at = now
+ await session.flush()
+ return latest
+
+ max_num = (
+ await session.execute(
+ select(func.coalesce(func.max(DocumentVersion.version_number), 0)).where(
+ DocumentVersion.document_id == document.id
+ )
+ )
+ ).scalar_one()
+
+ version = DocumentVersion(
+ document_id=document.id,
+ version_number=max_num + 1,
+ source_markdown=document.source_markdown,
+ content_hash=document.content_hash,
+ title=document.title,
+ created_at=now,
+ )
+ session.add(version)
+ await session.flush()
+
+ # Cleanup: remove versions older than 90 days
+ cutoff = now - timedelta(days=RETENTION_DAYS)
+ await session.execute(
+ delete(DocumentVersion).where(
+ DocumentVersion.document_id == document.id,
+ DocumentVersion.created_at < cutoff,
+ )
+ )
+
+ # Cleanup: cap at MAX_VERSIONS_PER_DOCUMENT
+ count = (
+ await session.execute(
+ select(func.count())
+ .select_from(DocumentVersion)
+ .where(DocumentVersion.document_id == document.id)
+ )
+ ).scalar_one()
+
+ if count > MAX_VERSIONS_PER_DOCUMENT:
+ excess = count - MAX_VERSIONS_PER_DOCUMENT
+ oldest_ids_result = await session.execute(
+ select(DocumentVersion.id)
+ .where(DocumentVersion.document_id == document.id)
+ .order_by(DocumentVersion.version_number.asc())
+ .limit(excess)
+ )
+ oldest_ids = [row[0] for row in oldest_ids_result.all()]
+ if oldest_ids:
+ await session.execute(
+ delete(DocumentVersion).where(DocumentVersion.id.in_(oldest_ids))
+ )
+
+ await session.flush()
+ return version
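+
+
+# Minimal usage sketch (assumed caller pattern, mirroring the folder indexer;
+# ``new_content_hash`` and ``new_markdown`` are illustrative names): snapshot
+# the old state *before* overwriting a changed document:
+#
+#     if existing.content_hash != new_content_hash:
+#         await create_version_snapshot(session, existing)
+#         existing.source_markdown = new_markdown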
diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py
new file mode 100644
index 000000000..4d9bda7ee
--- /dev/null
+++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py
@@ -0,0 +1,1180 @@
+"""Integration tests for the local folder indexer — Tier 3 (I1-I5), Tier 4 (F1-F7),
+Tier 5 (P1), Tier 6 (B1-B2), Tier 7 (DC1-DC4), Tier 8 (PL1-PL6)."""
+
+import os
+from contextlib import asynccontextmanager
+from pathlib import Path
+
+import pytest
+from sqlalchemy import func, select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import (
+ Document,
+ DocumentStatus,
+ DocumentType,
+ DocumentVersion,
+ Folder,
+ SearchSpace,
+ User,
+)
+
+pytestmark = pytest.mark.integration
+
+UNIFIED_FIXTURES = (
+ "patched_summarize",
+ "patched_embed_texts",
+ "patched_chunk_text",
+)
+
+
+class _FakeSessionMaker:
+ """Wraps an existing AsyncSession so ``async with factory()`` yields it
+ without closing it. Used to route batch-mode DB operations through the
+ test's savepoint-wrapped session."""
+
+ def __init__(self, session: AsyncSession):
+ self._session = session
+
+ def __call__(self):
+ @asynccontextmanager
+ async def _ctx():
+ yield self._session
+
+ return _ctx()
+
+
+@pytest.fixture
+def patched_batch_sessions(monkeypatch, db_session):
+ """Make ``_index_batch_files`` use the test session and run sequentially."""
+ monkeypatch.setattr(
+ "app.tasks.connector_indexers.local_folder_indexer.get_celery_session_maker",
+ lambda: _FakeSessionMaker(db_session),
+ )
+ monkeypatch.setattr(
+ "app.tasks.connector_indexers.local_folder_indexer.BATCH_CONCURRENCY",
+ 1,
+ )
+
+
+# ====================================================================
+# Tier 3: Full Indexer Integration (I1-I5)
+# ====================================================================
+
+
+class TestFullIndexer:
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_i1_new_file_indexed(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """I1: Single new .md file is indexed with status READY."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "note.md").write_text("# Hello World\n\nContent here.")
+
+ count, _skipped, _root_folder_id, err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ assert err is None
+ assert count == 1
+
+ docs = (
+ (
+ await db_session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ )
+ .scalars()
+ .all()
+ )
+ assert len(docs) == 1
+ assert docs[0].document_type == DocumentType.LOCAL_FOLDER_FILE
+ assert DocumentStatus.is_state(docs[0].status, DocumentStatus.READY)
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_i2_unchanged_skipped(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """I2: Second run on unchanged directory creates no new documents."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "note.md").write_text("# Hello\n\nSame content.")
+
+ count1, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+ assert count1 == 1
+
+ count2, _, _, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ root_folder_id=root_folder_id,
+ )
+ assert count2 == 0
+
+ total = (
+ await db_session.execute(
+ select(func.count())
+ .select_from(Document)
+ .where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ ).scalar_one()
+ assert total == 1
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_i3_changed_reindexed(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """I3: Modified file content triggers re-index and creates a version."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ f = tmp_path / "note.md"
+ f.write_text("# Version 1\n\nOriginal.")
+
+ _, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ f.write_text("# Version 2\n\nUpdated.")
+ os.utime(f, (f.stat().st_atime + 10, f.stat().st_mtime + 10))
+
+ count, _, _, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ root_folder_id=root_folder_id,
+ )
+ assert count == 1
+
+ versions = (
+ (
+ await db_session.execute(
+ select(DocumentVersion)
+ .join(Document)
+ .where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ )
+ .scalars()
+ .all()
+ )
+ assert len(versions) >= 1
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_i4_deleted_removed(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """I4: Deleted file is removed from DB on re-sync."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ f = tmp_path / "to_delete.md"
+ f.write_text("# Delete me")
+
+ _, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ docs_before = (
+ await db_session.execute(
+ select(func.count())
+ .select_from(Document)
+ .where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ ).scalar_one()
+ assert docs_before == 1
+
+ f.unlink()
+
+ await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ root_folder_id=root_folder_id,
+ )
+
+ docs_after = (
+ await db_session.execute(
+ select(func.count())
+ .select_from(Document)
+ .where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ ).scalar_one()
+ assert docs_after == 0
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_i5_single_file_mode(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """I5: Batch mode with a single file only processes that file."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "a.md").write_text("File A")
+ (tmp_path / "b.md").write_text("File B")
+ (tmp_path / "c.md").write_text("File C")
+
+ count, _, _, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ target_file_paths=[str(tmp_path / "b.md")],
+ )
+ assert count == 1
+
+ docs = (
+ (
+ await db_session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ )
+ .scalars()
+ .all()
+ )
+ assert len(docs) == 1
+ assert docs[0].title == "b.md"
+
+
+# ====================================================================
+# Tier 4: Folder Mirroring (F1-F7)
+# ====================================================================
+
+
+class TestFolderMirroring:
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_f1_root_folder_created(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """F1: First sync creates a root Folder and returns root_folder_id."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "root.md").write_text("Root file")
+
+ _, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ assert root_folder_id is not None
+
+ root_folder = (
+ await db_session.execute(select(Folder).where(Folder.id == root_folder_id))
+ ).scalar_one()
+ assert root_folder.name == "test-folder"
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_f2_nested_folder_rows(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """F2: Nested dirs create Folder rows with correct parent_id chain."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ daily = tmp_path / "notes" / "daily"
+ daily.mkdir(parents=True)
+ weekly = tmp_path / "notes" / "weekly"
+ weekly.mkdir(parents=True)
+ (daily / "today.md").write_text("today")
+ (weekly / "review.md").write_text("review")
+
+ await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ folders = (
+ (
+ await db_session.execute(
+ select(Folder).where(Folder.search_space_id == db_search_space.id)
+ )
+ )
+ .scalars()
+ .all()
+ )
+
+ folder_names = {f.name for f in folders}
+ assert "notes" in folder_names
+ assert "daily" in folder_names
+ assert "weekly" in folder_names
+
+ notes_folder = next(f for f in folders if f.name == "notes")
+ daily_folder = next(f for f in folders if f.name == "daily")
+ weekly_folder = next(f for f in folders if f.name == "weekly")
+
+ assert daily_folder.parent_id == notes_folder.id
+ assert weekly_folder.parent_id == notes_folder.id
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_f3_resync_reuses_folders(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """F3: Re-sync reuses existing Folder rows, no duplicates."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ sub = tmp_path / "docs"
+ sub.mkdir()
+ (sub / "file.md").write_text("content")
+
+ _, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ folders_before = (
+ (
+ await db_session.execute(
+ select(Folder).where(Folder.search_space_id == db_search_space.id)
+ )
+ )
+ .scalars()
+ .all()
+ )
+ ids_before = {f.id for f in folders_before}
+
+ await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ root_folder_id=root_folder_id,
+ )
+
+ folders_after = (
+ (
+ await db_session.execute(
+ select(Folder).where(Folder.search_space_id == db_search_space.id)
+ )
+ )
+ .scalars()
+ .all()
+ )
+ ids_after = {f.id for f in folders_after}
+
+ assert ids_before == ids_after
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_f4_folder_id_assigned(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """F4: Documents get correct folder_id based on their directory."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ daily = tmp_path / "notes" / "daily"
+ daily.mkdir(parents=True)
+ (daily / "today.md").write_text("today note")
+ (tmp_path / "root.md").write_text("root note")
+
+ _, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ docs = (
+ (
+ await db_session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ )
+ .scalars()
+ .all()
+ )
+
+ today_doc = next(d for d in docs if d.title == "today.md")
+ root_doc = next(d for d in docs if d.title == "root.md")
+
+ daily_folder = (
+ await db_session.execute(select(Folder).where(Folder.name == "daily"))
+ ).scalar_one()
+
+ assert today_doc.folder_id == daily_folder.id
+
+ assert root_doc.folder_id == root_folder_id
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_f5_empty_folder_cleanup(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """F5: Deleted dir's empty Folder row is cleaned up on re-sync."""
+ import shutil
+
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ daily = tmp_path / "notes" / "daily"
+ daily.mkdir(parents=True)
+ weekly = tmp_path / "notes" / "weekly"
+ weekly.mkdir(parents=True)
+ (daily / "today.md").write_text("today")
+ (weekly / "review.md").write_text("review")
+
+ _, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ weekly_folder = (
+ await db_session.execute(select(Folder).where(Folder.name == "weekly"))
+ ).scalar_one_or_none()
+ assert weekly_folder is not None
+
+ shutil.rmtree(weekly)
+
+ await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ root_folder_id=root_folder_id,
+ )
+
+ weekly_after = (
+ await db_session.execute(select(Folder).where(Folder.name == "weekly"))
+ ).scalar_one_or_none()
+ assert weekly_after is None
+
+ daily_after = (
+ await db_session.execute(select(Folder).where(Folder.name == "daily"))
+ ).scalar_one_or_none()
+ assert daily_after is not None
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_f6_single_file_creates_subfolder(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """F6: Single-file mode creates missing Folder rows and assigns correct folder_id."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "root.md").write_text("root")
+
+ _, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ sub = tmp_path / "notes" / "daily"
+ sub.mkdir(parents=True)
+ (sub / "new.md").write_text("new note in subfolder")
+
+ count, _, _, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ target_file_paths=[str(sub / "new.md")],
+ root_folder_id=root_folder_id,
+ )
+ assert count == 1
+
+ doc = (
+ await db_session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.title == "new.md",
+ )
+ )
+ ).scalar_one()
+
+ daily_folder = (
+ await db_session.execute(select(Folder).where(Folder.name == "daily"))
+ ).scalar_one()
+
+ assert doc.folder_id == daily_folder.id
+ assert daily_folder.parent_id is not None
+
+ notes_folder = (
+ await db_session.execute(select(Folder).where(Folder.name == "notes"))
+ ).scalar_one()
+ assert daily_folder.parent_id == notes_folder.id
+ assert notes_folder.parent_id == root_folder_id
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_f7_single_file_delete_cleans_empty_folders(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """F7: Deleting the only file in a subfolder via batch mode removes empty Folder rows."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ sub = tmp_path / "notes" / "ephemeral"
+ sub.mkdir(parents=True)
+ (sub / "temp.md").write_text("temporary")
+ (tmp_path / "keep.md").write_text("keep this")
+
+ _, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ eph_folder = (
+ await db_session.execute(select(Folder).where(Folder.name == "ephemeral"))
+ ).scalar_one_or_none()
+ assert eph_folder is not None
+
+ target = sub / "temp.md"
+ target.unlink()
+
+ await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ target_file_paths=[str(target)],
+ root_folder_id=root_folder_id,
+ )
+
+ eph_after = (
+ await db_session.execute(select(Folder).where(Folder.name == "ephemeral"))
+ ).scalar_one_or_none()
+ assert eph_after is None
+
+ notes_after = (
+ await db_session.execute(select(Folder).where(Folder.name == "notes"))
+ ).scalar_one_or_none()
+ assert notes_after is None
+
+
+# ====================================================================
+# Tier 6: Batch Mode (B1-B2)
+# ====================================================================
+
+
+class TestBatchMode:
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_b1_batch_indexes_multiple_files(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ patched_batch_sessions,
+ ):
+ """B1: Batch with 3 files indexes all of them."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "a.md").write_text("File A content")
+ (tmp_path / "b.md").write_text("File B content")
+ (tmp_path / "c.md").write_text("File C content")
+
+ count, failed, _root_folder_id, err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ target_file_paths=[
+ str(tmp_path / "a.md"),
+ str(tmp_path / "b.md"),
+ str(tmp_path / "c.md"),
+ ],
+ )
+
+ assert count == 3
+ assert failed == 0
+ assert err is None
+
+ docs = (
+ (
+ await db_session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ )
+ .scalars()
+ .all()
+ )
+ assert len(docs) == 3
+ assert {d.title for d in docs} == {"a.md", "b.md", "c.md"}
+ assert all(
+ DocumentStatus.is_state(d.status, DocumentStatus.READY) for d in docs
+ )
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_b2_partial_failure(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ patched_batch_sessions,
+ ):
+ """B2: One unreadable file fails gracefully; the other two still get indexed."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "good1.md").write_text("Good file one")
+ (tmp_path / "good2.md").write_text("Good file two")
+ (tmp_path / "bad.md").write_bytes(b"\x00binary garbage")
+
+ count, failed, _, err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ target_file_paths=[
+ str(tmp_path / "good1.md"),
+ str(tmp_path / "bad.md"),
+ str(tmp_path / "good2.md"),
+ ],
+ )
+
+ assert count == 2
+ assert failed == 1
+ assert err is not None
+
+ docs = (
+ (
+ await db_session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ )
+ .scalars()
+ .all()
+ )
+ assert len(docs) == 2
+ assert {d.title for d in docs} == {"good1.md", "good2.md"}
+
+
+# ====================================================================
+# Tier 5: Pipeline Integration (P1)
+# ====================================================================
+
+
+class TestPipelineIntegration:
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_p1_local_folder_file_through_pipeline(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ mocker,
+ ):
+ """P1: LOCAL_FOLDER_FILE ConnectorDocument through prepare+index to READY."""
+ from app.indexing_pipeline.connector_document import ConnectorDocument
+ from app.indexing_pipeline.indexing_pipeline_service import (
+ IndexingPipelineService,
+ )
+
+ doc = ConnectorDocument(
+ title="Test Local File",
+ source_markdown="## Local file\n\nContent from disk.",
+ unique_id="test-folder:test.md",
+ document_type=DocumentType.LOCAL_FOLDER_FILE,
+ search_space_id=db_search_space.id,
+ connector_id=None,
+ created_by_id=str(db_user.id),
+ )
+
+ service = IndexingPipelineService(session=db_session)
+ prepared = await service.prepare_for_indexing([doc])
+ assert len(prepared) == 1
+
+ db_doc = prepared[0]
+ result = await service.index(db_doc, doc, llm=mocker.Mock())
+ assert result is not None
+
+ docs = (
+ (
+ await db_session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ )
+ .scalars()
+ .all()
+ )
+ assert len(docs) == 1
+ assert DocumentStatus.is_state(docs[0].status, DocumentStatus.READY)
+
+
+# ====================================================================
+# Tier 7: Direct Converters (DC1-DC4)
+# ====================================================================
+
+
+class TestDirectConvert:
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_dc1_csv_produces_markdown_table(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """DC1: CSV file is indexed as a markdown table, not raw comma-separated text."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "data.csv").write_text("name,age,city\nAlice,30,NYC\nBob,25,LA\n")
+
+ count, _skipped, _root_folder_id, err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ assert err is None
+ assert count == 1
+
+ doc = (
+ await db_session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ ).scalar_one()
+
+ assert "| name" in doc.source_markdown
+ assert "| Alice" in doc.source_markdown
+ assert "name,age,city" not in doc.source_markdown
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_dc2_tsv_produces_markdown_table(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """DC2: TSV file is indexed as a markdown table."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "data.tsv").write_text(
+ "name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA\n"
+ )
+
+ count, _skipped, _root_folder_id, err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ assert err is None
+ assert count == 1
+
+ doc = (
+ await db_session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ ).scalar_one()
+
+ assert "| name" in doc.source_markdown
+ assert "| Alice" in doc.source_markdown
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_dc3_html_produces_clean_markdown(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """DC3: HTML file is indexed as clean markdown, not raw HTML."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+        (tmp_path / "page.html").write_text(
+            "<html><body><h1>Title</h1><p>Hello world</p></body></html>"
+        )
+
+ count, _skipped, _root_folder_id, err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ assert err is None
+ assert count == 1
+
+ doc = (
+ await db_session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ ).scalar_one()
+
+ assert "Title" in doc.source_markdown
+        assert "<h1>" not in doc.source_markdown
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_dc4_csv_single_file_mode(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """DC4: CSV via single-file batch mode also produces a markdown table."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ (tmp_path / "data.csv").write_text("name,age,city\nAlice,30,NYC\nBob,25,LA\n")
+
+ count, _skipped, _root_folder_id, err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ target_file_paths=[str(tmp_path / "data.csv")],
+ )
+
+ assert err is None
+ assert count == 1
+
+ doc = (
+ await db_session.execute(
+ select(Document).where(
+ Document.document_type == DocumentType.LOCAL_FOLDER_FILE,
+ Document.search_space_id == db_search_space.id,
+ )
+ )
+ ).scalar_one()
+
+ assert "| name" in doc.source_markdown
+ assert "name,age,city" not in doc.source_markdown
+
+
+# ====================================================================
+# Tier 8: Page Limits (PL1-PL6)
+# ====================================================================
+
+
+class TestPageLimits:
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_pl1_full_scan_increments_pages_used(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """PL1: Successful full-scan sync increments user.pages_used."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ db_user.pages_used = 0
+ db_user.pages_limit = 500
+ await db_session.flush()
+
+ (tmp_path / "note.md").write_text("# Hello World\n\nContent here.")
+
+ count, _skipped, _root_folder_id, err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ assert err is None
+ assert count == 1
+
+ await db_session.refresh(db_user)
+ assert db_user.pages_used > 0, "pages_used should increase after indexing"
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_pl2_full_scan_blocked_when_limit_exhausted(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """PL2: Full-scan skips file when page limit is exhausted."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ db_user.pages_used = 100
+ db_user.pages_limit = 100
+ await db_session.flush()
+
+ (tmp_path / "note.md").write_text("# Hello World\n\nContent here.")
+
+ count, _skipped, _root_folder_id, err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+
+ assert count == 0
+
+ await db_session.refresh(db_user)
+ assert db_user.pages_used == 100, "pages_used should not change on rejection"
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_pl3_single_file_increments_pages_used(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """PL3: Single-file mode increments user.pages_used on success."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ db_user.pages_used = 0
+ db_user.pages_limit = 500
+ await db_session.flush()
+
+ (tmp_path / "note.md").write_text("# Hello World\n\nContent here.")
+
+ count, _skipped, _root_folder_id, err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ target_file_paths=[str(tmp_path / "note.md")],
+ )
+
+ assert err is None
+ assert count == 1
+
+ await db_session.refresh(db_user)
+ assert db_user.pages_used > 0, "pages_used should increase after indexing"
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_pl4_single_file_blocked_when_limit_exhausted(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """PL4: Single-file mode skips file when page limit is exhausted."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ db_user.pages_used = 100
+ db_user.pages_limit = 100
+ await db_session.flush()
+
+ (tmp_path / "note.md").write_text("# Hello World\n\nContent here.")
+
+ count, _skipped, _root_folder_id, err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ target_file_paths=[str(tmp_path / "note.md")],
+ )
+
+ assert count == 0
+ assert err is not None
+ assert "page limit" in err.lower()
+
+ await db_session.refresh(db_user)
+ assert db_user.pages_used == 100, "pages_used should not change on rejection"
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_pl5_unchanged_resync_no_extra_pages(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ ):
+ """PL5: Re-syncing an unchanged file does not consume additional pages."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ db_user.pages_used = 0
+ db_user.pages_limit = 500
+ await db_session.flush()
+
+ (tmp_path / "note.md").write_text("# Hello\n\nSame content.")
+
+ count1, _, root_folder_id, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ )
+ assert count1 == 1
+
+ await db_session.refresh(db_user)
+ pages_after_first = db_user.pages_used
+ assert pages_after_first > 0
+
+ count2, _, _, _ = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ root_folder_id=root_folder_id,
+ )
+ assert count2 == 0
+
+ await db_session.refresh(db_user)
+ assert db_user.pages_used == pages_after_first, (
+ "pages_used should not increase for unchanged files"
+ )
+
+ @pytest.mark.usefixtures(*UNIFIED_FIXTURES)
+ async def test_pl6_batch_partial_page_limit_exhaustion(
+ self,
+ db_session: AsyncSession,
+ db_user: User,
+ db_search_space: SearchSpace,
+ tmp_path: Path,
+ patched_batch_sessions,
+ ):
+ """PL6: Batch mode with a very low page limit: some files succeed, rest fail."""
+ from app.tasks.connector_indexers.local_folder_indexer import index_local_folder
+
+ db_user.pages_used = 0
+ db_user.pages_limit = 1
+ await db_session.flush()
+
+ (tmp_path / "a.md").write_text("File A content")
+ (tmp_path / "b.md").write_text("File B content")
+ (tmp_path / "c.md").write_text("File C content")
+
+ count, failed, _root_folder_id, _err = await index_local_folder(
+ session=db_session,
+ search_space_id=db_search_space.id,
+ user_id=str(db_user.id),
+ folder_path=str(tmp_path),
+ folder_name="test-folder",
+ target_file_paths=[
+ str(tmp_path / "a.md"),
+ str(tmp_path / "b.md"),
+ str(tmp_path / "c.md"),
+ ],
+ )
+
+ assert count >= 1, "at least one file should succeed"
+ assert failed >= 1, "at least one file should fail due to page limit"
+ assert count + failed == 3
+
+ await db_session.refresh(db_user)
+ assert db_user.pages_used > 0
+ assert db_user.pages_used <= db_user.pages_limit + 1
diff --git a/surfsense_backend/tests/integration/test_document_versioning.py b/surfsense_backend/tests/integration/test_document_versioning.py
new file mode 100644
index 000000000..9bd03d219
--- /dev/null
+++ b/surfsense_backend/tests/integration/test_document_versioning.py
@@ -0,0 +1,167 @@
+"""Integration tests for document versioning snapshot + cleanup."""
+
+from datetime import UTC, datetime, timedelta
+
+import pytest
+import pytest_asyncio
+from sqlalchemy import func, select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Document, DocumentType, DocumentVersion, SearchSpace, User
+
+pytestmark = pytest.mark.integration
+
+
+@pytest_asyncio.fixture
+async def db_document(
+ db_session: AsyncSession, db_user: User, db_search_space: SearchSpace
+) -> Document:
+ doc = Document(
+ title="Test Doc",
+ document_type=DocumentType.LOCAL_FOLDER_FILE,
+ document_metadata={},
+ content="Summary of test doc.",
+ content_hash="abc123",
+ unique_identifier_hash="local_folder:test-folder:test.md",
+ source_markdown="# Test\n\nOriginal content.",
+ search_space_id=db_search_space.id,
+ created_by_id=db_user.id,
+ )
+ db_session.add(doc)
+ await db_session.flush()
+ return doc
+
+
+async def _version_count(session: AsyncSession, document_id: int) -> int:
+ result = await session.execute(
+ select(func.count())
+ .select_from(DocumentVersion)
+ .where(DocumentVersion.document_id == document_id)
+ )
+ return result.scalar_one()
+
+
+async def _get_versions(
+ session: AsyncSession, document_id: int
+) -> list[DocumentVersion]:
+ result = await session.execute(
+ select(DocumentVersion)
+ .where(DocumentVersion.document_id == document_id)
+ .order_by(DocumentVersion.version_number)
+ )
+ return list(result.scalars().all())
+
+
+class TestCreateVersionSnapshot:
+ """V1-V5: TDD slices for create_version_snapshot."""
+
+ async def test_v1_creates_first_version(self, db_session, db_document):
+ """V1: First snapshot creates version 1 with the document's current state."""
+ from app.utils.document_versioning import create_version_snapshot
+
+ await create_version_snapshot(db_session, db_document)
+
+ versions = await _get_versions(db_session, db_document.id)
+ assert len(versions) == 1
+ assert versions[0].version_number == 1
+ assert versions[0].source_markdown == "# Test\n\nOriginal content."
+ assert versions[0].content_hash == "abc123"
+ assert versions[0].title == "Test Doc"
+ assert versions[0].document_id == db_document.id
+
+ async def test_v2_creates_version_2_after_30_min(
+ self, db_session, db_document, monkeypatch
+ ):
+ """V2: After 30+ minutes, a new version is created (not overwritten)."""
+ from app.utils.document_versioning import create_version_snapshot
+
+ t0 = datetime(2025, 1, 1, 12, 0, 0, tzinfo=UTC)
+ monkeypatch.setattr("app.utils.document_versioning._now", lambda: t0)
+ await create_version_snapshot(db_session, db_document)
+
+ # Simulate content change and time passing
+ db_document.source_markdown = "# Test\n\nUpdated content."
+ db_document.content_hash = "def456"
+ t1 = t0 + timedelta(minutes=31)
+ monkeypatch.setattr("app.utils.document_versioning._now", lambda: t1)
+ await create_version_snapshot(db_session, db_document)
+
+ versions = await _get_versions(db_session, db_document.id)
+ assert len(versions) == 2
+ assert versions[0].version_number == 1
+ assert versions[1].version_number == 2
+ assert versions[1].source_markdown == "# Test\n\nUpdated content."
+
+ async def test_v3_overwrites_within_30_min(
+ self, db_session, db_document, monkeypatch
+ ):
+ """V3: Within 30 minutes, the latest version is overwritten."""
+ from app.utils.document_versioning import create_version_snapshot
+
+ t0 = datetime(2025, 1, 1, 12, 0, 0, tzinfo=UTC)
+ monkeypatch.setattr("app.utils.document_versioning._now", lambda: t0)
+ await create_version_snapshot(db_session, db_document)
+ count_after_first = await _version_count(db_session, db_document.id)
+ assert count_after_first == 1
+
+ # Simulate quick edit within 30 minutes
+ db_document.source_markdown = "# Test\n\nQuick edit."
+ db_document.content_hash = "quick123"
+ t1 = t0 + timedelta(minutes=10)
+ monkeypatch.setattr("app.utils.document_versioning._now", lambda: t1)
+ await create_version_snapshot(db_session, db_document)
+
+ count_after_second = await _version_count(db_session, db_document.id)
+ assert count_after_second == 1 # still 1, not 2
+
+ versions = await _get_versions(db_session, db_document.id)
+ assert versions[0].source_markdown == "# Test\n\nQuick edit."
+ assert versions[0].content_hash == "quick123"
+
+ async def test_v4_cleanup_90_day_old_versions(
+ self, db_session, db_document, monkeypatch
+ ):
+ """V4: Versions older than 90 days are cleaned up."""
+ from app.utils.document_versioning import create_version_snapshot
+
+ base = datetime(2025, 1, 1, 12, 0, 0, tzinfo=UTC)
+
+ # Create 5 versions spread across time: 3 older than 90 days, 2 recent
+ for i in range(5):
+ db_document.source_markdown = f"Content v{i + 1}"
+ db_document.content_hash = f"hash_{i + 1}"
+ t = base + timedelta(days=i) if i < 3 else base + timedelta(days=120 + i)
+ monkeypatch.setattr("app.utils.document_versioning._now", lambda _t=t: _t)
+ await create_version_snapshot(db_session, db_document)
+
+ # Now trigger cleanup from a "current" time that makes the first 3 versions > 90 days old
+ now = base + timedelta(days=200)
+ monkeypatch.setattr("app.utils.document_versioning._now", lambda: now)
+ db_document.source_markdown = "Content v6"
+ db_document.content_hash = "hash_6"
+ await create_version_snapshot(db_session, db_document)
+
+ versions = await _get_versions(db_session, db_document.id)
+ # The first 3 (old) should be cleaned up; versions 4, 5, 6 remain
+ for v in versions:
+ age = now - v.created_at.replace(tzinfo=UTC)
+ assert age <= timedelta(days=90), f"Version {v.version_number} is too old"
+
+ async def test_v5_cap_at_20_versions(self, db_session, db_document, monkeypatch):
+ """V5: More than 20 versions triggers cap — oldest gets deleted."""
+ from app.utils.document_versioning import create_version_snapshot
+
+ base = datetime(2025, 6, 1, 12, 0, 0, tzinfo=UTC)
+
+ # Create 21 versions (all within 90 days, each 31 min apart)
+ for i in range(21):
+ db_document.source_markdown = f"Content v{i + 1}"
+ db_document.content_hash = f"hash_{i + 1}"
+ t = base + timedelta(minutes=31 * i)
+ monkeypatch.setattr("app.utils.document_versioning._now", lambda _t=t: _t)
+ await create_version_snapshot(db_session, db_document)
+
+ versions = await _get_versions(db_session, db_document.id)
+ assert len(versions) == 20
+ # The lowest version_number should be 2 (version 1 was the oldest and got capped)
+ assert versions[0].version_number == 2
diff --git a/surfsense_backend/tests/unit/connector_indexers/test_local_folder_scan.py b/surfsense_backend/tests/unit/connector_indexers/test_local_folder_scan.py
new file mode 100644
index 000000000..c6e7b160c
--- /dev/null
+++ b/surfsense_backend/tests/unit/connector_indexers/test_local_folder_scan.py
@@ -0,0 +1,78 @@
+"""Unit tests for scan_folder() pure logic — Tier 2 TDD slices (S1-S4)."""
+
+from pathlib import Path
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+class TestScanFolder:
+ """S1-S4: scan_folder() with real tmp_path filesystem."""
+
+ def test_s1_single_md_file(self, tmp_path: Path):
+ """S1: scan_folder on a dir with one .md file returns correct entry."""
+ from app.tasks.connector_indexers.local_folder_indexer import scan_folder
+
+ md = tmp_path / "note.md"
+ md.write_text("# Hello")
+
+ results = scan_folder(str(tmp_path))
+
+ assert len(results) == 1
+ entry = results[0]
+ assert entry["relative_path"] == "note.md"
+ assert entry["size"] > 0
+ assert "modified_at" in entry
+ assert entry["path"] == str(md)
+
+ def test_s2_extension_filter(self, tmp_path: Path):
+ """S2: file_extensions filter returns only matching files."""
+ from app.tasks.connector_indexers.local_folder_indexer import scan_folder
+
+ (tmp_path / "a.md").write_text("md")
+ (tmp_path / "b.txt").write_text("txt")
+ (tmp_path / "c.pdf").write_bytes(b"%PDF")
+
+ results = scan_folder(str(tmp_path), file_extensions=[".md"])
+ names = {r["relative_path"] for r in results}
+
+ assert names == {"a.md"}
+
+ def test_s3_exclude_patterns(self, tmp_path: Path):
+ """S3: exclude_patterns skips files inside excluded directories."""
+ from app.tasks.connector_indexers.local_folder_indexer import scan_folder
+
+ (tmp_path / "good.md").write_text("good")
+ nm = tmp_path / "node_modules"
+ nm.mkdir()
+ (nm / "dep.js").write_text("module")
+ git = tmp_path / ".git"
+ git.mkdir()
+ (git / "config").write_text("gitconfig")
+
+ results = scan_folder(str(tmp_path), exclude_patterns=["node_modules", ".git"])
+ names = {r["relative_path"] for r in results}
+
+ assert "good.md" in names
+ assert not any("node_modules" in n for n in names)
+ assert not any(".git" in n for n in names)
+
+ def test_s4_nested_dirs(self, tmp_path: Path):
+ """S4: nested subdirectories produce correct relative paths."""
+ from app.tasks.connector_indexers.local_folder_indexer import scan_folder
+
+ daily = tmp_path / "notes" / "daily"
+ daily.mkdir(parents=True)
+ weekly = tmp_path / "notes" / "weekly"
+ weekly.mkdir(parents=True)
+ (daily / "today.md").write_text("today")
+ (weekly / "review.md").write_text("review")
+ (tmp_path / "root.txt").write_text("root")
+
+ results = scan_folder(str(tmp_path))
+ paths = {r["relative_path"] for r in results}
+
+ assert "notes/daily/today.md" in paths or "notes\\daily\\today.md" in paths
+ assert "notes/weekly/review.md" in paths or "notes\\weekly\\review.md" in paths
+ assert "root.txt" in paths
diff --git a/surfsense_desktop/package.json b/surfsense_desktop/package.json
index bd0cc67ab..21e7f4bea 100644
--- a/surfsense_desktop/package.json
+++ b/surfsense_desktop/package.json
@@ -27,6 +27,8 @@
"wait-on": "^9.0.4"
},
"dependencies": {
+ "chokidar": "^5.0.0",
+ "electron-store": "^11.0.2",
"electron-updater": "^6.8.3",
"get-port-please": "^3.2.0"
}
diff --git a/surfsense_desktop/pnpm-lock.yaml b/surfsense_desktop/pnpm-lock.yaml
index ea65be0bb..528f81539 100644
--- a/surfsense_desktop/pnpm-lock.yaml
+++ b/surfsense_desktop/pnpm-lock.yaml
@@ -8,6 +8,12 @@ importers:
.:
dependencies:
+ chokidar:
+ specifier: ^5.0.0
+ version: 5.0.0
+ electron-store:
+ specifier: ^11.0.2
+ version: 11.0.2
electron-updater:
specifier: ^6.8.3
version: 6.8.3
@@ -352,6 +358,14 @@ packages:
resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==}
engines: {node: '>= 14'}
+ ajv-formats@3.0.1:
+ resolution: {integrity: sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ==}
+ peerDependencies:
+ ajv: ^8.0.0
+ peerDependenciesMeta:
+ ajv:
+ optional: true
+
ajv-keywords@3.5.2:
resolution: {integrity: sha512-5p6WTN0DdTGVQk6VjcEju19IgaHudalcfabD7yhDGeA6bcQnmL+CpveLJq/3hvfwd1aof6L386Ougkx6RfyMIQ==}
peerDependencies:
@@ -360,6 +374,9 @@ packages:
ajv@6.14.0:
resolution: {integrity: sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==}
+ ajv@8.18.0:
+ resolution: {integrity: sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==}
+
ansi-regex@5.0.1:
resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==}
engines: {node: '>=8'}
@@ -411,6 +428,9 @@ packages:
resolution: {integrity: sha512-+q/t7Ekv1EDY2l6Gda6LLiX14rU9TV20Wa3ofeQmwPFZbOMo9DXrLbOjFaaclkXKWidIaopwAObQDqwWtGUjqg==}
engines: {node: '>= 4.0.0'}
+ atomically@2.1.1:
+ resolution: {integrity: sha512-P4w9o2dqARji6P7MHprklbfiArZAWvo07yW7qs3pdljb3BWr12FIB7W+p0zJiuiVsUpRO0iZn1kFFcpPegg0tQ==}
+
axios@1.13.6:
resolution: {integrity: sha512-ChTCHMouEe2kn713WHbQGcuYrr6fXTBiu460OTwWrWob16g1bXn4vtz07Ope7ewMozJAnEquLk5lWQWtBig9DQ==}
@@ -477,6 +497,10 @@ packages:
resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==}
engines: {node: '>=10'}
+ chokidar@5.0.0:
+ resolution: {integrity: sha512-TQMmc3w+5AxjpL8iIiwebF73dRDF4fBIieAqGn9RGCWaEVwQ6Fb2cGe31Yns0RRIzii5goJ1Y7xbMwo1TxMplw==}
+ engines: {node: '>= 20.19.0'}
+
chownr@3.0.0:
resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==}
engines: {node: '>=18'}
@@ -546,6 +570,10 @@ packages:
engines: {node: '>=18'}
hasBin: true
+ conf@15.1.0:
+ resolution: {integrity: sha512-Uy5YN9KEu0WWDaZAVJ5FAmZoaJt9rdK6kH+utItPyGsCqCgaTKkrmZx3zoE0/3q6S3bcp3Ihkk+ZqPxWxFK5og==}
+ engines: {node: '>=20'}
+
core-util-is@1.0.2:
resolution: {integrity: sha512-3lqz5YjWTYnW6dlDa5TLaTCcShfar1e40rmcJVwCBJC6mWlFuj0eCHIElmG1g5kyuJ/GD+8Wn4FFCcz4gJPfaQ==}
@@ -559,6 +587,10 @@ packages:
resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==}
engines: {node: '>= 8'}
+ debounce-fn@6.0.0:
+ resolution: {integrity: sha512-rBMW+F2TXryBwB54Q0d8drNEI+TfoS9JpNTAoVpukbWEhjXQq4rySFYLaqXMFXwdv61Zb2OHtj5bviSoimqxRQ==}
+ engines: {node: '>=18'}
+
debug@4.4.3:
resolution: {integrity: sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==}
engines: {node: '>=6.0'}
@@ -610,6 +642,10 @@ packages:
os: [darwin]
hasBin: true
+ dot-prop@10.1.0:
+ resolution: {integrity: sha512-MVUtAugQMOff5RnBy2d9N31iG0lNwg1qAoAOn7pOK5wf94WIaE3My2p3uwTQuvS2AcqchkcR3bHByjaM0mmi7Q==}
+ engines: {node: '>=20'}
+
dotenv-expand@11.0.7:
resolution: {integrity: sha512-zIHwmZPRshsCdpMDyVsqGmgyP0yT8GAgXUnkdAoJisxvf33k7yO6OuoKmcTGuXPWSsm8Oh88nZicRLA9Y0rUeA==}
engines: {node: '>=12'}
@@ -645,6 +681,10 @@ packages:
electron-publish@26.8.1:
resolution: {integrity: sha512-q+jrSTIh/Cv4eGZa7oVR+grEJo/FoLMYBAnSL5GCtqwUpr1T+VgKB/dn1pnzxIxqD8S/jP1yilT9VrwCqINR4w==}
+ electron-store@11.0.2:
+ resolution: {integrity: sha512-4VkNRdN+BImL2KcCi41WvAYbh6zLX5AUTi4so68yPqiItjbgTjqpEnGAqasgnG+lB6GuAyUltKwVopp6Uv+gwQ==}
+ engines: {node: '>=20'}
+
electron-updater@6.8.3:
resolution: {integrity: sha512-Z6sgw3jgbikWKXei1ENdqFOxBP0WlXg3TtKfz0rgw2vIZFJUyI4pD7ZN7jrkm7EoMK+tcm/qTnPUdqfZukBlBQ==}
@@ -673,6 +713,10 @@ packages:
resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==}
engines: {node: '>=6'}
+ env-paths@3.0.0:
+ resolution: {integrity: sha512-dtJUTepzMW3Lm/NPxRf3wP4642UWhjL2sQxc+ym2YMj1m/H2zDNQOlezafzkHwn6sMstjHTwG6iQQsctDW/b1A==}
+ engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
+
err-code@2.0.3:
resolution: {integrity: sha512-2bmlRpNKBxT/CRmPOlyISQpNj+qSeYvcym/uT0Jx2bMOlKLtSy1ZmLuVxSEKKyor/N5yhvp/ZiG1oE3DEYMSFA==}
@@ -726,6 +770,9 @@ packages:
fast-json-stable-stringify@2.1.0:
resolution: {integrity: sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==}
+ fast-uri@3.1.0:
+ resolution: {integrity: sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==}
+
fd-slicer@1.1.0:
resolution: {integrity: sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==}
@@ -953,6 +1000,12 @@ packages:
json-schema-traverse@0.4.1:
resolution: {integrity: sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==}
+ json-schema-traverse@1.0.0:
+ resolution: {integrity: sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==}
+
+ json-schema-typed@8.0.2:
+ resolution: {integrity: sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA==}
+
json-stringify-safe@5.0.1:
resolution: {integrity: sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==}
@@ -983,6 +1036,9 @@ packages:
lodash@4.17.23:
resolution: {integrity: sha512-LgVTMpQtIopCi79SJeDiP0TfWi5CNEc/L/aRdTh3yIvmZXTnheWpKjSZhnvMl8iXbC1tFg9gdHHDMLoV7CnG+w==}
+ lodash@4.18.1:
+ resolution: {integrity: sha512-dMInicTPVE8d1e5otfwmmjlxkZoUpiVLwyeTdUsi/Caj/gfzzblBcCE5sRHV/AsjuCmxWrte2TNGSYuCeCq+0Q==}
+
log-symbols@4.1.0:
resolution: {integrity: sha512-8XPvpAA8uyhfteu8pIvQxpJZ7SYYdpUivZpGy6sFsBuKRY/7rQGavedeB8aK+Zkyq6upMFVL/9AW6vOYzfRyLg==}
engines: {node: '>=10'}
@@ -1027,6 +1083,10 @@ packages:
resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==}
engines: {node: '>=6'}
+ mimic-function@5.0.1:
+ resolution: {integrity: sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==}
+ engines: {node: '>=18'}
+
mimic-response@1.0.1:
resolution: {integrity: sha512-j5EctnkH7amfV/q5Hgmoal1g2QHFJRraOtmx0JpIqkxhBhI/lJSl1nMpQ45hVarwNETOoWEimndZ4QK0RHxuxQ==}
engines: {node: '>=4'}
@@ -1222,10 +1282,18 @@ packages:
resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==}
engines: {node: '>= 6'}
+ readdirp@5.0.0:
+ resolution: {integrity: sha512-9u/XQ1pvrQtYyMpZe7DXKv2p5CNvyVwzUB6uhLAnQwHMSgKMBR62lc7AHljaeteeHXn11XTAaLLUVZYVZyuRBQ==}
+ engines: {node: '>= 20.19.0'}
+
require-directory@2.1.1:
resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==}
engines: {node: '>=0.10.0'}
+ require-from-string@2.0.2:
+ resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==}
+ engines: {node: '>=0.10.0'}
+
resedit@1.7.2:
resolution: {integrity: sha512-vHjcY2MlAITJhC0eRD/Vv8Vlgmu9Sd3LX9zZvtGzU5ZImdTN3+d6e/4mnTyV8vEbyf1sgNIrWxhWlrys52OkEA==}
engines: {node: '>=12', npm: '>=6'}
@@ -1365,6 +1433,12 @@ packages:
resolution: {integrity: sha512-yDPMNjp4WyfYBkHnjIRLfca1i6KMyGCtsVgoKe/z1+6vukgaENdgGBZt+ZmKPc4gavvEZ5OgHfHdrazhgNyG7w==}
engines: {node: '>=12'}
+ stubborn-fs@2.0.0:
+ resolution: {integrity: sha512-Y0AvSwDw8y+nlSNFXMm2g6L51rBGdAQT20J3YSOqxC53Lo3bjWRtr2BKcfYoAf352WYpsZSTURrA0tqhfgudPA==}
+
+ stubborn-utils@1.0.2:
+ resolution: {integrity: sha512-zOh9jPYI+xrNOyisSelgym4tolKTJCQd5GBhK0+0xJvcYDcwlOoxF/rnFKQ2KRZknXSG9jWAp66fwP6AxN9STg==}
+
sumchecker@3.0.1:
resolution: {integrity: sha512-MvjXzkz/BOfyVDkG0oFOtBxHX2u3gKbMHIF/dXblZsgD3BWOFLmHovIpZY7BykJdAjcqRCBi1WYBNdEC9yI7vg==}
engines: {node: '>= 8.0'}
@@ -1377,6 +1451,10 @@ packages:
resolution: {integrity: sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==}
engines: {node: '>=10'}
+ tagged-tag@1.0.0:
+ resolution: {integrity: sha512-yEFYrVhod+hdNyx7g5Bnkkb0G6si8HJurOoOEgC8B/O0uXLHlaey/65KRv6cuWBNhBgHKAROVpc7QyYqE5gFng==}
+ engines: {node: '>=20'}
+
tar@7.5.11:
resolution: {integrity: sha512-ChjMH33/KetonMTAtpYdgUFr0tbz69Fp2v7zWxQfYZX4g5ZN2nOBXm1R2xyA+lMIKrLKIoKAwFj93jE/avX9cQ==}
engines: {node: '>=18'}
@@ -1419,11 +1497,19 @@ packages:
resolution: {integrity: sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==}
engines: {node: '>=10'}
+ type-fest@5.5.0:
+ resolution: {integrity: sha512-PlBfpQwiUvGViBNX84Yxwjsdhd1TUlXr6zjX7eoirtCPIr08NAmxwa+fcYBTeRQxHo9YC9wwF3m9i700sHma8g==}
+ engines: {node: '>=20'}
+
typescript@5.9.3:
resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==}
engines: {node: '>=14.17'}
hasBin: true
+ uint8array-extras@1.5.0:
+ resolution: {integrity: sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A==}
+ engines: {node: '>=18'}
+
undici-types@7.16.0:
resolution: {integrity: sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==}
@@ -1467,6 +1553,9 @@ packages:
wcwidth@1.0.1:
resolution: {integrity: sha512-XHPEwS0q6TaxcvG85+8EYkbiCux2XtWG2mkc47Ng2A77BQu9+DqIOJldST4HgPkuea7dvKSj5VgX3P1d4rW8Tg==}
+ when-exit@2.1.5:
+ resolution: {integrity: sha512-VGkKJ564kzt6Ms1dbgPP/yuIoQCrsFAnRbptpC5wOEsDaNsbCB2bnfnaA8i/vRs5tjUSEOtIuvl9/MyVsvQZCg==}
+
which@2.0.2:
resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==}
engines: {node: '>= 8'}
@@ -1827,6 +1916,10 @@ snapshots:
agent-base@7.1.4: {}
+ ajv-formats@3.0.1(ajv@8.18.0):
+ optionalDependencies:
+ ajv: 8.18.0
+
ajv-keywords@3.5.2(ajv@6.14.0):
dependencies:
ajv: 6.14.0
@@ -1838,6 +1931,13 @@ snapshots:
json-schema-traverse: 0.4.1
uri-js: 4.4.1
+ ajv@8.18.0:
+ dependencies:
+ fast-deep-equal: 3.1.3
+ fast-uri: 3.1.0
+ json-schema-traverse: 1.0.0
+ require-from-string: 2.0.2
+
ansi-regex@5.0.1: {}
ansi-regex@6.2.2: {}
@@ -1909,6 +2009,11 @@ snapshots:
at-least-node@1.0.0: {}
+ atomically@2.1.1:
+ dependencies:
+ stubborn-fs: 2.0.0
+ when-exit: 2.1.5
+
axios@1.13.6:
dependencies:
follow-redirects: 1.15.11
@@ -2019,6 +2124,10 @@ snapshots:
ansi-styles: 4.3.0
supports-color: 7.2.0
+ chokidar@5.0.0:
+ dependencies:
+ readdirp: 5.0.0
+
chownr@3.0.0: {}
chromium-pickle-js@0.2.0: {}
@@ -2079,6 +2188,18 @@ snapshots:
tree-kill: 1.2.2
yargs: 17.7.2
+ conf@15.1.0:
+ dependencies:
+ ajv: 8.18.0
+ ajv-formats: 3.0.1(ajv@8.18.0)
+ atomically: 2.1.1
+ debounce-fn: 6.0.0
+ dot-prop: 10.1.0
+ env-paths: 3.0.0
+ json-schema-typed: 8.0.2
+ semver: 7.7.4
+ uint8array-extras: 1.5.0
+
core-util-is@1.0.2:
optional: true
@@ -2096,6 +2217,10 @@ snapshots:
shebang-command: 2.0.0
which: 2.0.2
+ debounce-fn@6.0.0:
+ dependencies:
+ mimic-function: 5.0.1
+
debug@4.4.3:
dependencies:
ms: 2.1.3
@@ -2161,6 +2286,10 @@ snapshots:
verror: 1.10.1
optional: true
+ dot-prop@10.1.0:
+ dependencies:
+ type-fest: 5.5.0
+
dotenv-expand@11.0.7:
dependencies:
dotenv: 16.6.1
@@ -2219,6 +2348,11 @@ snapshots:
transitivePeerDependencies:
- supports-color
+ electron-store@11.0.2:
+ dependencies:
+ conf: 15.1.0
+ type-fest: 5.5.0
+
electron-updater@6.8.3:
dependencies:
builder-util-runtime: 9.5.1
@@ -2237,7 +2371,7 @@ snapshots:
'@electron/asar': 3.4.1
debug: 4.4.3
fs-extra: 7.0.1
- lodash: 4.17.23
+ lodash: 4.18.1
temp: 0.9.4
optionalDependencies:
'@electron/windows-sign': 1.2.2
@@ -2267,6 +2401,8 @@ snapshots:
env-paths@2.2.1: {}
+ env-paths@3.0.0: {}
+
err-code@2.0.3: {}
es-define-property@1.0.1: {}
@@ -2340,6 +2476,8 @@ snapshots:
fast-json-stable-stringify@2.1.0: {}
+ fast-uri@3.1.0: {}
+
fd-slicer@1.1.0:
dependencies:
pend: 1.2.0
@@ -2595,6 +2733,10 @@ snapshots:
json-schema-traverse@0.4.1: {}
+ json-schema-traverse@1.0.0: {}
+
+ json-schema-typed@8.0.2: {}
+
json-stringify-safe@5.0.1:
optional: true
@@ -2622,6 +2764,8 @@ snapshots:
lodash@4.17.23: {}
+ lodash@4.18.1: {}
+
log-symbols@4.1.0:
dependencies:
chalk: 4.1.2
@@ -2668,6 +2812,8 @@ snapshots:
mimic-fn@2.1.0: {}
+ mimic-function@5.0.1: {}
+
mimic-response@1.0.1: {}
mimic-response@3.1.0: {}
@@ -2863,8 +3009,12 @@ snapshots:
string_decoder: 1.3.0
util-deprecate: 1.0.2
+ readdirp@5.0.0: {}
+
require-directory@2.1.1: {}
+ require-from-string@2.0.2: {}
+
resedit@1.7.2:
dependencies:
pe-library: 0.4.1
@@ -3002,6 +3152,12 @@ snapshots:
dependencies:
ansi-regex: 6.2.2
+ stubborn-fs@2.0.0:
+ dependencies:
+ stubborn-utils: 1.0.2
+
+ stubborn-utils@1.0.2: {}
+
sumchecker@3.0.1:
dependencies:
debug: 4.4.3
@@ -3016,6 +3172,8 @@ snapshots:
dependencies:
has-flag: 4.0.0
+ tagged-tag@1.0.0: {}
+
tar@7.5.11:
dependencies:
'@isaacs/fs-minipass': 4.0.1
@@ -3062,8 +3220,14 @@ snapshots:
type-fest@0.13.1:
optional: true
+ type-fest@5.5.0:
+ dependencies:
+ tagged-tag: 1.0.0
+
typescript@5.9.3: {}
+ uint8array-extras@1.5.0: {}
+
undici-types@7.16.0: {}
undici-types@7.18.2: {}
@@ -3109,6 +3273,8 @@ snapshots:
dependencies:
defaults: 1.0.4
+ when-exit@2.1.5: {}
+
which@2.0.2:
dependencies:
isexe: 2.0.0
diff --git a/surfsense_desktop/src/ipc/channels.ts b/surfsense_desktop/src/ipc/channels.ts
index 25ec1bc0e..2000964c7 100644
--- a/surfsense_desktop/src/ipc/channels.ts
+++ b/surfsense_desktop/src/ipc/channels.ts
@@ -6,4 +6,19 @@ export const IPC_CHANNELS = {
SET_QUICK_ASK_MODE: 'set-quick-ask-mode',
GET_QUICK_ASK_MODE: 'get-quick-ask-mode',
REPLACE_TEXT: 'replace-text',
+ // Folder sync channels
+ FOLDER_SYNC_SELECT_FOLDER: 'folder-sync:select-folder',
+ FOLDER_SYNC_ADD_FOLDER: 'folder-sync:add-folder',
+ FOLDER_SYNC_REMOVE_FOLDER: 'folder-sync:remove-folder',
+ FOLDER_SYNC_GET_FOLDERS: 'folder-sync:get-folders',
+ FOLDER_SYNC_GET_STATUS: 'folder-sync:get-status',
+ FOLDER_SYNC_FILE_CHANGED: 'folder-sync:file-changed',
+ FOLDER_SYNC_WATCHER_READY: 'folder-sync:watcher-ready',
+ FOLDER_SYNC_PAUSE: 'folder-sync:pause',
+ FOLDER_SYNC_RESUME: 'folder-sync:resume',
+ FOLDER_SYNC_RENDERER_READY: 'folder-sync:renderer-ready',
+ FOLDER_SYNC_GET_PENDING_EVENTS: 'folder-sync:get-pending-events',
+ FOLDER_SYNC_ACK_EVENTS: 'folder-sync:ack-events',
+ BROWSE_FILES: 'browse:files',
+ READ_LOCAL_FILES: 'browse:read-local-files',
} as const;
diff --git a/surfsense_desktop/src/ipc/handlers.ts b/surfsense_desktop/src/ipc/handlers.ts
index 18e343719..c4251b30b 100644
--- a/surfsense_desktop/src/ipc/handlers.ts
+++ b/surfsense_desktop/src/ipc/handlers.ts
@@ -1,5 +1,19 @@
import { app, ipcMain, shell } from 'electron';
import { IPC_CHANNELS } from './channels';
+import {
+ selectFolder,
+ addWatchedFolder,
+ removeWatchedFolder,
+ getWatchedFolders,
+ getWatcherStatus,
+ getPendingFileEvents,
+ acknowledgeFileEvents,
+ pauseWatcher,
+ resumeWatcher,
+ markRendererReady,
+ browseFiles,
+ readLocalFiles,
+} from '../modules/folder-watcher';
export function registerIpcHandlers(): void {
ipcMain.on(IPC_CHANNELS.OPEN_EXTERNAL, (_event, url: string) => {
@@ -16,4 +30,41 @@ export function registerIpcHandlers(): void {
ipcMain.handle(IPC_CHANNELS.GET_APP_VERSION, () => {
return app.getVersion();
});
+
+ // Folder sync handlers
+ ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_SELECT_FOLDER, () => selectFolder());
+
+ ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_ADD_FOLDER, (_event, config) =>
+ addWatchedFolder(config)
+ );
+
+ ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_REMOVE_FOLDER, (_event, folderPath: string) =>
+ removeWatchedFolder(folderPath)
+ );
+
+ ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_GET_FOLDERS, () => getWatchedFolders());
+
+ ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_GET_STATUS, () => getWatcherStatus());
+
+ ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_PAUSE, () => pauseWatcher());
+
+ ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_RESUME, () => resumeWatcher());
+
+ ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_RENDERER_READY, () => {
+ markRendererReady();
+ });
+
+ ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_GET_PENDING_EVENTS, () =>
+ getPendingFileEvents()
+ );
+
+ ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_ACK_EVENTS, (_event, eventIds: string[]) =>
+ acknowledgeFileEvents(eventIds)
+ );
+
+ ipcMain.handle(IPC_CHANNELS.BROWSE_FILES, () => browseFiles());
+
+ ipcMain.handle(IPC_CHANNELS.READ_LOCAL_FILES, (_event, paths: string[]) =>
+ readLocalFiles(paths)
+ );
}
diff --git a/surfsense_desktop/src/main.ts b/surfsense_desktop/src/main.ts
index 3ab41073b..f745d9b5e 100644
--- a/surfsense_desktop/src/main.ts
+++ b/surfsense_desktop/src/main.ts
@@ -6,6 +6,7 @@ import { setupDeepLinks, handlePendingDeepLink } from './modules/deep-links';
import { setupAutoUpdater } from './modules/auto-updater';
import { setupMenu } from './modules/menu';
import { registerQuickAsk, unregisterQuickAsk } from './modules/quick-ask';
+import { registerFolderWatcher, unregisterFolderWatcher } from './modules/folder-watcher';
import { registerIpcHandlers } from './ipc/handlers';
registerGlobalErrorHandlers();
@@ -28,6 +29,7 @@ app.whenReady().then(async () => {
}
createMainWindow();
registerQuickAsk();
+ registerFolderWatcher();
setupAutoUpdater();
handlePendingDeepLink();
@@ -47,4 +49,5 @@ app.on('window-all-closed', () => {
app.on('will-quit', () => {
unregisterQuickAsk();
+ unregisterFolderWatcher();
});
diff --git a/surfsense_desktop/src/modules/folder-watcher.ts b/surfsense_desktop/src/modules/folder-watcher.ts
new file mode 100644
index 000000000..969dabe97
--- /dev/null
+++ b/surfsense_desktop/src/modules/folder-watcher.ts
@@ -0,0 +1,534 @@
+import { BrowserWindow, dialog } from 'electron';
+import chokidar, { type FSWatcher } from 'chokidar';
+import { randomUUID } from 'crypto';
+import * as path from 'path';
+import * as fs from 'fs';
+import { IPC_CHANNELS } from '../ipc/channels';
+
+export interface WatchedFolderConfig {
+ path: string;
+ name: string;
+ excludePatterns: string[];
+ fileExtensions: string[] | null;
+ rootFolderId: number | null;
+ searchSpaceId: number;
+ active: boolean;
+}
+
+interface WatcherEntry {
+ config: WatchedFolderConfig;
+ watcher: FSWatcher | null;
+}
+
+type MtimeMap = Record<string, number>;
+type FolderSyncAction = 'add' | 'change' | 'unlink';
+
+export interface FolderSyncFileChangedEvent {
+ id: string;
+ rootFolderId: number | null;
+ searchSpaceId: number;
+ folderPath: string;
+ folderName: string;
+ relativePath: string;
+ fullPath: string;
+ action: FolderSyncAction;
+ timestamp: number;
+}
+
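+// electron-store keys for persisted watcher state, plus the tolerance (in seconds) used when
+// comparing stored vs. current mtimes so sub-second filesystem jitter is not flagged as a change.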
+const STORE_KEY = 'watchedFolders';
+const OUTBOX_STORE_KEY = 'events';
+const MTIME_TOLERANCE_S = 1.0;
+
+let store: any = null;
+let mtimeStore: any = null;
+let outboxStore: any = null;
+let watchers: Map<string, WatcherEntry> = new Map();
+
+/**
+ * In-memory cache of mtime maps, keyed by folder path.
+ * Persisted to electron-store on mutation.
+ */
+const mtimeMaps: Map<string, MtimeMap> = new Map();
+
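+// File-change events are buffered in a persisted outbox until the renderer acknowledges them,
+// so events emitted before the renderer is ready (or across app restarts) are not lost.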
+let rendererReady = false;
+const outboxEvents: Map<string, FolderSyncFileChangedEvent> = new Map();
+let outboxLoaded = false;
+
+export function markRendererReady() {
+ rendererReady = true;
+}
+
+async function getStore() {
+ if (!store) {
+ const { default: Store } = await import('electron-store');
+ store = new Store({
+ name: 'folder-watcher',
+ defaults: {
+ [STORE_KEY]: [] as WatchedFolderConfig[],
+ },
+ });
+ }
+ return store;
+}
+
+async function getMtimeStore() {
+ if (!mtimeStore) {
+ const { default: Store } = await import('electron-store');
+ mtimeStore = new Store({
+ name: 'folder-mtime-maps',
+ defaults: {} as Record<string, MtimeMap>,
+ });
+ }
+ return mtimeStore;
+}
+
+async function getOutboxStore() {
+ if (!outboxStore) {
+ const { default: Store } = await import('electron-store');
+ outboxStore = new Store({
+ name: 'folder-sync-outbox',
+ defaults: {
+ [OUTBOX_STORE_KEY]: [] as FolderSyncFileChangedEvent[],
+ },
+ });
+ }
+ return outboxStore;
+}
+
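+// Outbox entries are keyed by "folderPath:relativePath", so repeated changes to the same file
+// collapse into a single pending event (the latest one wins).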
+function makeEventKey(event: Pick<FolderSyncFileChangedEvent, 'folderPath' | 'relativePath'>): string {
+ return `${event.folderPath}:${event.relativePath}`;
+}
+
+function persistOutbox() {
+ getOutboxStore().then((s) => {
+ s.set(OUTBOX_STORE_KEY, Array.from(outboxEvents.values()));
+ });
+}
+
+async function loadOutbox() {
+ if (outboxLoaded) return;
+ const s = await getOutboxStore();
+ const stored: FolderSyncFileChangedEvent[] = s.get(OUTBOX_STORE_KEY, []);
+ outboxEvents.clear();
+ for (const event of stored) {
+ if (!event?.id || !event.folderPath || !event.relativePath) continue;
+ outboxEvents.set(makeEventKey(event), event);
+ }
+ outboxLoaded = true;
+}
+
+function sendFileChangedEvent(
+ data: Omit<FolderSyncFileChangedEvent, 'id'>
+) {
+ const event: FolderSyncFileChangedEvent = {
+ id: randomUUID(),
+ ...data,
+ };
+
+ outboxEvents.set(makeEventKey(event), event);
+ persistOutbox();
+
+ if (rendererReady) {
+ sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, event);
+ }
+}
+
+function loadMtimeMap(folderPath: string): MtimeMap {
+ return mtimeMaps.get(folderPath) ?? {};
+}
+
+function persistMtimeMap(folderPath: string) {
+ const map = mtimeMaps.get(folderPath) ?? {};
+ getMtimeStore().then((s) => s.set(folderPath, map));
+}
+
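+// Recursively build a map of relative path -> mtimeMs for files under the watched root,
+// skipping entries whose name starts with '.' or matches an exclude pattern, and honoring
+// the optional fileExtensions filter.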
+function walkFolderMtimes(config: WatchedFolderConfig): MtimeMap {
+ const root = config.path;
+ const result: MtimeMap = {};
+ const excludes = new Set(config.excludePatterns);
+
+ function walk(dir: string) {
+ let entries: fs.Dirent[];
+ try {
+ entries = fs.readdirSync(dir, { withFileTypes: true });
+ } catch {
+ return;
+ }
+
+ for (const entry of entries) {
+ const name = entry.name;
+
+ if (name.startsWith('.') || excludes.has(name)) continue;
+
+ const full = path.join(dir, name);
+
+ if (entry.isDirectory()) {
+ walk(full);
+ } else if (entry.isFile()) {
+ if (
+ config.fileExtensions &&
+ config.fileExtensions.length > 0
+ ) {
+ const ext = path.extname(name).toLowerCase();
+ if (!config.fileExtensions.includes(ext)) continue;
+ }
+
+ try {
+ const stat = fs.statSync(full);
+ const rel = path.relative(root, full);
+ result[rel] = stat.mtimeMs;
+ } catch {
+ // File may have been removed between readdir and stat
+ }
+ }
+ }
+ }
+
+ walk(root);
+ return result;
+}
+
+function getMainWindow(): BrowserWindow | null {
+ const windows = BrowserWindow.getAllWindows();
+ return windows.length > 0 ? windows[0] : null;
+}
+
+function sendToRenderer(channel: string, data: any) {
+ const win = getMainWindow();
+ if (win && !win.isDestroyed()) {
+ win.webContents.send(channel, data);
+ }
+}
+
+async function startWatcher(config: WatchedFolderConfig) {
+ if (watchers.has(config.path)) {
+ return;
+ }
+
+ const ms = await getMtimeStore();
+ const storedMap: MtimeMap = ms.get(config.path) ?? {};
+ mtimeMaps.set(config.path, { ...storedMap });
+
+ const ignored = [
+ /(^|[/\\])\../, // dotfiles by default
+ ...config.excludePatterns.map((p) => `**/${p}/**`),
+ ];
+
+ const watcher = chokidar.watch(config.path, {
+ persistent: true,
+ ignoreInitial: true,
+ awaitWriteFinish: {
+ stabilityThreshold: 500,
+ pollInterval: 100,
+ },
+ ignored,
+ });
+
+ let ready = false;
+
+ watcher.on('ready', () => {
+ ready = true;
+
+ const currentMap = walkFolderMtimes(config);
+ const storedSnapshot = loadMtimeMap(config.path);
+ const now = Date.now();
+
+ // Track which files are unchanged so we can selectively update the mtime map
+ const unchangedMap: MtimeMap = {};
+
+ for (const [rel, currentMtime] of Object.entries(currentMap)) {
+ const storedMtime = storedSnapshot[rel];
+ if (storedMtime === undefined) {
+ sendFileChangedEvent({
+ rootFolderId: config.rootFolderId,
+ searchSpaceId: config.searchSpaceId,
+ folderPath: config.path,
+ folderName: config.name,
+ relativePath: rel,
+ fullPath: path.join(config.path, rel),
+ action: 'add',
+ timestamp: now,
+ });
+ } else if (Math.abs(currentMtime - storedMtime) >= MTIME_TOLERANCE_S * 1000) {
+ sendFileChangedEvent({
+ rootFolderId: config.rootFolderId,
+ searchSpaceId: config.searchSpaceId,
+ folderPath: config.path,
+ folderName: config.name,
+ relativePath: rel,
+ fullPath: path.join(config.path, rel),
+ action: 'change',
+ timestamp: now,
+ });
+ } else {
+ unchangedMap[rel] = currentMtime;
+ }
+ }
+
+ for (const rel of Object.keys(storedSnapshot)) {
+ if (!(rel in currentMap)) {
+ sendFileChangedEvent({
+ rootFolderId: config.rootFolderId,
+ searchSpaceId: config.searchSpaceId,
+ folderPath: config.path,
+ folderName: config.name,
+ relativePath: rel,
+ fullPath: path.join(config.path, rel),
+ action: 'unlink',
+ timestamp: now,
+ });
+ }
+ }
+
+ // Only carry forward mtimes for unchanged files; changed and new files are left out of the
+ // map so they are re-detected on the next startup if the app stops before indexing them.
+ mtimeMaps.set(config.path, unchangedMap);
+ persistMtimeMap(config.path);
+
+ sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_WATCHER_READY, {
+ rootFolderId: config.rootFolderId,
+ folderPath: config.path,
+ });
+ });
+
+ const handleFileEvent = (filePath: string, action: FolderSyncAction) => {
+ if (!ready) return;
+
+ const relativePath = path.relative(config.path, filePath);
+
+ if (
+ config.fileExtensions &&
+ config.fileExtensions.length > 0
+ ) {
+ const ext = path.extname(filePath).toLowerCase();
+ if (!config.fileExtensions.includes(ext)) return;
+ }
+
+ const map = mtimeMaps.get(config.path);
+ if (map) {
+ if (action === 'unlink') {
+ delete map[relativePath];
+ } else {
+ try {
+ map[relativePath] = fs.statSync(filePath).mtimeMs;
+ } catch {
+ // File may have been removed between event and stat
+ }
+ }
+ persistMtimeMap(config.path);
+ }
+
+ sendFileChangedEvent({
+ rootFolderId: config.rootFolderId,
+ searchSpaceId: config.searchSpaceId,
+ folderPath: config.path,
+ folderName: config.name,
+ relativePath,
+ fullPath: filePath,
+ action,
+ timestamp: Date.now(),
+ });
+ };
+
+ watcher.on('add', (fp) => handleFileEvent(fp, 'add'));
+ watcher.on('change', (fp) => handleFileEvent(fp, 'change'));
+ watcher.on('unlink', (fp) => handleFileEvent(fp, 'unlink'));
+
+ watchers.set(config.path, { config, watcher });
+}
+
+function stopWatcher(folderPath: string) {
+ persistMtimeMap(folderPath);
+ const entry = watchers.get(folderPath);
+ if (entry?.watcher) {
+ entry.watcher.close();
+ }
+ watchers.delete(folderPath);
+}
+
+export async function selectFolder(): Promise<string | null> {
+ const result = await dialog.showOpenDialog({
+ properties: ['openDirectory'],
+ title: 'Select a folder to watch',
+ });
+ if (result.canceled || result.filePaths.length === 0) {
+ return null;
+ }
+ return result.filePaths[0];
+}
+
+export async function addWatchedFolder(
+ config: WatchedFolderConfig
+): Promise<WatchedFolderConfig[]> {
+ const s = await getStore();
+ const folders: WatchedFolderConfig[] = s.get(STORE_KEY, []);
+
+ const existing = folders.findIndex((f: WatchedFolderConfig) => f.path === config.path);
+ if (existing >= 0) {
+ folders[existing] = config;
+ } else {
+ folders.push(config);
+ }
+
+ s.set(STORE_KEY, folders);
+
+ if (config.active) {
+ await startWatcher(config);
+ }
+
+ return folders;
+}
+
+export async function removeWatchedFolder(
+ folderPath: string
+): Promise<WatchedFolderConfig[]> {
+ const s = await getStore();
+ const folders: WatchedFolderConfig[] = s.get(STORE_KEY, []);
+ const updated = folders.filter((f: WatchedFolderConfig) => f.path !== folderPath);
+ s.set(STORE_KEY, updated);
+
+ stopWatcher(folderPath);
+
+ mtimeMaps.delete(folderPath);
+ const ms = await getMtimeStore();
+ ms.delete(folderPath);
+
+ return updated;
+}
+
+export async function getWatchedFolders(): Promise<WatchedFolderConfig[]> {
+ const s = await getStore();
+ return s.get(STORE_KEY, []);
+}
+
+export async function getWatcherStatus(): Promise<
+ { path: string; active: boolean; watching: boolean }[]
+> {
+ const s = await getStore();
+ const folders: WatchedFolderConfig[] = s.get(STORE_KEY, []);
+ return folders.map((f: WatchedFolderConfig) => ({
+ path: f.path,
+ active: f.active,
+ watching: watchers.has(f.path),
+ }));
+}
+
+export async function getPendingFileEvents(): Promise<FolderSyncFileChangedEvent[]> {
+ await loadOutbox();
+ return Array.from(outboxEvents.values()).sort((a, b) => a.timestamp - b.timestamp);
+}
+
+export async function acknowledgeFileEvents(eventIds: string[]): Promise<{ acknowledged: number }> {
+ if (!eventIds || eventIds.length === 0) return { acknowledged: 0 };
+ await loadOutbox();
+
+ const ackSet = new Set(eventIds);
+ let acknowledged = 0;
+
+ for (const [key, event] of outboxEvents.entries()) {
+ if (ackSet.has(event.id)) {
+ outboxEvents.delete(key);
+ acknowledged += 1;
+ }
+ }
+
+ if (acknowledged > 0) {
+ persistOutbox();
+ }
+
+ return { acknowledged };
+}
+
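+// Pausing closes each chokidar watcher but keeps its entry (and config) in memory,
+// so resumeWatcher can restart watching without re-reading the store.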
+export async function pauseWatcher(): Promise<void> {
+ for (const [, entry] of watchers) {
+ if (entry.watcher) {
+ await entry.watcher.close();
+ entry.watcher = null;
+ }
+ }
+}
+
+export async function resumeWatcher(): Promise<void> {
+ for (const [, entry] of watchers) {
+ if (!entry.watcher && entry.config.active) {
+ await startWatcher(entry.config);
+ }
+ }
+}
+
+export async function registerFolderWatcher(): Promise<void> {
+ await loadOutbox();
+ const s = await getStore();
+ const folders: WatchedFolderConfig[] = s.get(STORE_KEY, []);
+
+ for (const config of folders) {
+ if (config.active && fs.existsSync(config.path)) {
+ await startWatcher(config);
+ }
+ }
+}
+
+export async function unregisterFolderWatcher(): Promise<void> {
+ for (const [folderPath] of watchers) {
+ stopWatcher(folderPath);
+ }
+ watchers.clear();
+}
+
+export async function browseFiles(): Promise<string[] | null> {
+ const result = await dialog.showOpenDialog({
+ properties: ['openFile', 'multiSelections'],
+ title: 'Select files',
+ });
+ if (result.canceled || result.filePaths.length === 0) return null;
+ return result.filePaths;
+}
+
+const MIME_MAP: Record<string, string> = {
+ '.pdf': 'application/pdf',
+ '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+ '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+ '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+ '.html': 'text/html', '.htm': 'text/html',
+ '.csv': 'text/csv',
+ '.txt': 'text/plain',
+ '.md': 'text/markdown', '.markdown': 'text/markdown',
+ '.mp3': 'audio/mpeg', '.mpeg': 'audio/mpeg', '.mpga': 'audio/mpeg',
+ '.mp4': 'audio/mp4', '.m4a': 'audio/mp4',
+ '.wav': 'audio/wav',
+ '.webm': 'audio/webm',
+ '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
+ '.png': 'image/png',
+ '.bmp': 'image/bmp',
+ '.webp': 'image/webp',
+ '.tiff': 'image/tiff',
+ '.doc': 'application/msword',
+ '.rtf': 'application/rtf',
+ '.xml': 'application/xml',
+ '.epub': 'application/epub+zip',
+ '.xls': 'application/vnd.ms-excel',
+ '.ppt': 'application/vnd.ms-powerpoint',
+ '.eml': 'message/rfc822',
+ '.odt': 'application/vnd.oasis.opendocument.text',
+ '.msg': 'application/vnd.ms-outlook',
+};
+
+export interface LocalFileData {
+ name: string;
+ data: ArrayBuffer;
+ mimeType: string;
+ size: number;
+}
+
+export function readLocalFiles(filePaths: string[]): LocalFileData[] {
+ return filePaths.map((p) => {
+ const buf = fs.readFileSync(p);
+ const ext = path.extname(p).toLowerCase();
+ return {
+ name: path.basename(p),
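+ // Copy just this file's bytes into a standalone ArrayBuffer (the Buffer may be a view onto
+ // a larger backing store) so it serializes cleanly across the IPC boundary.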
+ data: buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength),
+ mimeType: MIME_MAP[ext] || 'application/octet-stream',
+ size: buf.byteLength,
+ };
+ });
+}
diff --git a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts
index 264ec25b3..6fbfd354a 100644
--- a/surfsense_desktop/src/preload.ts
+++ b/surfsense_desktop/src/preload.ts
@@ -21,4 +21,34 @@ contextBridge.exposeInMainWorld('electronAPI', {
setQuickAskMode: (mode: string) => ipcRenderer.invoke(IPC_CHANNELS.SET_QUICK_ASK_MODE, mode),
getQuickAskMode: () => ipcRenderer.invoke(IPC_CHANNELS.GET_QUICK_ASK_MODE),
replaceText: (text: string) => ipcRenderer.invoke(IPC_CHANNELS.REPLACE_TEXT, text),
+
+ // Folder sync
+ selectFolder: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_SELECT_FOLDER),
+ addWatchedFolder: (config: any) => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_ADD_FOLDER, config),
+ removeWatchedFolder: (folderPath: string) => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_REMOVE_FOLDER, folderPath),
+ getWatchedFolders: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_GET_FOLDERS),
+ getWatcherStatus: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_GET_STATUS),
+ onFileChanged: (callback: (data: any) => void) => {
+ const listener = (_event: unknown, data: any) => callback(data);
+ ipcRenderer.on(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, listener);
+ return () => {
+ ipcRenderer.removeListener(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, listener);
+ };
+ },
+ onWatcherReady: (callback: (data: any) => void) => {
+ const listener = (_event: unknown, data: any) => callback(data);
+ ipcRenderer.on(IPC_CHANNELS.FOLDER_SYNC_WATCHER_READY, listener);
+ return () => {
+ ipcRenderer.removeListener(IPC_CHANNELS.FOLDER_SYNC_WATCHER_READY, listener);
+ };
+ },
+ pauseWatcher: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_PAUSE),
+ resumeWatcher: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_RESUME),
+ signalRendererReady: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_RENDERER_READY),
+ getPendingFileEvents: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_GET_PENDING_EVENTS),
+ acknowledgeFileEvents: (eventIds: string[]) => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_ACK_EVENTS, eventIds),
+
+ // Browse files via native dialog
+ browseFiles: () => ipcRenderer.invoke(IPC_CHANNELS.BROWSE_FILES),
+ readLocalFiles: (paths: string[]) => ipcRenderer.invoke(IPC_CHANNELS.READ_LOCAL_FILES, paths),
});
diff --git a/surfsense_web/app/(home)/login/LocalLoginForm.tsx b/surfsense_web/app/(home)/login/LocalLoginForm.tsx
index ee3b47683..e94857334 100644
--- a/surfsense_web/app/(home)/login/LocalLoginForm.tsx
+++ b/surfsense_web/app/(home)/login/LocalLoginForm.tsx
@@ -160,10 +160,10 @@ export function LocalLoginForm() {
placeholder="you@example.com"
value={username}
onChange={(e) => setUsername(e.target.value)}
- className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-2 focus:ring-offset-2 bg-background text-foreground transition-all ${
+ className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${
error.title
- ? "border-destructive focus:border-destructive focus:ring-destructive"
- : "border-border focus:border-primary focus:ring-primary"
+ ? "border-destructive focus:border-destructive focus:ring-destructive/40"
+ : "border-border focus:border-primary focus:ring-primary/40"
}`}
disabled={isLoggingIn}
/>
@@ -181,10 +181,10 @@ export function LocalLoginForm() {
placeholder="Enter your password"
value={password}
onChange={(e) => setPassword(e.target.value)}
- className={`mt-1 block w-full rounded-md border pr-10 px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-2 focus:ring-offset-2 bg-background text-foreground transition-all ${
+ className={`mt-1 block w-full rounded-md border pr-10 px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${
error.title
- ? "border-destructive focus:border-destructive focus:ring-destructive"
- : "border-border focus:border-primary focus:ring-primary"
+ ? "border-destructive focus:border-destructive focus:ring-destructive/40"
+ : "border-border focus:border-primary focus:ring-primary/40"
}`}
disabled={isLoggingIn}
/>
diff --git a/surfsense_web/app/(home)/login/page.tsx b/surfsense_web/app/(home)/login/page.tsx
index 8b3be3805..09bf770d8 100644
--- a/surfsense_web/app/(home)/login/page.tsx
+++ b/surfsense_web/app/(home)/login/page.tsx
@@ -115,7 +115,7 @@ function LoginContent() {
-
+
{t("sign_in")}
diff --git a/surfsense_web/app/(home)/register/page.tsx b/surfsense_web/app/(home)/register/page.tsx
index b9200c68f..1ec179b35 100644
--- a/surfsense_web/app/(home)/register/page.tsx
+++ b/surfsense_web/app/(home)/register/page.tsx
@@ -160,7 +160,7 @@ export default function RegisterPage() {
-
+
{t("create_account")}
@@ -229,10 +229,7 @@ export default function RegisterPage() {
-
-
+
{t("password")}
setPassword(e.target.value)}
- className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-2 focus:ring-offset-2 dark:bg-gray-800 dark:text-white transition-all ${
+ className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${
error.title
- ? "border-red-300 focus:border-red-500 focus:ring-red-500 dark:border-red-700"
- : "border-gray-300 focus:border-blue-500 focus:ring-blue-500 dark:border-gray-700"
+ ? "border-destructive focus:border-destructive focus:ring-destructive/40"
+ : "border-border focus:border-primary focus:ring-primary/40"
}`}
disabled={isRegistering}
/>
@@ -277,7 +271,7 @@ export default function RegisterPage() {
{t("confirm_password")}
@@ -288,10 +282,10 @@ export default function RegisterPage() {
placeholder="Confirm your password"
value={confirmPassword}
onChange={(e) => setConfirmPassword(e.target.value)}
- className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-2 focus:ring-offset-2 dark:bg-gray-800 dark:text-white transition-all ${
+ className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${
error.title
- ? "border-red-300 focus:border-red-500 focus:ring-red-500 dark:border-red-700"
- : "border-gray-300 focus:border-blue-500 focus:ring-blue-500 dark:border-gray-700"
+ ? "border-destructive focus:border-destructive focus:ring-destructive/40"
+ : "border-border focus:border-primary focus:ring-primary/40"
}`}
disabled={isRegistering}
/>
@@ -300,7 +294,7 @@ export default function RegisterPage() {