From 8a8af948dba50c3ecbfcbe0ceae9a8adc0f9a5d9 Mon Sep 17 00:00:00 2001 From: JoeMakuta Date: Wed, 1 Apr 2026 13:07:27 +0200 Subject: [PATCH 001/202] feat: add experimental package import optimization configuration --- surfsense_web/next.config.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/surfsense_web/next.config.ts b/surfsense_web/next.config.ts index 263b3e6f6..5414d548d 100644 --- a/surfsense_web/next.config.ts +++ b/surfsense_web/next.config.ts @@ -24,6 +24,16 @@ const nextConfig: NextConfig = { }, ], }, + experimental: { + optimizePackageImports: [ + "lucide-react", + "@tabler/icons-react", + "date-fns", + "@assistant-ui/react", + "@assistant-ui/react-markdown", + "motion", + ], + }, // Turbopack config (used during `next dev --turbopack`) turbopack: { rules: { From 9d6d81871219906432e8b35260a8f5c3de6edbc6 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 1 Apr 2026 18:52:04 +0530 Subject: [PATCH 002/202] fix: add select-none class to TabBar --- surfsense_web/components/layout/ui/tabs/TabBar.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surfsense_web/components/layout/ui/tabs/TabBar.tsx b/surfsense_web/components/layout/ui/tabs/TabBar.tsx index 18e1ba141..8d0d986d3 100644 --- a/surfsense_web/components/layout/ui/tabs/TabBar.tsx +++ b/surfsense_web/components/layout/ui/tabs/TabBar.tsx @@ -72,7 +72,7 @@ export function TabBar({ onTabSwitch, onNewChat, rightActions, className }: TabB if (tabs.length <= 1) return null; return ( -
+
Date: Wed, 1 Apr 2026 15:41:11 +0200 Subject: [PATCH 003/202] refac: import of UI components for dropdown menu, separator, toggle group, toggle, tooltip, and checkbox --- surfsense_web/components/ui/checkbox.tsx | 2 +- surfsense_web/components/ui/dropdown-menu.tsx | 2 +- surfsense_web/components/ui/separator.tsx | 2 +- surfsense_web/components/ui/toggle-group.tsx | 2 +- surfsense_web/components/ui/toggle.tsx | 2 +- surfsense_web/components/ui/tooltip.tsx | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/surfsense_web/components/ui/checkbox.tsx b/surfsense_web/components/ui/checkbox.tsx index 0936a383e..586e3e602 100644 --- a/surfsense_web/components/ui/checkbox.tsx +++ b/surfsense_web/components/ui/checkbox.tsx @@ -1,7 +1,7 @@ "use client"; import { CheckIcon } from "lucide-react"; -import { Checkbox as CheckboxPrimitive } from "radix-ui"; +import * as CheckboxPrimitive from "@radix-ui/react-checkbox"; import type * as React from "react"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/dropdown-menu.tsx b/surfsense_web/components/ui/dropdown-menu.tsx index 24b99467e..2904b93dd 100644 --- a/surfsense_web/components/ui/dropdown-menu.tsx +++ b/surfsense_web/components/ui/dropdown-menu.tsx @@ -1,7 +1,7 @@ "use client"; import { CheckIcon, ChevronRightIcon, CircleIcon } from "lucide-react"; -import { DropdownMenu as DropdownMenuPrimitive } from "radix-ui"; +import * as DropdownMenuPrimitive from "@radix-ui/react-dropdown-menu"; import type * as React from "react"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/separator.tsx b/surfsense_web/components/ui/separator.tsx index 63b8e6a69..d7cf2cd81 100644 --- a/surfsense_web/components/ui/separator.tsx +++ b/surfsense_web/components/ui/separator.tsx @@ -1,6 +1,6 @@ "use client"; -import { Separator as SeparatorPrimitive } from "radix-ui"; +import * as SeparatorPrimitive from "@radix-ui/react-separator"; import type * as React from "react"; import { cn 
} from "@/lib/utils"; diff --git a/surfsense_web/components/ui/toggle-group.tsx b/surfsense_web/components/ui/toggle-group.tsx index eb212182a..33aa433b2 100644 --- a/surfsense_web/components/ui/toggle-group.tsx +++ b/surfsense_web/components/ui/toggle-group.tsx @@ -1,7 +1,7 @@ "use client"; import type { VariantProps } from "class-variance-authority"; -import { ToggleGroup as ToggleGroupPrimitive } from "radix-ui"; +import * as ToggleGroupPrimitive from "@radix-ui/react-toggle-group"; import * as React from "react"; import { toggleVariants } from "@/components/ui/toggle"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/toggle.tsx b/surfsense_web/components/ui/toggle.tsx index 5841cdc35..f0e68cec9 100644 --- a/surfsense_web/components/ui/toggle.tsx +++ b/surfsense_web/components/ui/toggle.tsx @@ -1,7 +1,7 @@ "use client"; import { cva, type VariantProps } from "class-variance-authority"; -import { Toggle as TogglePrimitive } from "radix-ui"; +import * as TogglePrimitive from "@radix-ui/react-toggle"; import type * as React from "react"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/tooltip.tsx b/surfsense_web/components/ui/tooltip.tsx index c19ce6f82..2fc85aae4 100644 --- a/surfsense_web/components/ui/tooltip.tsx +++ b/surfsense_web/components/ui/tooltip.tsx @@ -1,6 +1,6 @@ "use client"; -import { Tooltip as TooltipPrimitive } from "radix-ui"; +import * as TooltipPrimitive from "@radix-ui/react-tooltip"; import type * as React from "react"; import { cn } from "@/lib/utils"; From 0a65aa1a31cee5689f0f89ff00723deff5ac67f3 Mon Sep 17 00:00:00 2001 From: JoeMakuta Date: Wed, 1 Apr 2026 15:59:11 +0200 Subject: [PATCH 004/202] feat: dynamic import of PostHog --- surfsense_web/app/error.tsx | 8 +++++-- surfsense_web/lib/apis/base-api.service.ts | 26 ++++++++++++---------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/surfsense_web/app/error.tsx b/surfsense_web/app/error.tsx index 7bbd74e0e..3935f84d5 
100644 --- a/surfsense_web/app/error.tsx +++ b/surfsense_web/app/error.tsx @@ -1,6 +1,6 @@ "use client"; -import posthog from "posthog-js"; + import { useEffect } from "react"; export default function ErrorPage({ @@ -11,7 +11,11 @@ export default function ErrorPage({ reset: () => void; }) { useEffect(() => { - posthog.captureException(error); + import("posthog-js") + .then(({ default: posthog }) => { + posthog.captureException(error); + }) + .catch(() => {}); }, [error]); return ( diff --git a/surfsense_web/lib/apis/base-api.service.ts b/surfsense_web/lib/apis/base-api.service.ts index 4c3371233..bc9e6c1d8 100644 --- a/surfsense_web/lib/apis/base-api.service.ts +++ b/surfsense_web/lib/apis/base-api.service.ts @@ -1,4 +1,3 @@ -import posthog from "posthog-js"; import type { ZodType } from "zod"; import { getBearerToken, handleUnauthorized, refreshAccessToken } from "../auth-utils"; import { AppError, AuthenticationError, AuthorizationError, NotFoundError } from "../error"; @@ -234,18 +233,21 @@ class BaseApiService { } catch (error) { console.error("Request failed:", JSON.stringify(error)); if (!(error instanceof AuthenticationError)) { - try { - posthog.captureException(error, { - api_url: url, - api_method: options?.method ?? "GET", - ...(error instanceof AppError && { - status_code: error.status, - status_text: error.statusText, - }), + import("posthog-js") + .then(({ default: posthog }) => { + posthog.captureException(error, { + api_url: url, + api_method: options?.method ?? 
"GET", + ...(error instanceof AppError && { + status_code: error.status, + status_text: error.statusText, + }), + }); + }) + .catch(() => { + // PostHog is not available in the current environment + console.error("Failed to capture exception in PostHog"); }); - } catch { - // PostHog capture failed — don't block the error flow - } } throw error; } From a42d4d219b1194a8aaab5019c6500042307f3d49 Mon Sep 17 00:00:00 2001 From: Jun Hyeok Lee Date: Wed, 1 Apr 2026 23:03:52 +0900 Subject: [PATCH 005/202] fix: avoid mutating shared arrays in changelog and comments sync --- surfsense_web/app/(home)/changelog/page.tsx | 2 +- surfsense_web/hooks/use-comments-sync.ts | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/surfsense_web/app/(home)/changelog/page.tsx b/surfsense_web/app/(home)/changelog/page.tsx index 8d38cb687..c14218dab 100644 --- a/surfsense_web/app/(home)/changelog/page.tsx +++ b/surfsense_web/app/(home)/changelog/page.tsx @@ -29,7 +29,7 @@ interface ChangelogPageItem { export default async function ChangelogPage() { const allPages = source.getPages() as ChangelogPageItem[]; - const sortedChangelogs = allPages.sort((a, b) => { + const sortedChangelogs = allPages.toSorted((a, b) => { const dateA = new Date(a.data.date).getTime(); const dateB = new Date(b.data.date).getTime(); return dateB - dateA; diff --git a/surfsense_web/hooks/use-comments-sync.ts b/surfsense_web/hooks/use-comments-sync.ts index b6a68364d..f8037c656 100644 --- a/surfsense_web/hooks/use-comments-sync.ts +++ b/surfsense_web/hooks/use-comments-sync.ts @@ -118,8 +118,9 @@ function transformComments( for (const [messageId, group] of byMessage) { const comments: Comment[] = group.topLevel.map((raw) => { - const replies = (group.replies.get(raw.id) || []) - .sort((a, b) => new Date(a.created_at).getTime() - new Date(b.created_at).getTime()) + const replies = (group.replies.get(raw.id) ?? 
[]).toSorted( + (a, b) => new Date(a.created_at).getTime() - new Date(b.created_at).getTime() + ) .map((r) => transformReply(r, memberMap, currentUserId, isOwner)); return { From e6ec589e1251bcb4963bada86420daef9703dba6 Mon Sep 17 00:00:00 2001 From: JoeMakuta Date: Wed, 1 Apr 2026 16:10:27 +0200 Subject: [PATCH 006/202] feat: add dynamic import and dev only render of ReactQueryClientProvider --- surfsense_web/lib/query-client/query-client.provider.tsx | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/surfsense_web/lib/query-client/query-client.provider.tsx b/surfsense_web/lib/query-client/query-client.provider.tsx index 453b9f531..6dc2a4258 100644 --- a/surfsense_web/lib/query-client/query-client.provider.tsx +++ b/surfsense_web/lib/query-client/query-client.provider.tsx @@ -1,13 +1,18 @@ "use client"; -import { ReactQueryDevtools } from "@tanstack/react-query-devtools"; +import dynamic from "next/dynamic"; import { QueryClientAtomProvider } from "jotai-tanstack-query/react"; import { queryClient } from "./client"; +const ReactQueryDevtools = dynamic( + () => import("@tanstack/react-query-devtools").then((m) => ({ default: m.ReactQueryDevtools })), + { ssr: false } +); + export function ReactQueryClientProvider({ children }: { children: React.ReactNode }) { return ( {children} - + {process.env.NODE_ENV === "development" && } ); } From 663096a074494b954c9f82a31652cf7dfc83426b Mon Sep 17 00:00:00 2001 From: Jun Hyeok Lee Date: Wed, 1 Apr 2026 23:23:36 +0900 Subject: [PATCH 007/202] chore: format issue 1046 frontend changes --- surfsense_web/hooks/use-comments-sync.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/surfsense_web/hooks/use-comments-sync.ts b/surfsense_web/hooks/use-comments-sync.ts index f8037c656..b5ea6cce7 100644 --- a/surfsense_web/hooks/use-comments-sync.ts +++ b/surfsense_web/hooks/use-comments-sync.ts @@ -118,9 +118,8 @@ function transformComments( for (const [messageId, group] of byMessage) { 
const comments: Comment[] = group.topLevel.map((raw) => { - const replies = (group.replies.get(raw.id) ?? []).toSorted( - (a, b) => new Date(a.created_at).getTime() - new Date(b.created_at).getTime() - ) + const replies = (group.replies.get(raw.id) ?? []) + .toSorted((a, b) => new Date(a.created_at).getTime() - new Date(b.created_at).getTime()) .map((r) => transformReply(r, memberMap, currentUserId, isOwner)); return { From d7dd6db1b9e4c9dc1c9b75c0f53ae3920b7453de Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 1 Apr 2026 20:31:45 +0530 Subject: [PATCH 008/202] fix: improve document loading error handling and UI feedback for processing state --- surfsense_backend/app/routes/editor_routes.py | 11 ++++-- .../components/DocumentsTableShell.tsx | 5 +++ .../components/documents/DocumentNode.tsx | 4 +-- .../components/documents/FolderTreeView.tsx | 7 ++-- .../components/editor-panel/editor-panel.tsx | 22 +++++++++--- .../layout/ui/tabs/DocumentTabContent.tsx | 34 +++++++++++++++---- .../new-chat/source-detail-panel.tsx | 10 +++--- 7 files changed, 69 insertions(+), 24 deletions(-) diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py index f54f18def..0fcbc475d 100644 --- a/surfsense_backend/app/routes/editor_routes.py +++ b/surfsense_backend/app/routes/editor_routes.py @@ -127,9 +127,16 @@ async def get_editor_content( chunks = sorted(document.chunks, key=lambda c: c.id) if not chunks: + doc_status = document.status or {} + state = doc_status.get("state", "ready") if isinstance(doc_status, dict) else "ready" + if state in ("pending", "processing"): + raise HTTPException( + status_code=409, + detail="This document is still being processed. Please wait a moment and try again.", + ) raise HTTPException( status_code=400, - detail="This document has no content and cannot be edited. 
Please re-upload to enable editing.", + detail="This document has no viewable content yet. It may still be syncing. Try again in a few seconds, or re-upload if the issue persists.", ) markdown_content = "\n\n".join(chunk.content for chunk in chunks) @@ -137,7 +144,7 @@ async def get_editor_content( if not markdown_content.strip(): raise HTTPException( status_code=400, - detail="This document has empty content and cannot be edited.", + detail="This document appears to be empty. Try re-uploading or editing it to add content.", ) # Persist the lazy migration diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index 92ced6e47..0758307f7 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -748,6 +748,7 @@ export function DocumentsTableShell({ onClick={() => onOpenInTab ? 
onOpenInTab(doc) : handleViewDocument(doc) } + disabled={isBeingProcessed} > Open @@ -1020,6 +1021,10 @@ export function DocumentsTableShell({ e.stopPropagation()}> - onPreview(doc)}> + onPreview(doc)} disabled={isProcessing}> Open @@ -259,7 +259,7 @@ export const DocumentNode = React.memo(function DocumentNode({ {contextMenuOpen && ( e.stopPropagation()}> - onPreview(doc)}> + onPreview(doc)} disabled={isProcessing}> Open diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx index f63d5da5c..7695923e3 100644 --- a/surfsense_web/components/documents/FolderTreeView.tsx +++ b/surfsense_web/components/documents/FolderTreeView.tsx @@ -1,7 +1,7 @@ "use client"; import { useAtom } from "jotai"; -import { CirclePlus } from "lucide-react"; +import { Search } from "lucide-react"; import { useCallback, useMemo, useState } from "react"; import { DndProvider } from "react-dnd"; import { HTML5Backend } from "react-dnd-html5-backend"; @@ -250,8 +250,9 @@ export function FolderTreeView({ if (treeNodes.length === 0 && (activeTypes.length > 0 || searchQuery)) { return (
- -

No matching documents

+ +

No matching documents

+

Try a different search term

); } diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx index 3ea36f800..7496e6aec 100644 --- a/surfsense_web/components/editor-panel/editor-panel.tsx +++ b/surfsense_web/components/editor-panel/editor-panel.tsx @@ -1,7 +1,7 @@ "use client"; import { useAtomValue, useSetAtom } from "jotai"; -import { AlertCircle, XIcon } from "lucide-react"; +import { FileQuestionMark, RefreshCw, XIcon } from "lucide-react"; import dynamic from "next/dynamic"; import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; @@ -200,10 +200,22 @@ export function EditorPanelContent({ ) : error || !editorDoc ? (
- -
-

Failed to load document

-

{error || "An unknown error occurred"}

+ {error?.toLowerCase().includes("still being processed") ? ( +
+ +
+ ) : ( +
+ +
+ )} +
+

+ {error?.toLowerCase().includes("still being processed") + ? "Document is processing" + : "Document unavailable"} +

+

{error || "An unknown error occurred"}

) : isEditableType ? ( diff --git a/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx b/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx index ac279cd4d..849bdbea5 100644 --- a/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx +++ b/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx @@ -1,6 +1,6 @@ "use client"; -import { AlertCircle, Pencil } from "lucide-react"; +import { FileQuestionMark, PenLine, RefreshCw } from "lucide-react"; import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; import { PlateEditor } from "@/components/editor/plate-editor"; @@ -160,15 +160,35 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen if (isLoading) return ; if (error || !doc) { + const isProcessing = error?.toLowerCase().includes("still being processed"); return ( -
- -
-

Failed to load document

-

+

+
+ {isProcessing ? ( + + ) : ( + + )} +
+
+

+ {isProcessing ? "Document is processing" : "Document unavailable"} +

+

{error || "An unknown error occurred"}

+ {!isProcessing && ( + + )}
); } @@ -229,7 +249,7 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen onClick={() => setIsEditing(true)} className="gap-1.5" > - + Edit )} diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx index b02b2e217..9c1167efe 100644 --- a/surfsense_web/components/new-chat/source-detail-panel.tsx +++ b/surfsense_web/components/new-chat/source-detail-panel.tsx @@ -1,7 +1,7 @@ "use client"; import { useQuery } from "@tanstack/react-query"; -import { BookOpen, ChevronDown, ExternalLink, FileText, Hash, Sparkles, X } from "lucide-react"; +import { BookOpen, ChevronDown, ExternalLink, FileQuestionMark, FileText, Hash, Sparkles, X } from "lucide-react"; import { AnimatePresence, motion, useReducedMotion } from "motion/react"; import { useTranslations } from "next-intl"; import type React from "react"; @@ -392,12 +392,12 @@ export function SourceDetailPanel({ animate={{ opacity: 1, scale: 1 }} className="flex flex-col items-center gap-4 text-center px-6" > -
- +
+
-

- Failed to load document +

+ Document unavailable

{documentByChunkFetchingError.message || From 5c11a15fb6534c55e3e50e49caac4e4ee3703cec Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 1 Apr 2026 20:51:55 +0530 Subject: [PATCH 009/202] refactor: update UI components by removing unused imports and adjusting loading states --- .../components/CommunityPromptsContent.tsx | 2 +- .../components/PromptsContent.tsx | 6 ++-- .../components/editor-panel/editor-panel.tsx | 29 +++++++++---------- .../layout/ui/right-panel/RightPanel.tsx | 2 +- .../settings/image-model-manager.tsx | 3 +- .../settings/model-config-manager.tsx | 2 -- 6 files changed, 20 insertions(+), 24 deletions(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent.tsx index 4bcdcba7e..239832b2d 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent.tsx @@ -60,7 +60,7 @@ export function CommunityPromptsContent() { {list.length === 0 && (

- +

No community prompts yet

Share your own prompts from the My Prompts tab diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx index 522d71e59..39362d244 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx @@ -1,7 +1,8 @@ "use client"; import { useAtomValue } from "jotai"; -import { AlertTriangle, Globe, Lock, PenLine, Plus, Sparkles, Trash2 } from "lucide-react"; +import { AlertTriangle, Globe, Lock, PenLine, Sparkles, Trash2 } from "lucide-react"; +import { ShortcutKbd } from "@/components/ui/shortcut-kbd"; import { useCallback, useState } from "react"; import { toast } from "sonner"; import { @@ -145,7 +146,7 @@ export function PromptsContent() {

Create prompt templates triggered with{" "} - / in the + in the chat composer.

{!showForm && ( @@ -158,7 +159,6 @@ export function PromptsContent() { }} className="shrink-0 gap-1.5" > - New )} diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx index 7496e6aec..802a5ffc3 100644 --- a/surfsense_web/components/editor-panel/editor-panel.tsx +++ b/surfsense_web/components/editor-panel/editor-panel.tsx @@ -9,24 +9,9 @@ import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-pan import { MarkdownViewer } from "@/components/markdown-viewer"; import { Button } from "@/components/ui/button"; import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer"; -import { Skeleton } from "@/components/ui/skeleton"; import { useMediaQuery } from "@/hooks/use-media-query"; import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils"; -const PlateEditor = dynamic( - () => import("@/components/editor/plate-editor").then((m) => ({ default: m.PlateEditor })), - { ssr: false, loading: () => } -); - -interface EditorContent { - document_id: number; - title: string; - document_type?: string; - source_markdown: string; -} - -const EDITABLE_DOCUMENT_TYPES = new Set(["FILE", "NOTE"]); - function EditorPanelSkeleton() { return (
@@ -47,6 +32,20 @@ function EditorPanelSkeleton() { ); } +const PlateEditor = dynamic( + () => import("@/components/editor/plate-editor").then((m) => ({ default: m.PlateEditor })), + { ssr: false, loading: () => null } +); + +interface EditorContent { + document_id: number; + title: string; + document_type?: string; + source_markdown: string; +} + +const EDITABLE_DOCUMENT_TYPES = new Set(["FILE", "NOTE"]); + export function EditorPanelContent({ documentId, searchSpaceId, diff --git a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx index ac2f65065..717f5a459 100644 --- a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx +++ b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx @@ -19,7 +19,7 @@ const EditorPanelContent = dynamic( import("@/components/editor-panel/editor-panel").then((m) => ({ default: m.EditorPanelContent, })), - { ssr: false, loading: () => } + { ssr: false, loading: () => null } ); const HitlEditPanelContent = dynamic( diff --git a/surfsense_web/components/settings/image-model-manager.tsx b/surfsense_web/components/settings/image-model-manager.tsx index 8f08b7db3..0c45af7d4 100644 --- a/surfsense_web/components/settings/image-model-manager.tsx +++ b/surfsense_web/components/settings/image-model-manager.tsx @@ -1,7 +1,7 @@ "use client"; import { useAtomValue } from "jotai"; -import { AlertCircle, Edit3, Info, Plus, RefreshCw, Trash2, Wand2 } from "lucide-react"; +import { AlertCircle, Edit3, Info, RefreshCw, Trash2, Wand2 } from "lucide-react"; import { useMemo, useState } from "react"; import { deleteImageGenConfigMutationAtom } from "@/atoms/image-gen-config/image-gen-config-mutation.atoms"; import { @@ -257,7 +257,6 @@ export function ImageModelManager({ searchSpaceId }: ImageModelManagerProps) { size="lg" className="gap-2 text-xs md:text-sm h-9 md:h-10" > - Add First Image Model )} diff --git 
a/surfsense_web/components/settings/model-config-manager.tsx b/surfsense_web/components/settings/model-config-manager.tsx index 046288a96..50d2ab5b7 100644 --- a/surfsense_web/components/settings/model-config-manager.tsx +++ b/surfsense_web/components/settings/model-config-manager.tsx @@ -7,7 +7,6 @@ import { FileText, Info, MessageSquareQuote, - Plus, RefreshCw, Trash2, Wand2, @@ -270,7 +269,6 @@ export function ModelConfigManager({ searchSpaceId }: ModelConfigManagerProps) { size="lg" className="gap-2 text-xs md:text-sm h-9 md:h-10" > - Create First Configuration )} From 33e7aeef9d275a49ad0b7e74a46244b42d4ef547 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 1 Apr 2026 21:00:52 +0530 Subject: [PATCH 010/202] style: enhance input field focus styles and transition effects in login and registration forms --- .../app/(home)/login/LocalLoginForm.tsx | 20 +-- surfsense_web/app/(home)/register/page.tsx | 142 +++++++++--------- 2 files changed, 81 insertions(+), 81 deletions(-) diff --git a/surfsense_web/app/(home)/login/LocalLoginForm.tsx b/surfsense_web/app/(home)/login/LocalLoginForm.tsx index ee3b47683..1ebbf46b6 100644 --- a/surfsense_web/app/(home)/login/LocalLoginForm.tsx +++ b/surfsense_web/app/(home)/login/LocalLoginForm.tsx @@ -160,11 +160,11 @@ export function LocalLoginForm() { placeholder="you@example.com" value={username} onChange={(e) => setUsername(e.target.value)} - className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-2 focus:ring-offset-2 bg-background text-foreground transition-all ${ - error.title - ? "border-destructive focus:border-destructive focus:ring-destructive" - : "border-border focus:border-primary focus:ring-primary" - }`} + className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ + error.title + ? 
"border-destructive focus:border-destructive focus:ring-destructive/40" + : "border-border focus:border-primary focus:ring-primary/40" + }`} disabled={isLoggingIn} />
@@ -181,11 +181,11 @@ export function LocalLoginForm() { placeholder="Enter your password" value={password} onChange={(e) => setPassword(e.target.value)} - className={`mt-1 block w-full rounded-md border pr-10 px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-2 focus:ring-offset-2 bg-background text-foreground transition-all ${ - error.title - ? "border-destructive focus:border-destructive focus:ring-destructive" - : "border-border focus:border-primary focus:ring-primary" - }`} + className={`mt-1 block w-full rounded-md border pr-10 px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ + error.title + ? "border-destructive focus:border-destructive focus:ring-destructive/40" + : "border-border focus:border-primary focus:ring-primary/40" + }`} disabled={isLoggingIn} />
-
- - setPassword(e.target.value)} - className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-2 focus:ring-offset-2 dark:bg-gray-800 dark:text-white transition-all ${ - error.title - ? "border-red-300 focus:border-red-500 focus:ring-red-500 dark:border-red-700" - : "border-gray-300 focus:border-blue-500 focus:ring-blue-500 dark:border-gray-700" - }`} - disabled={isRegistering} - /> -
+
+ + setPassword(e.target.value)} + className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ + error.title + ? "border-destructive focus:border-destructive focus:ring-destructive/40" + : "border-border focus:border-primary focus:ring-primary/40" + }`} + disabled={isRegistering} + /> +
-
- - setConfirmPassword(e.target.value)} - className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-2 focus:ring-offset-2 dark:bg-gray-800 dark:text-white transition-all ${ - error.title - ? "border-red-300 focus:border-red-500 focus:ring-red-500 dark:border-red-700" - : "border-gray-300 focus:border-blue-500 focus:ring-blue-500 dark:border-gray-700" - }`} - disabled={isRegistering} - /> +
+ + setConfirmPassword(e.target.value)} + className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ + error.title + ? "border-destructive focus:border-destructive focus:ring-destructive/40" + : "border-border focus:border-primary focus:ring-primary/40" + }`} + disabled={isRegistering} + />
{group.tools.map((tool) => { - const isDisabled = disabledTools.includes(tool.name); + const isDisabled = disabledToolsSet.has(tool.name); const ToolIcon = getToolIcon(tool.name); return (
= ({ isBlockedByOtherUser = false const iconKey = group.connectorIcon ?? ""; const iconInfo = CONNECTOR_TOOL_ICON_PATHS[iconKey]; const toolNames = group.tools.map((t) => t.name); - const allDisabled = toolNames.every((n) => disabledTools.includes(n)); + const allDisabled = toolNames.every((n) => disabledToolsSet.has(n)); return (
= ({ isBlockedByOtherUser = false {group.label}
{group.tools.map((tool) => { - const isDisabled = disabledTools.includes(tool.name); + const isDisabled = disabledToolsSet.has(tool.name); const ToolIcon = getToolIcon(tool.name); const row = (
@@ -1115,7 +1116,7 @@ const ComposerAction: FC = ({ isBlockedByOtherUser = false const iconKey = group.connectorIcon ?? ""; const iconInfo = CONNECTOR_TOOL_ICON_PATHS[iconKey]; const toolNames = group.tools.map((t) => t.name); - const allDisabled = toolNames.every((n) => disabledTools.includes(n)); + const allDisabled = toolNames.every((n) => disabledToolsSet.has(n)); const groupDef = TOOL_GROUPS.find((g) => g.label === group.label); const row = (
From 2df9708df9e5b3c9fafca48144a43b96ffe160b8 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 09:16:07 +0530 Subject: [PATCH 018/202] style: update alert components and empty state UI --- .../assistant-ui/connector-popup.tsx | 6 ++-- .../assistant-ui/document-upload-popup.tsx | 6 ++-- .../public-chat-snapshots-empty-state.tsx | 7 ++--- .../settings/image-model-manager.tsx | 18 ++---------- .../settings/model-config-manager.tsx | 28 +++++-------------- 5 files changed, 18 insertions(+), 47 deletions(-) diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index d1f6dd31f..b31b7cbd1 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -376,14 +376,14 @@ export const ConnectorIndicator = forwardRef {/* LLM Configuration Warning */} {!llmConfigLoading && !hasDocumentSummaryLLM && ( - + LLM Configuration Required

{isAutoMode && !hasGlobalConfigs - ? "Auto mode is selected but no global LLM configurations are available. Please configure a custom LLM in Settings to process and summarize documents from your connected sources." - : "You need to configure a Document Summary LLM before adding connectors. This LLM is used to process and summarize documents from your connected sources."} + ? "Auto mode requires a global LLM configuration. Please add one in Settings" + : "A Document Summary LLM is required to process uploads, configure one in Settings"}

); } diff --git a/surfsense_web/components/settings/image-model-manager.tsx b/surfsense_web/components/settings/image-model-manager.tsx index 0c45af7d4..55128dc59 100644 --- a/surfsense_web/components/settings/image-model-manager.tsx +++ b/surfsense_web/components/settings/image-model-manager.tsx @@ -240,26 +240,14 @@ export function ImageModelManager({ searchSpaceId }: ImageModelManagerProps) { {!isLoading && (
{(userConfigs?.length ?? 0) === 0 ? ( - + -
- -
-

No Image Models Yet

-

+

No Image Models Yet

+

{canCreate ? "Add your own image generation model (DALL-E 3, GPT Image 1, etc.)" : "No image models have been added to this space yet. Contact a space owner to add one."}

- {canCreate && ( - - )}
) : ( diff --git a/surfsense_web/components/settings/model-config-manager.tsx b/surfsense_web/components/settings/model-config-manager.tsx index 50d2ab5b7..837060c70 100644 --- a/surfsense_web/components/settings/model-config-manager.tsx +++ b/surfsense_web/components/settings/model-config-manager.tsx @@ -250,28 +250,14 @@ export function ModelConfigManager({ searchSpaceId }: ModelConfigManagerProps) {
{configs?.length === 0 ? (
- + -
- -
-
-

No Configurations Yet

-

- {canCreate - ? "Create your first AI configuration to customize how your agent responds" - : "No AI configurations have been added to this space yet. Contact a space owner to add one."} -

-
- {canCreate && ( - - )} +

No LLM Models Yet

+

+ {canCreate + ? "Add your first LLM model to power document summarization, chat, and other agent capabilities" + : "No LLM models have been added to this space yet. Contact a space owner to add one"} +

From 407175ffae46a23d775bcba9c35fdfb465766ba0 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 09:26:16 +0530 Subject: [PATCH 019/202] style: replace loading indicators with Spinner component in alert dialogs --- .../layout/providers/LayoutDataProvider.tsx | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx index fd6b45c52..6138b67fb 100644 --- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx +++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx @@ -775,7 +775,8 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid {t("delete_chat")} - {t("delete_chat_confirm")} {chatToDelete?.name}?{" "} + {t("delete_chat_confirm")}{" "} + {chatToDelete?.name}?{" "} {t("action_cannot_undone")} @@ -835,9 +836,7 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid {tSidebar("rename") || "Rename"} - {isRenamingChat && ( - - )} + {isRenamingChat && } @@ -865,9 +864,7 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid className="relative bg-destructive text-destructive-foreground hover:bg-destructive/90" > {tCommon("delete")} - {isDeletingSearchSpace && ( - - )} + {isDeletingSearchSpace && } @@ -895,9 +892,7 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid className="relative bg-destructive text-destructive-foreground hover:bg-destructive/90" > {t("leave")} - {isLeavingSearchSpace && ( - - )} + {isLeavingSearchSpace && } From d2cf3fb3b7eafce01c881e80049346239c84bc6f Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 09:58:14 +0530 Subject: [PATCH 020/202] refactor: update LLM role management logic and enhance UI feedback --- 
.../new-llm-config-mutation.atoms.ts | 9 +- .../components/settings/llm-role-manager.tsx | 165 +++++------------- 2 files changed, 52 insertions(+), 122 deletions(-) diff --git a/surfsense_web/atoms/new-llm-config/new-llm-config-mutation.atoms.ts b/surfsense_web/atoms/new-llm-config/new-llm-config-mutation.atoms.ts index 861606f80..d6d3aa820 100644 --- a/surfsense_web/atoms/new-llm-config/new-llm-config-mutation.atoms.ts +++ b/surfsense_web/atoms/new-llm-config/new-llm-config-mutation.atoms.ts @@ -109,10 +109,11 @@ export const updateLLMPreferencesMutationAtom = atomWithMutation((get) => { mutationFn: async (request: UpdateLLMPreferencesRequest) => { return newLLMConfigApiService.updateLLMPreferences(request); }, - onSuccess: () => { - queryClient.invalidateQueries({ - queryKey: cacheKeys.newLLMConfigs.preferences(Number(searchSpaceId)), - }); + onSuccess: (_data, request: UpdateLLMPreferencesRequest) => { + queryClient.setQueryData( + cacheKeys.newLLMConfigs.preferences(Number(searchSpaceId)), + (old: Record | undefined) => ({ ...old, ...request.data }) + ); }, onError: (error: Error) => { toast.error(error.message || "Failed to update LLM preferences"); diff --git a/surfsense_web/components/settings/llm-role-manager.tsx b/surfsense_web/components/settings/llm-role-manager.tsx index 07ec492a3..22e17e431 100644 --- a/surfsense_web/components/settings/llm-role-manager.tsx +++ b/surfsense_web/components/settings/llm-role-manager.tsx @@ -4,16 +4,14 @@ import { useAtomValue } from "jotai"; import { AlertCircle, Bot, - CheckCircle, + CircleCheck, CircleDashed, FileText, ImageIcon, RefreshCw, - RotateCcw, - Save, Shuffle, } from "lucide-react"; -import { useEffect, useState } from "react"; +import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; import { globalImageGenConfigsAtom, @@ -40,6 +38,7 @@ import { SelectValue, } from "@/components/ui/select"; import { Skeleton } from "@/components/ui/skeleton"; +import { Spinner } from 
"@/components/ui/spinner"; import { getProviderIcon } from "@/lib/provider-icons"; import { cn } from "@/lib/utils"; @@ -48,8 +47,8 @@ const ROLE_DESCRIPTIONS = { icon: Bot, title: "Agent LLM", description: "Primary LLM for chat interactions and agent operations", - color: "text-blue-600 dark:text-blue-400", - bgColor: "bg-blue-500/10", + color: "text-muted-foreground", + bgColor: "bg-muted", prefKey: "agent_llm_id" as const, configType: "llm" as const, }, @@ -57,8 +56,8 @@ const ROLE_DESCRIPTIONS = { icon: FileText, title: "Document Summary LLM", description: "Handles document summarization and research synthesis", - color: "text-purple-600 dark:text-purple-400", - bgColor: "bg-purple-500/10", + color: "text-muted-foreground", + bgColor: "bg-muted", prefKey: "document_summary_llm_id" as const, configType: "llm" as const, }, @@ -66,8 +65,8 @@ const ROLE_DESCRIPTIONS = { icon: ImageIcon, title: "Image Generation Model", description: "Model used for AI image generation (DALL-E, GPT Image, etc.)", - color: "text-teal-600 dark:text-teal-400", - bgColor: "bg-teal-500/10", + color: "text-muted-foreground", + bgColor: "bg-muted", prefKey: "image_generation_config_id" as const, configType: "image" as const, }, @@ -118,88 +117,41 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { image_generation_config_id: preferences.image_generation_config_id ?? "", })); - const [hasChanges, setHasChanges] = useState(false); - const [isSaving, setIsSaving] = useState(false); + const [savingRole, setSavingRole] = useState(null); + const savingRef = useRef(false); useEffect(() => { - const newAssignments = { - agent_llm_id: preferences.agent_llm_id ?? "", - document_summary_llm_id: preferences.document_summary_llm_id ?? "", - image_generation_config_id: preferences.image_generation_config_id ?? "", - }; - setAssignments(newAssignments); - setHasChanges(false); + if (!savingRef.current) { + setAssignments({ + agent_llm_id: preferences.agent_llm_id ?? 
"", + document_summary_llm_id: preferences.document_summary_llm_id ?? "", + image_generation_config_id: preferences.image_generation_config_id ?? "", + }); + } }, [ preferences?.agent_llm_id, preferences?.document_summary_llm_id, preferences?.image_generation_config_id, ]); - const handleRoleAssignment = (prefKey: string, configId: string) => { - const newAssignments = { - ...assignments, - [prefKey]: configId === "unassigned" ? "" : parseInt(configId), - }; + const handleRoleAssignment = useCallback(async (prefKey: string, configId: string) => { + const value = configId === "unassigned" ? "" : parseInt(configId); - setAssignments(newAssignments); + setAssignments((prev) => ({ ...prev, [prefKey]: value })); + setSavingRole(prefKey); + savingRef.current = true; - const currentPrefs = { - agent_llm_id: preferences.agent_llm_id ?? "", - document_summary_llm_id: preferences.document_summary_llm_id ?? "", - image_generation_config_id: preferences.image_generation_config_id ?? "", - }; - - const hasChangesNow = Object.keys(newAssignments).some( - (key) => - newAssignments[key as keyof typeof newAssignments] !== - currentPrefs[key as keyof typeof currentPrefs] - ); - - setHasChanges(hasChangesNow); - }; - - const handleSave = async () => { - setIsSaving(true); - - const toNumericOrUndefined = (val: string | number) => - typeof val === "string" ? (val ? parseInt(val) : undefined) : val; - - const numericAssignments = { - agent_llm_id: toNumericOrUndefined(assignments.agent_llm_id), - document_summary_llm_id: toNumericOrUndefined(assignments.document_summary_llm_id), - image_generation_config_id: toNumericOrUndefined(assignments.image_generation_config_id), - }; - - await updatePreferences({ - search_space_id: searchSpaceId, - data: numericAssignments, - }); - - setHasChanges(false); - toast.success("Role assignments saved successfully!"); - - setIsSaving(false); - }; - - const handleReset = () => { - setAssignments({ - agent_llm_id: preferences.agent_llm_id ?? 
"", - document_summary_llm_id: preferences.document_summary_llm_id ?? "", - image_generation_config_id: preferences.image_generation_config_id ?? "", - }); - setHasChanges(false); - }; - - const isAssignmentComplete = - assignments.agent_llm_id !== "" && - assignments.agent_llm_id !== null && - assignments.agent_llm_id !== undefined && - assignments.document_summary_llm_id !== "" && - assignments.document_summary_llm_id !== null && - assignments.document_summary_llm_id !== undefined && - assignments.image_generation_config_id !== "" && - assignments.image_generation_config_id !== null && - assignments.image_generation_config_id !== undefined; + try { + await updatePreferences({ + search_space_id: searchSpaceId, + data: { [prefKey]: value || undefined }, + }); + toast.success("Role assignment updated"); + } finally { + setSavingRole(null); + savingRef.current = false; + } + }, [updatePreferences, searchSpaceId]); // Combine global and custom LLM configs const allLLMConfigs = [ @@ -213,6 +165,11 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { ...(userImageConfigs ?? []).filter((config) => config.id && config.id.toString().trim() !== ""), ]; + const isAssignmentComplete = + allLLMConfigs.some((c) => c.id === assignments.agent_llm_id) && + allLLMConfigs.some((c) => c.id === assignments.document_summary_llm_id) && + allImageConfigs.some((c) => c.id === assignments.image_generation_config_id); + const isLoading = configsLoading || preferencesLoading || @@ -244,9 +201,9 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { {isAssignmentComplete && !isLoading && !hasError && ( - + All roles assigned )} @@ -332,10 +289,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { const roleAllConfigs = isImageRole ? 
allImageConfigs : allLLMConfigs; const assignedConfig = roleAllConfigs.find((config) => config.id === currentAssignment); - const isAssigned = - currentAssignment !== "" && - currentAssignment !== null && - currentAssignment !== undefined; + const isAssigned = !!assignedConfig; const isAutoMode = assignedConfig && "is_auto_mode" in assignedConfig && assignedConfig.is_auto_mode; @@ -361,8 +315,10 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {

- {isAssigned ? ( - + {savingRole === role.prefKey ? ( + + ) : isAssigned ? ( + ) : ( )} @@ -374,7 +330,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { Configuration + + + + )} + /> + + ( + + Folder Path +
+ + + + {isElectron && ( + + )} +
+ + The absolute path to the folder to watch and sync. + + +
+ )} + /> + + ( + + Display Name + + + + + A friendly name shown in the documents sidebar. + + + + )} + /> + + ( + + Exclude Patterns + + + + + Comma-separated patterns of directories/files to exclude. + + + + )} + /> + + ( + + File Extensions (optional) + + + + + Leave empty to index all supported files, or specify comma-separated extensions. + + + + )} + /> + + + +
+ + {getConnectorBenefits(EnumConnectorName.LOCAL_FOLDER_CONNECTOR) && ( +
+

+ What you get with Local Folder sync: +

+
    + {getConnectorBenefits(EnumConnectorName.LOCAL_FOLDER_CONNECTOR)?.map( + (benefit) =>
  • {benefit}
  • + )} +
+
+ )} +
+ ); +}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts index 0dc093100..40c6a7fdd 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts @@ -111,6 +111,14 @@ export function getConnectorBenefits(connectorType: string): string[] | null { "Incremental sync - only changed files are re-indexed", "Full support for your vault's folder structure", ], + LOCAL_FOLDER_CONNECTOR: [ + "Watch local folders for real-time changes via the desktop app", + "Automatic change detection — only modified files are re-indexed", + "Version history with up to 20 snapshots per document", + "Mirrors your folder structure in the SurfSense sidebar", + "Supports any text-based file format", + "Works as a periodic sync fallback when the desktop app is not running", + ], }; return benefits[connectorType] || null; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/index.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/index.tsx index 37d4ad5d8..116893399 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/index.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/index.tsx @@ -7,6 +7,7 @@ import { GithubConnectForm } from "./components/github-connect-form"; import { LinkupApiConnectForm } from "./components/linkup-api-connect-form"; import { LumaConnectForm } from "./components/luma-connect-form"; import { MCPConnectForm } from "./components/mcp-connect-form"; +import { LocalFolderConnectForm } from "./components/local-folder-connect-form"; import { ObsidianConnectForm } from "./components/obsidian-connect-form"; import { TavilyApiConnectForm } from "./components/tavily-api-connect-form"; @@ -58,7 +59,8 
@@ export function getConnectFormComponent(connectorType: string): ConnectFormCompo return MCPConnectForm; case "OBSIDIAN_CONNECTOR": return ObsidianConnectForm; - // Add other connector types here as needed + case "LOCAL_FOLDER_CONNECTOR": + return LocalFolderConnectForm; default: return null; } diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/local-folder-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/local-folder-config.tsx new file mode 100644 index 000000000..cb4295079 --- /dev/null +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/local-folder-config.tsx @@ -0,0 +1,163 @@ +"use client"; + +import type { FC } from "react"; +import { useState } from "react"; +import { FolderSync } from "lucide-react"; +import { Button } from "@/components/ui/button"; +import { Input } from "@/components/ui/input"; +import { Label } from "@/components/ui/label"; +import type { ConnectorConfigProps } from "../index"; + +export const LocalFolderConfig: FC = ({ + connector, + onConfigChange, + onNameChange, +}) => { + const isElectron = typeof window !== "undefined" && !!window.electronAPI; + + const [folderPath, setFolderPath] = useState( + (connector.config?.folder_path as string) || "" + ); + const [folderName, setFolderName] = useState( + (connector.config?.folder_name as string) || "" + ); + const [excludePatterns, setExcludePatterns] = useState(() => { + const patterns = connector.config?.exclude_patterns; + if (Array.isArray(patterns)) { + return patterns.join(", "); + } + return (patterns as string) || "node_modules, .git, .DS_Store"; + }); + const [fileExtensions, setFileExtensions] = useState(() => { + const exts = connector.config?.file_extensions; + if (Array.isArray(exts)) { + return exts.join(", "); + } + return (exts as string) || ""; + }); + const [name, setName] = useState(connector.name || ""); + + const handleFolderPathChange = 
(value: string) => { + setFolderPath(value); + onConfigChange?.({ ...connector.config, folder_path: value }); + }; + + const handleFolderNameChange = (value: string) => { + setFolderName(value); + onConfigChange?.({ ...connector.config, folder_name: value }); + }; + + const handleExcludePatternsChange = (value: string) => { + setExcludePatterns(value); + const arr = value + .split(",") + .map((p) => p.trim()) + .filter(Boolean); + onConfigChange?.({ ...connector.config, exclude_patterns: arr }); + }; + + const handleFileExtensionsChange = (value: string) => { + setFileExtensions(value); + const arr = value + ? value + .split(",") + .map((e) => { + const ext = e.trim(); + return ext.startsWith(".") ? ext : `.${ext}`; + }) + .filter(Boolean) + : null; + onConfigChange?.({ ...connector.config, file_extensions: arr }); + }; + + const handleNameChange = (value: string) => { + setName(value); + onNameChange?.(value); + }; + + const handleBrowse = async () => { + if (!isElectron) return; + const selected = await window.electronAPI!.selectFolder(); + if (selected) { + handleFolderPathChange(selected); + const autoName = selected.split(/[\\/]/).pop() || "folder"; + if (!folderName) handleFolderNameChange(autoName); + } + }; + + return ( +
+
+
+ + handleNameChange(e.target.value)} + placeholder="Local Folder" + className="border-slate-400/20 focus-visible:border-slate-400/40" + /> +
+
+ +
+

Folder Configuration

+ +
+
+ +
+ handleFolderPathChange(e.target.value)} + placeholder="/path/to/your/folder" + className="border-slate-400/20 focus-visible:border-slate-400/40 font-mono flex-1" + /> + {isElectron && ( + + )} +
+
+ +
+ + handleFolderNameChange(e.target.value)} + placeholder="My Notes" + className="border-slate-400/20 focus-visible:border-slate-400/40" + /> +
+ +
+ + handleExcludePatternsChange(e.target.value)} + placeholder="node_modules, .git, .DS_Store" + className="border-slate-400/20 focus-visible:border-slate-400/40 font-mono" + /> +

+ Comma-separated patterns of directories/files to exclude. +

+
+ +
+ + handleFileExtensionsChange(e.target.value)} + placeholder=".md, .txt, .rst" + className="border-slate-400/20 focus-visible:border-slate-400/40 font-mono" + /> +

+ Leave empty to index all supported files. +

+
+
+
+
+ ); +}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx index a63435260..3dc1891c8 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx @@ -19,6 +19,7 @@ import { JiraConfig } from "./components/jira-config"; import { LinkupApiConfig } from "./components/linkup-api-config"; import { LumaConfig } from "./components/luma-config"; import { MCPConfig } from "./components/mcp-config"; +import { LocalFolderConfig } from "./components/local-folder-config"; import { ObsidianConfig } from "./components/obsidian-config"; import { OneDriveConfig } from "./components/onedrive-config"; import { SlackConfig } from "./components/slack-config"; @@ -82,6 +83,8 @@ export function getConnectorConfigComponent( return MCPConfig; case "OBSIDIAN_CONNECTOR": return ObsidianConfig; + case "LOCAL_FOLDER_CONNECTOR": + return LocalFolderConfig; case "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": return ComposioDriveConfig; case "COMPOSIO_GMAIL_CONNECTOR": diff --git a/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts b/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts index f924bb15f..dd5978002 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts @@ -29,6 +29,7 @@ export const CONNECTOR_TO_DOCUMENT_TYPE: Record = { BOOKSTACK_CONNECTOR: "BOOKSTACK_CONNECTOR", CIRCLEBACK_CONNECTOR: "CIRCLEBACK", OBSIDIAN_CONNECTOR: "OBSIDIAN_CONNECTOR", + LOCAL_FOLDER_CONNECTOR: "LOCAL_FOLDER_FILE", // Special mappings (connector type differs from document type) GOOGLE_DRIVE_CONNECTOR: "GOOGLE_DRIVE_FILE", diff --git 
a/surfsense_web/components/documents/version-history.tsx b/surfsense_web/components/documents/version-history.tsx new file mode 100644 index 000000000..29740e079 --- /dev/null +++ b/surfsense_web/components/documents/version-history.tsx @@ -0,0 +1,185 @@ +"use client"; + +import { useCallback, useEffect, useState } from "react"; +import { Clock, RotateCcw } from "lucide-react"; +import { Button } from "@/components/ui/button"; +import { + Sheet, + SheetContent, + SheetHeader, + SheetTitle, + SheetTrigger, +} from "@/components/ui/sheet"; +import { Spinner } from "@/components/ui/spinner"; +import { documentsApiService } from "@/lib/apis/documents-api.service"; +import { toast } from "sonner"; + +interface DocumentVersionSummary { + version_number: number; + title: string; + content_hash: string; + created_at: string | null; +} + +interface VersionHistoryProps { + documentId: number; + documentType: string; +} + +export function VersionHistoryButton({ documentId, documentType }: VersionHistoryProps) { + const showVersionHistory = documentType === "LOCAL_FOLDER_FILE" || documentType === "OBSIDIAN_CONNECTOR"; + if (!showVersionHistory) return null; + + return ( + + + + + + + Version History + + + + + ); +} + +function VersionHistoryPanel({ documentId }: { documentId: number }) { + const [versions, setVersions] = useState([]); + const [loading, setLoading] = useState(true); + const [selectedVersion, setSelectedVersion] = useState(null); + const [versionContent, setVersionContent] = useState(""); + const [contentLoading, setContentLoading] = useState(false); + const [restoring, setRestoring] = useState(false); + + const loadVersions = useCallback(async () => { + setLoading(true); + try { + const data = await documentsApiService.listDocumentVersions(documentId); + setVersions(data as DocumentVersionSummary[]); + } catch { + toast.error("Failed to load version history"); + } finally { + setLoading(false); + } + }, [documentId]); + + useEffect(() => { + loadVersions(); + 
}, [loadVersions]); + + const handleSelectVersion = async (versionNumber: number) => { + setSelectedVersion(versionNumber); + setContentLoading(true); + try { + const data = (await documentsApiService.getDocumentVersion( + documentId, + versionNumber + )) as { source_markdown: string }; + setVersionContent(data.source_markdown || ""); + } catch { + toast.error("Failed to load version content"); + } finally { + setContentLoading(false); + } + }; + + const handleRestore = async (versionNumber: number) => { + setRestoring(true); + try { + await documentsApiService.restoreDocumentVersion(documentId, versionNumber); + toast.success(`Restored version ${versionNumber}`); + await loadVersions(); + } catch { + toast.error("Failed to restore version"); + } finally { + setRestoring(false); + } + }; + + if (loading) { + return ( +
+ +
+ ); + } + + if (versions.length === 0) { + return ( +
+ +

No version history available yet.

+

Versions are created when file content changes.

+
+ ); + } + + return ( +
+
+ {versions.map((v) => ( +
handleSelectVersion(v.version_number)} + > +
+
+

Version {v.version_number}

+ {v.created_at && ( +

+ {new Date(v.created_at).toLocaleString()} +

+ )} + {v.title && ( +

+ {v.title} +

+ )} +
+ +
+
+ ))} +
+ + {selectedVersion !== null && ( +
+

+ Preview — Version {selectedVersion} +

+ {contentLoading ? ( +
+ +
+ ) : ( +
+							{versionContent || "(empty)"}
+						
+ )} +
+ )} +
+ ); +} diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts index 1a3326bae..c663d6115 100644 --- a/surfsense_web/contracts/types/document.types.ts +++ b/surfsense_web/contracts/types/document.types.ts @@ -26,6 +26,7 @@ export const documentTypeEnum = z.enum([ "BOOKSTACK_CONNECTOR", "CIRCLEBACK", "OBSIDIAN_CONNECTOR", + "LOCAL_FOLDER_FILE", "SURFSENSE_DOCS", "NOTE", "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", diff --git a/surfsense_web/hooks/use-folder-sync.ts b/surfsense_web/hooks/use-folder-sync.ts new file mode 100644 index 000000000..a35faf98f --- /dev/null +++ b/surfsense_web/hooks/use-folder-sync.ts @@ -0,0 +1,41 @@ +"use client"; + +import { useEffect, useRef } from "react"; +import { connectorsApiService } from "@/lib/apis/connectors-api.service"; + +const DEBOUNCE_MS = 2000; + +export function useFolderSync() { + const pendingRef = useRef>>(new Map()); + + useEffect(() => { + const api = typeof window !== "undefined" ? window.electronAPI : null; + if (!api?.onFileChanged) return; + + const cleanup = api.onFileChanged((event) => { + const key = `${event.connectorId}:${event.fullPath}`; + + const existing = pendingRef.current.get(key); + if (existing) clearTimeout(existing); + + const timeout = setTimeout(async () => { + pendingRef.current.delete(key); + try { + await connectorsApiService.indexFile(event.connectorId, event.fullPath); + } catch (err) { + console.error("[FolderSync] Failed to trigger re-index:", err); + } + }, DEBOUNCE_MS); + + pendingRef.current.set(key, timeout); + }); + + return () => { + cleanup(); + for (const timeout of pendingRef.current.values()) { + clearTimeout(timeout); + } + pendingRef.current.clear(); + }; + }, []); +} diff --git a/surfsense_web/lib/apis/connectors-api.service.ts b/surfsense_web/lib/apis/connectors-api.service.ts index abd16c7a7..f2722df70 100644 --- a/surfsense_web/lib/apis/connectors-api.service.ts +++ b/surfsense_web/lib/apis/connectors-api.service.ts @@ -404,6 
+404,18 @@ class ConnectorsApiService { listDiscordChannelsResponse ); }; + + // ============================================================================= + // Local Folder Connector Methods + // ============================================================================= + + indexFile = async (connectorId: number, filePath: string) => { + return baseApiService.post( + `/api/v1/search-source-connectors/${connectorId}/index-file`, + undefined, + { body: { file_path: filePath } } + ); + }; } export type { SlackChannel, DiscordChannel }; diff --git a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts index 14a247032..d4a80f8a0 100644 --- a/surfsense_web/lib/apis/documents-api.service.ts +++ b/surfsense_web/lib/apis/documents-api.service.ts @@ -379,6 +379,22 @@ class DocumentsApiService { }); }; + listDocumentVersions = async (documentId: number) => { + return baseApiService.get(`/api/v1/documents/${documentId}/versions`); + }; + + getDocumentVersion = async (documentId: number, versionNumber: number) => { + return baseApiService.get( + `/api/v1/documents/${documentId}/versions/${versionNumber}` + ); + }; + + restoreDocumentVersion = async (documentId: number, versionNumber: number) => { + return baseApiService.post( + `/api/v1/documents/${documentId}/versions/${versionNumber}/restore` + ); + }; + /** * Delete a document */ diff --git a/surfsense_web/lib/connectors/utils.ts b/surfsense_web/lib/connectors/utils.ts index 90f7f5d21..6ce78be67 100644 --- a/surfsense_web/lib/connectors/utils.ts +++ b/surfsense_web/lib/connectors/utils.ts @@ -30,6 +30,7 @@ export const getConnectorTypeDisplay = (type: string): string => { YOUTUBE_CONNECTOR: "YouTube", CIRCLEBACK_CONNECTOR: "Circleback", OBSIDIAN_CONNECTOR: "Obsidian", + LOCAL_FOLDER_CONNECTOR: "Local Folder", DROPBOX_CONNECTOR: "Dropbox", MCP_CONNECTOR: "MCP Server", }; diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts index 9cf1aa596..921449b41 
100644 --- a/surfsense_web/types/window.d.ts +++ b/surfsense_web/types/window.d.ts @@ -1,5 +1,30 @@ import type { PostHog } from "posthog-js"; +interface WatchedFolderConfig { + path: string; + name: string; + excludePatterns: string[]; + fileExtensions: string[] | null; + connectorId: number; + searchSpaceId: number; + active: boolean; +} + +interface FolderSyncFileChangedEvent { + connectorId: number; + searchSpaceId: number; + folderPath: string; + relativePath: string; + fullPath: string; + action: "add" | "change" | "unlink"; + timestamp: number; +} + +interface FolderSyncWatcherReadyEvent { + connectorId: number; + folderPath: string; +} + interface ElectronAPI { versions: { electron: string; @@ -14,6 +39,16 @@ interface ElectronAPI { setQuickAskMode: (mode: string) => Promise; getQuickAskMode: () => Promise; replaceText: (text: string) => Promise; + // Folder sync + selectFolder: () => Promise; + addWatchedFolder: (config: WatchedFolderConfig) => Promise; + removeWatchedFolder: (folderPath: string) => Promise; + getWatchedFolders: () => Promise; + getWatcherStatus: () => Promise<{ path: string; active: boolean; watching: boolean }[]>; + onFileChanged: (callback: (data: FolderSyncFileChangedEvent) => void) => () => void; + onWatcherReady: (callback: (data: FolderSyncWatcherReadyEvent) => void) => () => void; + pauseWatcher: () => Promise; + resumeWatcher: () => Promise; } declare global { From b93da843dc6125434d712ad8881bf248906782d1 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 11:55:29 +0530 Subject: [PATCH 030/202] feat: implement mtime tracking and synchronization for folder watcher --- .../src/modules/folder-watcher.ts | 172 +++++++++++++++++- 1 file changed, 164 insertions(+), 8 deletions(-) diff --git a/surfsense_desktop/src/modules/folder-watcher.ts b/surfsense_desktop/src/modules/folder-watcher.ts index bfd2136c9..072ae7b3f 100644 --- 
a/surfsense_desktop/src/modules/folder-watcher.ts +++ b/surfsense_desktop/src/modules/folder-watcher.ts @@ -1,5 +1,5 @@ import { BrowserWindow, dialog } from 'electron'; -import chokidar from 'chokidar'; +import chokidar, { type FSWatcher } from 'chokidar'; import * as path from 'path'; import * as fs from 'fs'; import { IPC_CHANNELS } from '../ipc/channels'; @@ -16,13 +16,24 @@ export interface WatchedFolderConfig { interface WatcherEntry { config: WatchedFolderConfig; - watcher: chokidar.FSWatcher | null; + watcher: FSWatcher | null; } +type MtimeMap = Record; + const STORE_KEY = 'watchedFolders'; +const MTIME_TOLERANCE_S = 1.0; + let store: any = null; +let mtimeStore: any = null; let watchers: Map = new Map(); +/** + * In-memory cache of mtime maps, keyed by folder path. + * Persisted to electron-store on mutation. + */ +const mtimeMaps: Map = new Map(); + async function getStore() { if (!store) { const { default: Store } = await import('electron-store'); @@ -36,6 +47,73 @@ async function getStore() { return store; } +async function getMtimeStore() { + if (!mtimeStore) { + const { default: Store } = await import('electron-store'); + mtimeStore = new Store({ + name: 'folder-mtime-maps', + defaults: {} as Record, + }); + } + return mtimeStore; +} + +function loadMtimeMap(folderPath: string): MtimeMap { + return mtimeMaps.get(folderPath) ?? {}; +} + +function persistMtimeMap(folderPath: string) { + const map = mtimeMaps.get(folderPath) ?? 
{}; + getMtimeStore().then((s) => s.set(folderPath, map)); +} + +function walkFolderMtimes(config: WatchedFolderConfig): MtimeMap { + const root = config.path; + const result: MtimeMap = {}; + const excludes = new Set(config.excludePatterns); + + function walk(dir: string) { + let entries: fs.Dirent[]; + try { + entries = fs.readdirSync(dir, { withFileTypes: true }); + } catch { + return; + } + + for (const entry of entries) { + const name = entry.name; + + // Skip dotfiles/dotdirs and excluded names + if (name.startsWith('.') || excludes.has(name)) continue; + + const full = path.join(dir, name); + + if (entry.isDirectory()) { + walk(full); + } else if (entry.isFile()) { + if ( + config.fileExtensions && + config.fileExtensions.length > 0 + ) { + const ext = path.extname(name).toLowerCase(); + if (!config.fileExtensions.includes(ext)) continue; + } + + try { + const stat = fs.statSync(full); + const rel = path.relative(root, full); + result[rel] = stat.mtimeMs; + } catch { + // File may have been removed between readdir and stat + } + } + } + } + + walk(root); + return result; +} + function getMainWindow(): BrowserWindow | null { const windows = BrowserWindow.getAllWindows(); return windows.length > 0 ? windows[0] : null; @@ -48,11 +126,16 @@ function sendToRenderer(channel: string, data: any) { } } -function startWatcher(config: WatchedFolderConfig) { +async function startWatcher(config: WatchedFolderConfig) { if (watchers.has(config.path)) { return; } + // Load persisted mtime map into memory before starting the watcher + const ms = await getMtimeStore(); + const storedMap: MtimeMap = ms.get(config.path) ?? 
{}; + mtimeMaps.set(config.path, { ...storedMap }); + const ignored = [ /(^|[/\\])\../, // dotfiles by default ...config.excludePatterns.map((p) => `**/${p}/**`), @@ -60,7 +143,7 @@ function startWatcher(config: WatchedFolderConfig) { const watcher = chokidar.watch(config.path, { persistent: true, - ignoreInitial: false, + ignoreInitial: true, awaitWriteFinish: { stabilityThreshold: 500, pollInterval: 100, @@ -72,6 +155,58 @@ function startWatcher(config: WatchedFolderConfig) { watcher.on('ready', () => { ready = true; + + // Detect offline changes by diffing current filesystem against stored mtime map + const currentMap = walkFolderMtimes(config); + const storedSnapshot = loadMtimeMap(config.path); + const now = Date.now(); + + for (const [rel, currentMtime] of Object.entries(currentMap)) { + const storedMtime = storedSnapshot[rel]; + if (storedMtime === undefined) { + // New file added while app was closed + sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, { + connectorId: config.connectorId, + searchSpaceId: config.searchSpaceId, + folderPath: config.path, + relativePath: rel, + fullPath: path.join(config.path, rel), + action: 'add', + timestamp: now, + }); + } else if (Math.abs(currentMtime - storedMtime) >= MTIME_TOLERANCE_S * 1000) { + // File modified while app was closed + sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, { + connectorId: config.connectorId, + searchSpaceId: config.searchSpaceId, + folderPath: config.path, + relativePath: rel, + fullPath: path.join(config.path, rel), + action: 'change', + timestamp: now, + }); + } + } + + for (const rel of Object.keys(storedSnapshot)) { + if (!(rel in currentMap)) { + // File deleted while app was closed + sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, { + connectorId: config.connectorId, + searchSpaceId: config.searchSpaceId, + folderPath: config.path, + relativePath: rel, + fullPath: path.join(config.path, rel), + action: 'unlink', + timestamp: now, + }); + } + } + + // Replace stored map 
with current filesystem state + mtimeMaps.set(config.path, currentMap); + persistMtimeMap(config.path); + sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_WATCHER_READY, { connectorId: config.connectorId, folderPath: config.path, @@ -91,6 +226,21 @@ function startWatcher(config: WatchedFolderConfig) { if (!config.fileExtensions.includes(ext)) return; } + // Keep mtime map in sync with live changes + const map = mtimeMaps.get(config.path); + if (map) { + if (action === 'unlink') { + delete map[relativePath]; + } else { + try { + map[relativePath] = fs.statSync(filePath).mtimeMs; + } catch { + // File may have been removed between event and stat + } + } + persistMtimeMap(config.path); + } + sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, { connectorId: config.connectorId, searchSpaceId: config.searchSpaceId, @@ -110,6 +260,7 @@ function startWatcher(config: WatchedFolderConfig) { } function stopWatcher(folderPath: string) { + persistMtimeMap(folderPath); const entry = watchers.get(folderPath); if (entry?.watcher) { entry.watcher.close(); @@ -144,7 +295,7 @@ export async function addWatchedFolder( s.set(STORE_KEY, folders); if (config.active) { - startWatcher(config); + await startWatcher(config); } return folders; @@ -160,6 +311,11 @@ export async function removeWatchedFolder( stopWatcher(folderPath); + // Clean up persisted mtime map for this folder + mtimeMaps.delete(folderPath); + const ms = await getMtimeStore(); + ms.delete(folderPath); + return updated; } @@ -190,9 +346,9 @@ export async function pauseWatcher(): Promise { } export async function resumeWatcher(): Promise { - for (const [folderPath, entry] of watchers) { + for (const [, entry] of watchers) { if (!entry.watcher && entry.config.active) { - startWatcher(entry.config); + await startWatcher(entry.config); } } } @@ -203,7 +359,7 @@ export async function registerFolderWatcher(): Promise { for (const config of folders) { if (config.active && fs.existsSync(config.path)) { - startWatcher(config); + await 
startWatcher(config); } } } From 543b8b9376eb4fd89698deffdfde0f6d9e69a2e8 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 12:08:18 +0530 Subject: [PATCH 031/202] feat: add real-time folder watcher registration and unregistration for Local Folder connector --- .../hooks/use-connector-dialog.ts | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 6543bbd72..2404b8eb5 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -586,6 +586,23 @@ export const useConnectorDialog = () => { }, }); + // Register folder watcher in Electron for real-time sync + if ( + currentConnectorType === EnumConnectorName.LOCAL_FOLDER_CONNECTOR && + window.electronAPI?.addWatchedFolder + ) { + const cfg = connector.config || {}; + await window.electronAPI.addWatchedFolder({ + path: cfg.folder_path as string, + name: cfg.folder_name as string, + excludePatterns: (cfg.exclude_patterns as string[]) || [], + fileExtensions: (cfg.file_extensions as string[] | null) ?? null, + connectorId: connector.id, + searchSpaceId: Number(searchSpaceId), + active: true, + }); + } + const successMessage = currentConnectorType === "MCP_CONNECTOR" ? 
`${connector.name} added successfully` @@ -1190,6 +1207,17 @@ export const useConnectorDialog = () => { id: editingConnector.id, }); + // Unregister folder watcher in Electron when removing a Local Folder connector + if ( + editingConnector.connector_type === EnumConnectorName.LOCAL_FOLDER_CONNECTOR && + window.electronAPI?.removeWatchedFolder && + editingConnector.config?.folder_path + ) { + await window.electronAPI.removeWatchedFolder( + editingConnector.config.folder_path as string + ); + } + // Track connector deleted event trackConnectorDeleted( Number(searchSpaceId), From 8e58094a861a1517e6587a8e8f55182c3cffed7b Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 12:13:14 +0530 Subject: [PATCH 032/202] refactor: update permission checks in document and connector routes to use specific permission values --- surfsense_backend/app/routes/documents_routes.py | 6 +++--- .../app/routes/search_source_connectors_routes.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 9271d4630..2d999eae3 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1156,7 +1156,7 @@ async def list_document_versions( if not document: raise HTTPException(status_code=404, detail="Document not found") - await check_permission(session, user, document.search_space_id, Permission.READ) + await check_permission(session, user, document.search_space_id, Permission.DOCUMENTS_READ.value) versions = ( await session.execute( @@ -1191,7 +1191,7 @@ async def get_document_version( if not document: raise HTTPException(status_code=404, detail="Document not found") - await check_permission(session, user, document.search_space_id, Permission.READ) + await check_permission(session, user, document.search_space_id, Permission.DOCUMENTS_READ.value) version = ( 
await session.execute( @@ -1229,7 +1229,7 @@ async def restore_document_version( if not document: raise HTTPException(status_code=404, detail="Document not found") - await check_permission(session, user, document.search_space_id, Permission.WRITE) + await check_permission(session, user, document.search_space_id, Permission.DOCUMENTS_UPDATE.value) version = ( await session.execute( diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 439d83ac1..5ea88c418 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -1355,7 +1355,7 @@ async def index_single_file( if not connector: raise HTTPException(status_code=404, detail="Local folder connector not found") - await check_permission(session, user, connector.search_space_id, Permission.WRITE) + await check_permission(session, user, connector.search_space_id, Permission.CONNECTORS_UPDATE.value) folder_path = connector.config.get("folder_path", "") From ccd0e3b807a1f0a9f27decb3c2be00600baa9ddb Mon Sep 17 00:00:00 2001 From: SohamBhattacharjee2003 <125297948+SohamBhattacharjee2003@users.noreply.github.com> Date: Thu, 2 Apr 2026 13:16:55 +0530 Subject: [PATCH 033/202] fix(connector-popup): remove unused currentUserAtom import --- surfsense_web/components/assistant-ui/connector-popup.tsx | 2 -- 1 file changed, 2 deletions(-) diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index d1f6dd31f..791775cd3 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -11,7 +11,6 @@ import { } from "@/atoms/new-llm-config/new-llm-config-query.atoms"; import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms"; import { searchSpaceSettingsDialogAtom } from 
"@/atoms/settings/settings-dialog.atoms"; -import { currentUserAtom } from "@/atoms/user/user-query.atoms"; import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; import { Button } from "@/components/ui/button"; import { Dialog, DialogContent, DialogTitle } from "@/components/ui/dialog"; @@ -47,7 +46,6 @@ export const ConnectorIndicator = forwardRef { const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom); const setSearchSpaceSettingsDialog = useSetAtom(searchSpaceSettingsDialogAtom); - useAtomValue(currentUserAtom); const { data: preferences = {}, isFetching: preferencesLoading } = useAtomValue(llmPreferencesAtom); const { data: globalConfigs = [], isFetching: globalConfigsLoading } = From f1be2652a0543cf3ebda60b653106de4e278675e Mon Sep 17 00:00:00 2001 From: SohamBhattacharjee2003 <125297948+SohamBhattacharjee2003@users.noreply.github.com> Date: Thu, 2 Apr 2026 13:31:48 +0530 Subject: [PATCH 034/202] fix(comment-composer): hoist RegExp out of loop to satisfy js-hoist-regexp rule --- .../comment-composer/comment-composer.tsx | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx index 3e9b4504f..bee3f2da6 100644 --- a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx +++ b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx @@ -15,13 +15,17 @@ function convertDisplayToData(displayContent: string, mentions: InsertedMention[ const sortedMentions = [...mentions].sort((a, b) => b.displayName.length - a.displayName.length); - for (const mention of sortedMentions) { - const displayPattern = new RegExp( + const mentionPatterns = sortedMentions.map((mention) => ({ + pattern: new RegExp( `@${escapeRegExp(mention.displayName)}(?=\\s|$|[.,!?;:])`, "g" - ); - const dataFormat = `@[${mention.id}]`; - result = 
result.replace(displayPattern, dataFormat); + ), + dataFormat: `@[${mention.id}]`, + })); + + for (const { pattern, dataFormat } of mentionPatterns) { + pattern.lastIndex = 0; // reset global regex state + result = result.replace(pattern, dataFormat); } return result; From 0cd2b8164da696daee5c2118782ed0a695185762 Mon Sep 17 00:00:00 2001 From: SohamBhattacharjee2003 <125297948+SohamBhattacharjee2003@users.noreply.github.com> Date: Thu, 2 Apr 2026 13:41:51 +0530 Subject: [PATCH 035/202] fix(ui): use scoped radix-ui imports to optimize bundle --- surfsense_web/components/ui/checkbox.tsx | 2 +- surfsense_web/components/ui/dropdown-menu.tsx | 2 +- surfsense_web/components/ui/separator.tsx | 2 +- surfsense_web/components/ui/toggle-group.tsx | 2 +- surfsense_web/components/ui/toggle.tsx | 2 +- surfsense_web/components/ui/tooltip.tsx | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/surfsense_web/components/ui/checkbox.tsx b/surfsense_web/components/ui/checkbox.tsx index 0936a383e..586e3e602 100644 --- a/surfsense_web/components/ui/checkbox.tsx +++ b/surfsense_web/components/ui/checkbox.tsx @@ -1,7 +1,7 @@ "use client"; import { CheckIcon } from "lucide-react"; -import { Checkbox as CheckboxPrimitive } from "radix-ui"; +import * as CheckboxPrimitive from "@radix-ui/react-checkbox"; import type * as React from "react"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/dropdown-menu.tsx b/surfsense_web/components/ui/dropdown-menu.tsx index 24b99467e..2904b93dd 100644 --- a/surfsense_web/components/ui/dropdown-menu.tsx +++ b/surfsense_web/components/ui/dropdown-menu.tsx @@ -1,7 +1,7 @@ "use client"; import { CheckIcon, ChevronRightIcon, CircleIcon } from "lucide-react"; -import { DropdownMenu as DropdownMenuPrimitive } from "radix-ui"; +import * as DropdownMenuPrimitive from "@radix-ui/react-dropdown-menu"; import type * as React from "react"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/separator.tsx 
b/surfsense_web/components/ui/separator.tsx index 63b8e6a69..d7cf2cd81 100644 --- a/surfsense_web/components/ui/separator.tsx +++ b/surfsense_web/components/ui/separator.tsx @@ -1,6 +1,6 @@ "use client"; -import { Separator as SeparatorPrimitive } from "radix-ui"; +import * as SeparatorPrimitive from "@radix-ui/react-separator"; import type * as React from "react"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/toggle-group.tsx b/surfsense_web/components/ui/toggle-group.tsx index eb212182a..33aa433b2 100644 --- a/surfsense_web/components/ui/toggle-group.tsx +++ b/surfsense_web/components/ui/toggle-group.tsx @@ -1,7 +1,7 @@ "use client"; import type { VariantProps } from "class-variance-authority"; -import { ToggleGroup as ToggleGroupPrimitive } from "radix-ui"; +import * as ToggleGroupPrimitive from "@radix-ui/react-toggle-group"; import * as React from "react"; import { toggleVariants } from "@/components/ui/toggle"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/toggle.tsx b/surfsense_web/components/ui/toggle.tsx index 5841cdc35..f0e68cec9 100644 --- a/surfsense_web/components/ui/toggle.tsx +++ b/surfsense_web/components/ui/toggle.tsx @@ -1,7 +1,7 @@ "use client"; import { cva, type VariantProps } from "class-variance-authority"; -import { Toggle as TogglePrimitive } from "radix-ui"; +import * as TogglePrimitive from "@radix-ui/react-toggle"; import type * as React from "react"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/tooltip.tsx b/surfsense_web/components/ui/tooltip.tsx index c19ce6f82..2fc85aae4 100644 --- a/surfsense_web/components/ui/tooltip.tsx +++ b/surfsense_web/components/ui/tooltip.tsx @@ -1,6 +1,6 @@ "use client"; -import { Tooltip as TooltipPrimitive } from "radix-ui"; +import * as TooltipPrimitive from "@radix-ui/react-tooltip"; import type * as React from "react"; import { cn } from "@/lib/utils"; From f4d197f7022b0ceec560925f38fe6c125a77b0fc Mon Sep 17 
00:00:00 2001 From: CREDO23 Date: Thu, 2 Apr 2026 13:18:20 +0200 Subject: [PATCH 036/202] feat: add native module support for desktop autocomplete --- surfsense_desktop/.npmrc | 1 + surfsense_desktop/electron-builder.yml | 13 +++++ surfsense_desktop/package.json | 8 +++- surfsense_desktop/pnpm-lock.yaml | 50 ++++++++++++++++++++ surfsense_desktop/scripts/build-electron.mjs | 2 +- 5 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 surfsense_desktop/.npmrc diff --git a/surfsense_desktop/.npmrc b/surfsense_desktop/.npmrc new file mode 100644 index 000000000..d67f37488 --- /dev/null +++ b/surfsense_desktop/.npmrc @@ -0,0 +1 @@ +node-linker=hoisted diff --git a/surfsense_desktop/electron-builder.yml b/surfsense_desktop/electron-builder.yml index eaca0f19b..74c69d223 100644 --- a/surfsense_desktop/electron-builder.yml +++ b/surfsense_desktop/electron-builder.yml @@ -9,6 +9,16 @@ directories: files: - dist/**/* - "!node_modules" + - node_modules/uiohook-napi/**/* + - "!node_modules/uiohook-napi/build" + - "!node_modules/uiohook-napi/src" + - "!node_modules/uiohook-napi/libuiohook" + - "!node_modules/uiohook-napi/binding.gyp" + - node_modules/node-gyp-build/**/* + - node_modules/node-mac-permissions/**/* + - "!node_modules/node-mac-permissions/build" + - "!node_modules/node-mac-permissions/src" + - "!node_modules/node-mac-permissions/binding.gyp" - "!src" - "!scripts" - "!release" @@ -29,6 +39,9 @@ extraResources: filter: ["**/*"] asarUnpack: - "**/*.node" + - "node_modules/uiohook-napi/**/*" + - "node_modules/node-gyp-build/**/*" + - "node_modules/node-mac-permissions/**/*" mac: icon: assets/icon.icns category: public.app-category.productivity diff --git a/surfsense_desktop/package.json b/surfsense_desktop/package.json index bd0cc67ab..a2e452b7c 100644 --- a/surfsense_desktop/package.json +++ b/surfsense_desktop/package.json @@ -11,12 +11,14 @@ "dist:mac": "pnpm build && electron-builder --mac --config electron-builder.yml", "dist:win": "pnpm build && 
electron-builder --win --config electron-builder.yml", "dist:linux": "pnpm build && electron-builder --linux --config electron-builder.yml", - "typecheck": "tsc --noEmit" + "typecheck": "tsc --noEmit", + "postinstall": "electron-rebuild" }, "author": "MODSetter", "license": "MIT", "packageManager": "pnpm@10.24.0", "devDependencies": { + "@electron/rebuild": "^4.0.3", "@types/node": "^25.5.0", "concurrently": "^9.2.1", "dotenv": "^17.3.1", @@ -28,6 +30,8 @@ }, "dependencies": { "electron-updater": "^6.8.3", - "get-port-please": "^3.2.0" + "get-port-please": "^3.2.0", + "node-mac-permissions": "^2.5.0", + "uiohook-napi": "^1.5.5" } } diff --git a/surfsense_desktop/pnpm-lock.yaml b/surfsense_desktop/pnpm-lock.yaml index ea65be0bb..82bad9456 100644 --- a/surfsense_desktop/pnpm-lock.yaml +++ b/surfsense_desktop/pnpm-lock.yaml @@ -14,7 +14,16 @@ importers: get-port-please: specifier: ^3.2.0 version: 3.2.0 + node-mac-permissions: + specifier: ^2.5.0 + version: 2.5.0 + uiohook-napi: + specifier: ^1.5.5 + version: 1.5.5 devDependencies: + '@electron/rebuild': + specifier: ^4.0.3 + version: 4.0.3 '@types/node': specifier: ^25.5.0 version: 25.5.0 @@ -343,6 +352,7 @@ packages: '@xmldom/xmldom@0.8.11': resolution: {integrity: sha512-cQzWCtO6C8TQiYl1ruKNn2U6Ao4o4WBBcbL61yJl84x+j5sOWWFU9X7DpND8XZG3daDppSsigMdfAIl2upQBRw==} engines: {node: '>=10.0.0'} + deprecated: this version has critical issues, please update to the latest version abbrev@3.0.1: resolution: {integrity: sha512-AO2ac6pjRB3SJmGJo+v5/aK6Omggp6fsLrs6wN9bd35ulu4cCwaAU9+7ZhXjeqHVkaHThLuzH0nZr0YpCDhygg==} @@ -424,6 +434,9 @@ packages: base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} + bindings@1.5.0: + resolution: {integrity: sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==} + bl@4.1.0: resolution: {integrity: 
sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==} @@ -738,6 +751,9 @@ packages: picomatch: optional: true + file-uri-to-path@1.0.0: + resolution: {integrity: sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==} + filelist@1.0.6: resolution: {integrity: sha512-5giy2PkLYY1cP39p17Ech+2xlpTRL9HLspOfEgm0L6CwBXBTgsK5ou0JtzYuepxkaQ/tvhCFIJ5uXo0OrM2DxA==} @@ -1103,14 +1119,25 @@ packages: node-addon-api@1.7.2: resolution: {integrity: sha512-ibPK3iA+vaY1eEjESkQkM0BbCqFOaZMiXRTtdB0u7b4djtY6JnsjvPdUHVMg6xQt3B8fpTTWHI9A+ADjM9frzg==} + node-addon-api@7.1.1: + resolution: {integrity: sha512-5m3bsyrjFWE1xf7nz7YXdN4udnVtXK6/Yfgn5qnahL6bCkf2yKt4k3nuTKAtT4r3IG8JNR2ncsIMdZuAzJjHQQ==} + node-api-version@0.2.1: resolution: {integrity: sha512-2xP/IGGMmmSQpI1+O/k72jF/ykvZ89JeuKX3TLJAYPDVLUalrshrLHkeVcCCZqG/eEa635cr8IBYzgnDvM2O8Q==} + node-gyp-build@4.8.4: + resolution: {integrity: sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ==} + hasBin: true + node-gyp@11.5.0: resolution: {integrity: sha512-ra7Kvlhxn5V9Slyus0ygMa2h+UqExPqUIkfk7Pc8QTLT956JLSy51uWFwHtIYy0vI8cB4BDhc/S03+880My/LQ==} engines: {node: ^18.17.0 || >=20.5.0} hasBin: true + node-mac-permissions@2.5.0: + resolution: {integrity: sha512-zR8SVCaN3WqV1xwWd04XVAdzm3UTdjbxciLrZtB0Cc7F2Kd34AJfhPD4hm1HU0YH3oGUZO4X9OBLY5ijSTHsGw==} + os: [darwin] + nopt@8.1.0: resolution: {integrity: sha512-ieGu42u/Qsa4TFktmaKEwM6MQH0pOWnaB3htzh0JRtx84+Mebc0cbZYN5bC+6WTZ4+77xrL9Pn5m7CV6VIkV7A==} engines: {node: ^18.17.0 || >=20.5.0} @@ -1424,6 +1451,10 @@ packages: engines: {node: '>=14.17'} hasBin: true + uiohook-napi@1.5.5: + resolution: {integrity: sha512-oSlTdnECw2GBfsJPTbBQBeE4v/EXP0EZmX6BJq5nzH/JgFaBE8JpFwEA/kLhiEP7HxQw28FViWiYgdIZzWuuJQ==} + engines: {node: '>= 16'} + undici-types@7.16.0: resolution: {integrity: sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==} @@ 
-1923,6 +1954,10 @@ snapshots: base64-js@1.5.1: {} + bindings@1.5.0: + dependencies: + file-uri-to-path: 1.0.0 + bl@4.1.0: dependencies: buffer: 5.7.1 @@ -2348,6 +2383,8 @@ snapshots: optionalDependencies: picomatch: 4.0.3 + file-uri-to-path@1.0.0: {} + filelist@1.0.6: dependencies: minimatch: 5.1.9 @@ -2739,10 +2776,14 @@ snapshots: node-addon-api@1.7.2: optional: true + node-addon-api@7.1.1: {} + node-api-version@0.2.1: dependencies: semver: 7.7.4 + node-gyp-build@4.8.4: {} + node-gyp@11.5.0: dependencies: env-paths: 2.2.1 @@ -2758,6 +2799,11 @@ snapshots: transitivePeerDependencies: - supports-color + node-mac-permissions@2.5.0: + dependencies: + bindings: 1.5.0 + node-addon-api: 7.1.1 + nopt@8.1.0: dependencies: abbrev: 3.0.1 @@ -3064,6 +3110,10 @@ snapshots: typescript@5.9.3: {} + uiohook-napi@1.5.5: + dependencies: + node-gyp-build: 4.8.4 + undici-types@7.16.0: {} undici-types@7.18.2: {} diff --git a/surfsense_desktop/scripts/build-electron.mjs b/surfsense_desktop/scripts/build-electron.mjs index 923830296..83d941dd2 100644 --- a/surfsense_desktop/scripts/build-electron.mjs +++ b/surfsense_desktop/scripts/build-electron.mjs @@ -104,7 +104,7 @@ async function buildElectron() { bundle: true, platform: 'node', target: 'node18', - external: ['electron'], + external: ['electron', 'uiohook-napi', 'node-mac-permissions'], sourcemap: true, minify: false, define: { From fbd033d0a4ed756a789879b1bba69c08bc71506d Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 2 Apr 2026 13:19:21 +0200 Subject: [PATCH 037/202] feat: add autocomplete streaming endpoint with KB context --- surfsense_backend/app/routes/__init__.py | 2 + .../app/routes/autocomplete_routes.py | 136 ++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 surfsense_backend/app/routes/autocomplete_routes.py diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py index 1937f11cb..a063b5976 100644 --- a/surfsense_backend/app/routes/__init__.py +++ 
b/surfsense_backend/app/routes/__init__.py @@ -3,6 +3,7 @@ from fastapi import APIRouter from .airtable_add_connector_route import ( router as airtable_add_connector_router, ) +from .autocomplete_routes import router as autocomplete_router from .chat_comments_routes import router as chat_comments_router from .circleback_webhook_route import router as circleback_webhook_router from .clickup_add_connector_route import router as clickup_add_connector_router @@ -95,3 +96,4 @@ router.include_router(incentive_tasks_router) # Incentive tasks for earning fre router.include_router(stripe_router) # Stripe checkout for additional page packs router.include_router(youtube_router) # YouTube playlist resolution router.include_router(prompts_router) +router.include_router(autocomplete_router) # Lightweight autocomplete with KB context diff --git a/surfsense_backend/app/routes/autocomplete_routes.py b/surfsense_backend/app/routes/autocomplete_routes.py new file mode 100644 index 000000000..9a285a723 --- /dev/null +++ b/surfsense_backend/app/routes/autocomplete_routes.py @@ -0,0 +1,136 @@ +import logging +from typing import AsyncGenerator + +from fastapi import APIRouter, Depends, Query +from fastapi.responses import StreamingResponse +from langchain_core.messages import HumanMessage, SystemMessage +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import User, get_async_session +from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever +from app.services.llm_service import get_agent_llm +from app.services.new_streaming_service import VercelStreamingService +from app.users import current_active_user + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/autocomplete", tags=["autocomplete"]) + +AUTOCOMPLETE_SYSTEM_PROMPT = """You are an inline text autocomplete engine. Your job is to complete the user's text naturally. + +Rules: +- Output ONLY the continuation text. Do NOT repeat what the user already typed. 
+- Keep completions concise: 1-3 sentences maximum. +- Match the user's tone, style, and language. +- If knowledge base context is provided, use it to make the completion factually accurate and personalized. +- Do NOT add quotes, explanations, or meta-commentary. +- Do NOT start with a space unless grammatically required. +- If you cannot produce a useful completion, output nothing.""" + +KB_CONTEXT_TEMPLATE = """ +Relevant knowledge base context (use this to personalize the completion): +--- +{kb_context} +--- +""" + + +async def _stream_autocomplete( + text: str, + cursor_position: int, + search_space_id: int, + session: AsyncSession, +) -> AsyncGenerator[str, None]: + """Stream an autocomplete response with KB context.""" + streaming_service = VercelStreamingService() + + try: + # Text before cursor is what we're completing + text_before_cursor = text[:cursor_position] if cursor_position >= 0 else text + + if not text_before_cursor.strip(): + yield streaming_service.format_message_start() + yield streaming_service.format_finish() + yield streaming_service.format_done() + return + + # Fast KB lookup: vector-only search, top 3 chunks, no planner LLM + kb_context = "" + try: + retriever = ChucksHybridSearchRetriever(session) + chunks = await retriever.vector_search( + query_text=text_before_cursor[-200:], # last 200 chars for relevance + top_k=3, + search_space_id=search_space_id, + ) + if chunks: + kb_snippets = [] + for chunk in chunks: + content = getattr(chunk, "content", None) or getattr(chunk, "chunk_text", "") + if content: + kb_snippets.append(content[:300]) + if kb_snippets: + kb_context = KB_CONTEXT_TEMPLATE.format( + kb_context="\n\n".join(kb_snippets) + ) + except Exception as e: + logger.warning(f"KB search failed for autocomplete, proceeding without context: {e}") + + # Get the search space's configured LLM + llm = await get_agent_llm(session, search_space_id) + if not llm: + yield streaming_service.format_message_start() + error_msg = "No LLM 
configured for this search space" + yield streaming_service.format_error(error_msg) + yield streaming_service.format_done() + return + + system_prompt = AUTOCOMPLETE_SYSTEM_PROMPT + if kb_context: + system_prompt += kb_context + + messages = [ + SystemMessage(content=system_prompt), + HumanMessage(content=f"Complete this text:\n{text_before_cursor}"), + ] + + # Stream the response + yield streaming_service.format_message_start() + text_id = streaming_service.generate_text_id() + yield streaming_service.format_text_start(text_id) + + async for chunk in llm.astream(messages): + token = chunk.content if hasattr(chunk, "content") else str(chunk) + if token: + yield streaming_service.format_text_delta(text_id, token) + + yield streaming_service.format_text_end(text_id) + yield streaming_service.format_finish() + yield streaming_service.format_done() + + except Exception as e: + logger.error(f"Autocomplete streaming error: {e}") + yield streaming_service.format_error(str(e)) + yield streaming_service.format_done() + + +@router.post("/stream") +async def autocomplete_stream( + text: str = Query(..., description="Current text in the input field"), + cursor_position: int = Query(-1, description="Cursor position in the text (-1 for end)"), + search_space_id: int = Query(..., description="Search space ID for KB context and LLM config"), + user: User = Depends(current_active_user), + session: AsyncSession = Depends(get_async_session), +): + """Stream an autocomplete suggestion based on the current text and KB context.""" + if cursor_position < 0: + cursor_position = len(text) + + return StreamingResponse( + _stream_autocomplete(text, cursor_position, search_space_id, session), + media_type="text/event-stream", + headers={ + **VercelStreamingService.get_response_headers(), + "X-Accel-Buffering": "no", + }, + ) From bcc227a4ddc34fbd6af3fb1115e01af0997bac85 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 2 Apr 2026 13:19:59 +0200 Subject: [PATCH 038/202] feat: add suggestion 
tooltip UI and autocomplete API types --- surfsense_web/app/suggestion/layout.tsx | 13 ++ surfsense_web/app/suggestion/page.tsx | 160 ++++++++++++++++++++ surfsense_web/app/suggestion/suggestion.css | 96 ++++++++++++ surfsense_web/types/window.d.ts | 6 + 4 files changed, 275 insertions(+) create mode 100644 surfsense_web/app/suggestion/layout.tsx create mode 100644 surfsense_web/app/suggestion/page.tsx create mode 100644 surfsense_web/app/suggestion/suggestion.css diff --git a/surfsense_web/app/suggestion/layout.tsx b/surfsense_web/app/suggestion/layout.tsx new file mode 100644 index 000000000..36b7e037b --- /dev/null +++ b/surfsense_web/app/suggestion/layout.tsx @@ -0,0 +1,13 @@ +import "./suggestion.css"; + +export const metadata = { + title: "SurfSense Suggestion", +}; + +export default function SuggestionLayout({ + children, +}: { + children: React.ReactNode; +}) { + return
{children}
; +} diff --git a/surfsense_web/app/suggestion/page.tsx b/surfsense_web/app/suggestion/page.tsx new file mode 100644 index 000000000..14dfab3af --- /dev/null +++ b/surfsense_web/app/suggestion/page.tsx @@ -0,0 +1,160 @@ +"use client"; + +import { useCallback, useEffect, useRef, useState } from "react"; +import { getBearerToken } from "@/lib/auth-utils"; + +type SSEEvent = + | { type: "text-delta"; id: string; delta: string } + | { type: "text-start"; id: string } + | { type: "text-end"; id: string } + | { type: "start"; messageId: string } + | { type: "finish" } + | { type: "error"; errorText: string }; + +export default function SuggestionPage() { + const [suggestion, setSuggestion] = useState(""); + const [isLoading, setIsLoading] = useState(true); + const [error, setError] = useState(null); + const abortRef = useRef(null); + + const fetchSuggestion = useCallback( + async (text: string, cursorPosition: number, searchSpaceId: string) => { + abortRef.current?.abort(); + const controller = new AbortController(); + abortRef.current = controller; + + setIsLoading(true); + setSuggestion(""); + setError(null); + + const token = getBearerToken(); + if (!token) { + setError("Not authenticated"); + setIsLoading(false); + return; + } + + const backendUrl = + process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000"; + + const params = new URLSearchParams({ + text, + cursor_position: String(cursorPosition), + search_space_id: searchSpaceId, + }); + + try { + const response = await fetch( + `${backendUrl}/api/v1/autocomplete/stream?${params}`, + { + method: "POST", + headers: { + Authorization: `Bearer ${token}`, + "Content-Type": "application/json", + }, + signal: controller.signal, + }, + ); + + if (!response.ok) { + setError(`Error: ${response.status}`); + setIsLoading(false); + return; + } + + if (!response.body) { + setError("No response body"); + setIsLoading(false); + return; + } + + const reader = response.body.getReader(); + const decoder = new 
TextDecoder(); + let buffer = ""; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + const events = buffer.split(/\r?\n\r?\n/); + buffer = events.pop() || ""; + + for (const event of events) { + const lines = event.split(/\r?\n/); + for (const line of lines) { + if (!line.startsWith("data: ")) continue; + const data = line.slice(6).trim(); + if (!data || data === "[DONE]") continue; + + try { + const parsed: SSEEvent = JSON.parse(data); + if (parsed.type === "text-delta") { + setSuggestion((prev) => { + const updated = prev + parsed.delta; + window.electronAPI?.updateSuggestionText?.(updated); + return updated; + }); + } else if (parsed.type === "error") { + setError(parsed.errorText); + } + } catch { + continue; + } + } + } + } + } catch (err) { + if (err instanceof DOMException && err.name === "AbortError") return; + setError("Failed to get suggestion"); + } finally { + setIsLoading(false); + } + }, + [], + ); + + useEffect(() => { + if (!window.electronAPI?.onAutocompleteContext) return; + + const cleanup = window.electronAPI.onAutocompleteContext((data) => { + const searchSpaceId = data.searchSpaceId || "1"; + fetchSuggestion(data.text, data.cursorPosition, searchSpaceId); + }); + + return cleanup; + }, [fetchSuggestion]); + + if (error) { + return ( +
+ {error} +
+ ); + } + + if (isLoading && !suggestion) { + return ( +
+
+ + + +
+
+ ); + } + + if (!suggestion) return null; + + return ( +
+

{suggestion}

+
+ Tab accept + · + Esc dismiss +
+
+ ); +} diff --git a/surfsense_web/app/suggestion/suggestion.css b/surfsense_web/app/suggestion/suggestion.css new file mode 100644 index 000000000..e9471e7f8 --- /dev/null +++ b/surfsense_web/app/suggestion/suggestion.css @@ -0,0 +1,96 @@ +.suggestion-body { + margin: 0; + padding: 0; + background: transparent; + overflow: hidden; + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; + -webkit-font-smoothing: antialiased; + user-select: none; + -webkit-app-region: no-drag; +} + +.suggestion-tooltip { + background: rgba(30, 30, 30, 0.95); + backdrop-filter: blur(12px); + -webkit-backdrop-filter: blur(12px); + border: 1px solid rgba(255, 255, 255, 0.1); + border-radius: 10px; + padding: 10px 14px; + margin: 4px; + max-width: 400px; + box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4), + 0 2px 8px rgba(0, 0, 0, 0.2); +} + +.suggestion-text { + color: rgba(255, 255, 255, 0.9); + font-size: 13px; + line-height: 1.5; + margin: 0 0 8px 0; + word-wrap: break-word; + white-space: pre-wrap; +} + +.suggestion-hint { + color: rgba(255, 255, 255, 0.4); + font-size: 11px; + display: flex; + align-items: center; + gap: 4px; +} + +.suggestion-key { + background: rgba(255, 255, 255, 0.1); + border: 1px solid rgba(255, 255, 255, 0.15); + border-radius: 3px; + padding: 1px 5px; + font-size: 10px; + font-weight: 500; + color: rgba(255, 255, 255, 0.6); +} + +.suggestion-separator { + margin: 0 2px; +} + +.suggestion-error { + border-color: rgba(255, 80, 80, 0.3); +} + +.suggestion-error-text { + color: rgba(255, 120, 120, 0.9); + font-size: 12px; +} + +.suggestion-loading { + display: flex; + gap: 4px; + padding: 4px 0; +} + +.suggestion-dot { + width: 5px; + height: 5px; + border-radius: 50%; + background: rgba(255, 255, 255, 0.4); + animation: suggestion-pulse 1.2s infinite ease-in-out; +} + +.suggestion-dot:nth-child(2) { + animation-delay: 0.15s; +} + +.suggestion-dot:nth-child(3) { + animation-delay: 0.3s; +} + +@keyframes suggestion-pulse { + 0%, 80%, 100% { + 
opacity: 0.3; + transform: scale(0.8); + } + 40% { + opacity: 1; + transform: scale(1); + } +} diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts index 9cf1aa596..a30358527 100644 --- a/surfsense_web/types/window.d.ts +++ b/surfsense_web/types/window.d.ts @@ -14,6 +14,12 @@ interface ElectronAPI { setQuickAskMode: (mode: string) => Promise; getQuickAskMode: () => Promise; replaceText: (text: string) => Promise; + onAutocompleteContext: (callback: (data: { text: string; cursorPosition: number; searchSpaceId?: string }) => void) => () => void; + acceptSuggestion: (text: string) => Promise; + dismissSuggestion: () => Promise; + updateSuggestionText: (text: string) => Promise; + setAutocompleteEnabled: (enabled: boolean) => Promise; + getAutocompleteEnabled: () => Promise; } declare global { From ec2b7851b6393147b09a9f24273fe84d4314371f Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 2 Apr 2026 13:26:32 +0200 Subject: [PATCH 039/202] feat: add macOS permission infrastructure for autocomplete --- surfsense_desktop/src/ipc/channels.ts | 5 ++ surfsense_desktop/src/ipc/handlers.ts | 22 +++++++++ surfsense_desktop/src/modules/permissions.ts | 50 ++++++++++++++++++++ surfsense_desktop/src/preload.ts | 5 ++ surfsense_web/types/window.d.ts | 9 ++++ 5 files changed, 91 insertions(+) create mode 100644 surfsense_desktop/src/modules/permissions.ts diff --git a/surfsense_desktop/src/ipc/channels.ts b/surfsense_desktop/src/ipc/channels.ts index 25ec1bc0e..a5209dcf3 100644 --- a/surfsense_desktop/src/ipc/channels.ts +++ b/surfsense_desktop/src/ipc/channels.ts @@ -6,4 +6,9 @@ export const IPC_CHANNELS = { SET_QUICK_ASK_MODE: 'set-quick-ask-mode', GET_QUICK_ASK_MODE: 'get-quick-ask-mode', REPLACE_TEXT: 'replace-text', + // Permissions + GET_PERMISSIONS_STATUS: 'get-permissions-status', + REQUEST_ACCESSIBILITY: 'request-accessibility', + REQUEST_INPUT_MONITORING: 'request-input-monitoring', + RESTART_APP: 'restart-app', } as const; diff --git 
a/surfsense_desktop/src/ipc/handlers.ts b/surfsense_desktop/src/ipc/handlers.ts index 18e343719..fc31329f1 100644 --- a/surfsense_desktop/src/ipc/handlers.ts +++ b/surfsense_desktop/src/ipc/handlers.ts @@ -1,5 +1,11 @@ import { app, ipcMain, shell } from 'electron'; import { IPC_CHANNELS } from './channels'; +import { + getPermissionsStatus, + requestAccessibility, + requestInputMonitoring, + restartApp, +} from '../modules/permissions'; export function registerIpcHandlers(): void { ipcMain.on(IPC_CHANNELS.OPEN_EXTERNAL, (_event, url: string) => { @@ -16,4 +22,20 @@ export function registerIpcHandlers(): void { ipcMain.handle(IPC_CHANNELS.GET_APP_VERSION, () => { return app.getVersion(); }); + + ipcMain.handle(IPC_CHANNELS.GET_PERMISSIONS_STATUS, () => { + return getPermissionsStatus(); + }); + + ipcMain.handle(IPC_CHANNELS.REQUEST_ACCESSIBILITY, () => { + requestAccessibility(); + }); + + ipcMain.handle(IPC_CHANNELS.REQUEST_INPUT_MONITORING, () => { + requestInputMonitoring(); + }); + + ipcMain.handle(IPC_CHANNELS.RESTART_APP, () => { + restartApp(); + }); } diff --git a/surfsense_desktop/src/modules/permissions.ts b/surfsense_desktop/src/modules/permissions.ts new file mode 100644 index 000000000..9a6159c9a --- /dev/null +++ b/surfsense_desktop/src/modules/permissions.ts @@ -0,0 +1,50 @@ +import { app } from 'electron'; + +type PermissionStatus = 'authorized' | 'denied' | 'not determined' | 'restricted' | 'limited'; + +export interface PermissionsStatus { + accessibility: PermissionStatus; + inputMonitoring: PermissionStatus; +} + +function isMac(): boolean { + return process.platform === 'darwin'; +} + +function getNodeMacPermissions() { + return require('node-mac-permissions'); +} + +export function getPermissionsStatus(): PermissionsStatus { + if (!isMac()) { + return { accessibility: 'authorized', inputMonitoring: 'authorized' }; + } + + const perms = getNodeMacPermissions(); + return { + accessibility: perms.getAuthStatus('accessibility'), + inputMonitoring: 
perms.getAuthStatus('input-monitoring'), + }; +} + +export function allPermissionsGranted(): boolean { + const status = getPermissionsStatus(); + return status.accessibility === 'authorized' && status.inputMonitoring === 'authorized'; +} + +export function requestAccessibility(): void { + if (!isMac()) return; + const perms = getNodeMacPermissions(); + perms.askForAccessibilityAccess(); +} + +export async function requestInputMonitoring(): Promise { + if (!isMac()) return 'authorized'; + const perms = getNodeMacPermissions(); + return perms.askForInputMonitoringAccess('listen'); +} + +export function restartApp(): void { + app.relaunch(); + app.exit(0); +} diff --git a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts index 264ec25b3..069276489 100644 --- a/surfsense_desktop/src/preload.ts +++ b/surfsense_desktop/src/preload.ts @@ -21,4 +21,9 @@ contextBridge.exposeInMainWorld('electronAPI', { setQuickAskMode: (mode: string) => ipcRenderer.invoke(IPC_CHANNELS.SET_QUICK_ASK_MODE, mode), getQuickAskMode: () => ipcRenderer.invoke(IPC_CHANNELS.GET_QUICK_ASK_MODE), replaceText: (text: string) => ipcRenderer.invoke(IPC_CHANNELS.REPLACE_TEXT, text), + // Permissions + getPermissionsStatus: () => ipcRenderer.invoke(IPC_CHANNELS.GET_PERMISSIONS_STATUS), + requestAccessibility: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_ACCESSIBILITY), + requestInputMonitoring: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_INPUT_MONITORING), + restartApp: () => ipcRenderer.invoke(IPC_CHANNELS.RESTART_APP), }); diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts index a30358527..8cf331b42 100644 --- a/surfsense_web/types/window.d.ts +++ b/surfsense_web/types/window.d.ts @@ -14,6 +14,15 @@ interface ElectronAPI { setQuickAskMode: (mode: string) => Promise; getQuickAskMode: () => Promise; replaceText: (text: string) => Promise; + // Permissions + getPermissionsStatus: () => Promise<{ + accessibility: 'authorized' | 'denied' | 'not determined' | 
'restricted' | 'limited'; + inputMonitoring: 'authorized' | 'denied' | 'not determined' | 'restricted' | 'limited'; + }>; + requestAccessibility: () => Promise; + requestInputMonitoring: () => Promise; + restartApp: () => Promise; + // Autocomplete onAutocompleteContext: (callback: (data: { text: string; cursorPosition: number; searchSpaceId?: string }) => void) => () => void; acceptSuggestion: (text: string) => Promise; dismissSuggestion: () => Promise; From eaabad38fcd2bfc8d6bef87f8ea60ea4d8192d78 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 2 Apr 2026 13:44:57 +0200 Subject: [PATCH 040/202] feat: add permission onboarding page and startup routing for macOS --- surfsense_desktop/src/ipc/handlers.ts | 4 +- surfsense_desktop/src/main.ts | 15 +- surfsense_desktop/src/modules/window.ts | 4 +- .../app/desktop/permissions/page.tsx | 212 ++++++++++++++++++ .../app/{ => desktop}/suggestion/layout.tsx | 0 .../app/{ => desktop}/suggestion/page.tsx | 0 .../{ => desktop}/suggestion/suggestion.css | 0 7 files changed, 228 insertions(+), 7 deletions(-) create mode 100644 surfsense_web/app/desktop/permissions/page.tsx rename surfsense_web/app/{ => desktop}/suggestion/layout.tsx (100%) rename surfsense_web/app/{ => desktop}/suggestion/page.tsx (100%) rename surfsense_web/app/{ => desktop}/suggestion/suggestion.css (100%) diff --git a/surfsense_desktop/src/ipc/handlers.ts b/surfsense_desktop/src/ipc/handlers.ts index fc31329f1..a6d82be4b 100644 --- a/surfsense_desktop/src/ipc/handlers.ts +++ b/surfsense_desktop/src/ipc/handlers.ts @@ -31,8 +31,8 @@ export function registerIpcHandlers(): void { requestAccessibility(); }); - ipcMain.handle(IPC_CHANNELS.REQUEST_INPUT_MONITORING, () => { - requestInputMonitoring(); + ipcMain.handle(IPC_CHANNELS.REQUEST_INPUT_MONITORING, async () => { + return await requestInputMonitoring(); }); ipcMain.handle(IPC_CHANNELS.RESTART_APP, () => { diff --git a/surfsense_desktop/src/main.ts b/surfsense_desktop/src/main.ts index 3ab41073b..bc164758b 
100644 --- a/surfsense_desktop/src/main.ts +++ b/surfsense_desktop/src/main.ts @@ -7,6 +7,7 @@ import { setupAutoUpdater } from './modules/auto-updater'; import { setupMenu } from './modules/menu'; import { registerQuickAsk, unregisterQuickAsk } from './modules/quick-ask'; import { registerIpcHandlers } from './ipc/handlers'; +import { allPermissionsGranted } from './modules/permissions'; registerGlobalErrorHandlers(); @@ -16,7 +17,13 @@ if (!setupDeepLinks()) { registerIpcHandlers(); -// App lifecycle +function getInitialPath(): string { + if (process.platform === 'darwin' && !allPermissionsGranted()) { + return '/desktop/permissions'; + } + return '/dashboard'; +} + app.whenReady().then(async () => { setupMenu(); try { @@ -26,7 +33,9 @@ app.whenReady().then(async () => { setTimeout(() => app.quit(), 0); return; } - createMainWindow(); + + const initialPath = getInitialPath(); + createMainWindow(initialPath); registerQuickAsk(); setupAutoUpdater(); @@ -34,7 +43,7 @@ app.whenReady().then(async () => { app.on('activate', () => { if (BrowserWindow.getAllWindows().length === 0) { - createMainWindow(); + createMainWindow(getInitialPath()); } }); }); diff --git a/surfsense_desktop/src/modules/window.ts b/surfsense_desktop/src/modules/window.ts index 245814cad..7a77773d8 100644 --- a/surfsense_desktop/src/modules/window.ts +++ b/surfsense_desktop/src/modules/window.ts @@ -12,7 +12,7 @@ export function getMainWindow(): BrowserWindow | null { return mainWindow; } -export function createMainWindow(): BrowserWindow { +export function createMainWindow(initialPath = '/dashboard'): BrowserWindow { mainWindow = new BrowserWindow({ width: 1280, height: 800, @@ -33,7 +33,7 @@ export function createMainWindow(): BrowserWindow { mainWindow?.show(); }); - mainWindow.loadURL(`http://localhost:${getServerPort()}/dashboard`); + mainWindow.loadURL(`http://localhost:${getServerPort()}${initialPath}`); mainWindow.webContents.setWindowOpenHandler(({ url }) => { if 
(url.startsWith('http://localhost')) { diff --git a/surfsense_web/app/desktop/permissions/page.tsx b/surfsense_web/app/desktop/permissions/page.tsx new file mode 100644 index 000000000..2bcdc42df --- /dev/null +++ b/surfsense_web/app/desktop/permissions/page.tsx @@ -0,0 +1,212 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { useRouter } from "next/navigation"; +import { Logo } from "@/components/Logo"; +import { Button } from "@/components/ui/button"; +import { Spinner } from "@/components/ui/spinner"; + +type PermissionStatus = "authorized" | "denied" | "not determined" | "restricted" | "limited"; + +interface PermissionsStatus { + accessibility: PermissionStatus; + inputMonitoring: PermissionStatus; +} + +const STEPS = [ + { + id: "input-monitoring", + title: "Input Monitoring", + description: "Helps you write faster by enriching your text with suggestions from your knowledge base.", + action: "requestInputMonitoring", + field: "inputMonitoring" as const, + }, + { + id: "accessibility", + title: "Accessibility", + description: "Lets you accept suggestions seamlessly, right where you're typing.", + action: "requestAccessibility", + field: "accessibility" as const, + }, +]; + +function StatusBadge({ status }: { status: PermissionStatus }) { + if (status === "authorized") { + return ( + + + Granted + + ); + } + if (status === "denied") { + return ( + + + Denied + + ); + } + return ( + + + Pending + + ); +} + +export default function DesktopPermissionsPage() { + const router = useRouter(); + const [permissions, setPermissions] = useState(null); + const [isElectron, setIsElectron] = useState(false); + + useEffect(() => { + if (!window.electronAPI) return; + setIsElectron(true); + + let interval: ReturnType | null = null; + + const isResolved = (s: string) => s === "authorized" || s === "restricted"; + + const poll = async () => { + const status = await window.electronAPI!.getPermissionsStatus(); + setPermissions(status); + + if 
(isResolved(status.accessibility) && isResolved(status.inputMonitoring)) { + if (interval) clearInterval(interval); + } + }; + + poll(); + interval = setInterval(poll, 2000); + return () => { if (interval) clearInterval(interval); }; + }, []); + + if (!isElectron) { + return ( +
+

This page is only available in the desktop app.

+
+ ); + } + + if (!permissions) { + return ( +
+ +
+ ); + } + + const allGranted = permissions.accessibility === "authorized" && permissions.inputMonitoring === "authorized"; + + const handleRequest = async (action: string) => { + if (action === "requestInputMonitoring") { + await window.electronAPI!.requestInputMonitoring(); + } else if (action === "requestAccessibility") { + await window.electronAPI!.requestAccessibility(); + } + }; + + const handleContinue = () => { + if (allGranted) { + window.electronAPI!.restartApp(); + } + }; + + const handleSkip = () => { + router.push("/dashboard"); + }; + + return ( +
+
+ {/* Header */} +
+ +
+

System Permissions

+

+ SurfSense needs two macOS permissions to provide system-wide autocomplete. +

+
+
+ + {/* Steps */} +
+ {STEPS.map((step, index) => { + const status = permissions[step.field]; + const isGranted = status === "authorized"; + + return ( +
+
+
+ + {isGranted ? "✓" : index + 1} + +
+

{step.title}

+

{step.description}

+
+
+ +
+ {!isGranted && ( +
+ + {status === "denied" && ( +

+ Toggle SurfSense on in System Settings to continue. +

+ )} +
+ )} +
+ ); + })} +
+ + {/* Footer */} +
+ {allGranted ? ( + <> + +

+ A restart is needed for permissions to take effect. +

+ + ) : ( + <> + + + + )} +
+
+
+ ); +} diff --git a/surfsense_web/app/suggestion/layout.tsx b/surfsense_web/app/desktop/suggestion/layout.tsx similarity index 100% rename from surfsense_web/app/suggestion/layout.tsx rename to surfsense_web/app/desktop/suggestion/layout.tsx diff --git a/surfsense_web/app/suggestion/page.tsx b/surfsense_web/app/desktop/suggestion/page.tsx similarity index 100% rename from surfsense_web/app/suggestion/page.tsx rename to surfsense_web/app/desktop/suggestion/page.tsx diff --git a/surfsense_web/app/suggestion/suggestion.css b/surfsense_web/app/desktop/suggestion/suggestion.css similarity index 100% rename from surfsense_web/app/suggestion/suggestion.css rename to surfsense_web/app/desktop/suggestion/suggestion.css From b2706b00a1bf793e8d1aa63235a0c53d5cc6766c Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 2 Apr 2026 14:29:12 +0200 Subject: [PATCH 041/202] feat: add autocomplete module with keystroke monitoring and IPC wiring --- surfsense_desktop/src/ipc/channels.ts | 7 + surfsense_desktop/src/main.ts | 3 + surfsense_desktop/src/modules/autocomplete.ts | 267 ++++++++++++++++++ surfsense_desktop/src/modules/platform.ts | 40 +++ surfsense_desktop/src/preload.ts | 13 + 5 files changed, 330 insertions(+) create mode 100644 surfsense_desktop/src/modules/autocomplete.ts diff --git a/surfsense_desktop/src/ipc/channels.ts b/surfsense_desktop/src/ipc/channels.ts index a5209dcf3..2965f516f 100644 --- a/surfsense_desktop/src/ipc/channels.ts +++ b/surfsense_desktop/src/ipc/channels.ts @@ -11,4 +11,11 @@ export const IPC_CHANNELS = { REQUEST_ACCESSIBILITY: 'request-accessibility', REQUEST_INPUT_MONITORING: 'request-input-monitoring', RESTART_APP: 'restart-app', + // Autocomplete + AUTOCOMPLETE_CONTEXT: 'autocomplete-context', + ACCEPT_SUGGESTION: 'accept-suggestion', + DISMISS_SUGGESTION: 'dismiss-suggestion', + UPDATE_SUGGESTION_TEXT: 'update-suggestion-text', + SET_AUTOCOMPLETE_ENABLED: 'set-autocomplete-enabled', + GET_AUTOCOMPLETE_ENABLED: 'get-autocomplete-enabled', } as 
const; diff --git a/surfsense_desktop/src/main.ts b/surfsense_desktop/src/main.ts index bc164758b..9623be82e 100644 --- a/surfsense_desktop/src/main.ts +++ b/surfsense_desktop/src/main.ts @@ -6,6 +6,7 @@ import { setupDeepLinks, handlePendingDeepLink } from './modules/deep-links'; import { setupAutoUpdater } from './modules/auto-updater'; import { setupMenu } from './modules/menu'; import { registerQuickAsk, unregisterQuickAsk } from './modules/quick-ask'; +import { registerAutocomplete, unregisterAutocomplete } from './modules/autocomplete'; import { registerIpcHandlers } from './ipc/handlers'; import { allPermissionsGranted } from './modules/permissions'; @@ -37,6 +38,7 @@ app.whenReady().then(async () => { const initialPath = getInitialPath(); createMainWindow(initialPath); registerQuickAsk(); + registerAutocomplete(); setupAutoUpdater(); handlePendingDeepLink(); @@ -56,4 +58,5 @@ app.on('window-all-closed', () => { app.on('will-quit', () => { unregisterQuickAsk(); + unregisterAutocomplete(); }); diff --git a/surfsense_desktop/src/modules/autocomplete.ts b/surfsense_desktop/src/modules/autocomplete.ts new file mode 100644 index 000000000..2b877723f --- /dev/null +++ b/surfsense_desktop/src/modules/autocomplete.ts @@ -0,0 +1,267 @@ +import { BrowserWindow, clipboard, ipcMain, screen, shell } from 'electron'; +import path from 'path'; +import { IPC_CHANNELS } from '../ipc/channels'; +import { allPermissionsGranted } from './permissions'; +import { getFieldContent, getFrontmostApp, hasAccessibilityPermission, simulatePaste } from './platform'; +import { getServerPort } from './server'; +import { getMainWindow } from './window'; + +const DEBOUNCE_MS = 600; +const TOOLTIP_WIDTH = 420; +const TOOLTIP_HEIGHT = 140; + +let uIOhook: any = null; +let UiohookKey: any = {}; +let IGNORED_KEYCODES: Set = new Set(); + +let suggestionWindow: BrowserWindow | null = null; +let debounceTimer: ReturnType | null = null; +let hookStarted = false; +let autocompleteEnabled = true; +let 
savedClipboard = ''; +let sourceApp = ''; +let pendingSuggestionText = ''; + +function loadUiohook(): boolean { + if (uIOhook) return true; + try { + const mod = require('uiohook-napi'); + uIOhook = mod.uIOhook; + UiohookKey = mod.UiohookKey; + IGNORED_KEYCODES = new Set([ + UiohookKey.Shift, UiohookKey.ShiftRight, + UiohookKey.Ctrl, UiohookKey.CtrlRight, + UiohookKey.Alt, UiohookKey.AltRight, + UiohookKey.Meta, UiohookKey.MetaRight, + UiohookKey.CapsLock, UiohookKey.NumLock, UiohookKey.ScrollLock, + UiohookKey.F1, UiohookKey.F2, UiohookKey.F3, UiohookKey.F4, + UiohookKey.F5, UiohookKey.F6, UiohookKey.F7, UiohookKey.F8, + UiohookKey.F9, UiohookKey.F10, UiohookKey.F11, UiohookKey.F12, + UiohookKey.PrintScreen, + UiohookKey.Insert, UiohookKey.Delete, + UiohookKey.Home, UiohookKey.End, + UiohookKey.PageUp, UiohookKey.PageDown, + UiohookKey.ArrowUp, UiohookKey.ArrowDown, + UiohookKey.ArrowLeft, UiohookKey.ArrowRight, + ]); + console.log('[autocomplete] uiohook-napi loaded'); + return true; + } catch (err) { + console.error('[autocomplete] Failed to load uiohook-napi:', err); + return false; + } +} + +function destroySuggestion(): void { + if (suggestionWindow && !suggestionWindow.isDestroyed()) { + suggestionWindow.close(); + } + suggestionWindow = null; +} + +function clampToScreen(x: number, y: number, w: number, h: number): { x: number; y: number } { + const display = screen.getDisplayNearestPoint({ x, y }); + const { x: dx, y: dy, width: dw, height: dh } = display.workArea; + return { + x: Math.max(dx, Math.min(x, dx + dw - w)), + y: Math.max(dy, Math.min(y, dy + dh - h)), + }; +} + +function createSuggestionWindow(x: number, y: number): BrowserWindow { + destroySuggestion(); + + const pos = clampToScreen(x, y + 20, TOOLTIP_WIDTH, TOOLTIP_HEIGHT); + + suggestionWindow = new BrowserWindow({ + width: TOOLTIP_WIDTH, + height: TOOLTIP_HEIGHT, + x: pos.x, + y: pos.y, + frame: false, + transparent: true, + focusable: false, + alwaysOnTop: true, + skipTaskbar: true, + 
resizable: false, + hasShadow: true, + type: 'panel', + webPreferences: { + preload: path.join(__dirname, 'preload.js'), + contextIsolation: true, + nodeIntegration: false, + sandbox: true, + }, + show: false, + }); + + suggestionWindow.loadURL(`http://localhost:${getServerPort()}/desktop/suggestion?t=${Date.now()}`); + + suggestionWindow.once('ready-to-show', () => { + suggestionWindow?.showInactive(); + }); + + suggestionWindow.webContents.setWindowOpenHandler(({ url }) => { + if (url.startsWith('http://localhost')) { + return { action: 'allow' }; + } + shell.openExternal(url); + return { action: 'deny' }; + }); + + suggestionWindow.on('closed', () => { + suggestionWindow = null; + }); + + return suggestionWindow; +} + +function clearDebounce(): void { + if (debounceTimer) { + clearTimeout(debounceTimer); + debounceTimer = null; + } +} + +function isSurfSenseWindow(): boolean { + const app = getFrontmostApp(); + return app === 'Electron' || app === 'SurfSense' || app === 'surfsense-desktop'; +} + +function onKeyDown(event: { keycode: number; ctrlKey?: boolean; metaKey?: boolean; altKey?: boolean }): void { + if (!autocompleteEnabled) return; + + if (event.keycode === UiohookKey.Tab && suggestionWindow && !suggestionWindow.isDestroyed()) { + if (pendingSuggestionText) { + acceptAndInject(pendingSuggestionText); + } + return; + } + + if (event.keycode === UiohookKey.Escape) { + if (suggestionWindow && !suggestionWindow.isDestroyed()) { + destroySuggestion(); + pendingSuggestionText = ''; + } + clearDebounce(); + return; + } + + if (IGNORED_KEYCODES.has(event.keycode)) return; + if (event.ctrlKey || event.metaKey || event.altKey) return; + if (isSurfSenseWindow()) return; + + if (suggestionWindow && !suggestionWindow.isDestroyed()) { + destroySuggestion(); + } + + clearDebounce(); + debounceTimer = setTimeout(() => { + triggerAutocomplete(); + }, DEBOUNCE_MS); +} + +async function triggerAutocomplete(): Promise { + if (!hasAccessibilityPermission()) return; + if 
(isSurfSenseWindow()) return; + + const fieldContent = getFieldContent(); + if (!fieldContent || !fieldContent.text.trim()) return; + if (fieldContent.text.trim().length < 5) return; + + sourceApp = getFrontmostApp(); + savedClipboard = clipboard.readText(); + + const cursor = screen.getCursorScreenPoint(); + const win = createSuggestionWindow(cursor.x, cursor.y); + + let searchSpaceId = '1'; + const mainWin = getMainWindow(); + if (mainWin && !mainWin.isDestroyed()) { + const mainUrl = mainWin.webContents.getURL(); + const match = mainUrl.match(/\/dashboard\/(\d+)/); + if (match) { + searchSpaceId = match[1]; + } + } + + win.webContents.once('did-finish-load', () => { + if (suggestionWindow && !suggestionWindow.isDestroyed()) { + suggestionWindow.webContents.send(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, { + text: fieldContent.text, + cursorPosition: fieldContent.cursorPosition, + searchSpaceId, + }); + } + }); +} + +async function acceptAndInject(text: string): Promise { + if (!sourceApp) return; + if (!hasAccessibilityPermission()) return; + + clipboard.writeText(text); + destroySuggestion(); + pendingSuggestionText = ''; + + try { + await new Promise((r) => setTimeout(r, 50)); + simulatePaste(); + await new Promise((r) => setTimeout(r, 100)); + clipboard.writeText(savedClipboard); + } catch { + clipboard.writeText(savedClipboard); + } +} + +function registerIpcHandlers(): void { + ipcMain.handle(IPC_CHANNELS.ACCEPT_SUGGESTION, async (_event, text: string) => { + await acceptAndInject(text); + }); + ipcMain.handle(IPC_CHANNELS.DISMISS_SUGGESTION, () => { + destroySuggestion(); + pendingSuggestionText = ''; + }); + ipcMain.handle(IPC_CHANNELS.UPDATE_SUGGESTION_TEXT, (_event, text: string) => { + pendingSuggestionText = text; + }); + ipcMain.handle(IPC_CHANNELS.SET_AUTOCOMPLETE_ENABLED, (_event, enabled: boolean) => { + autocompleteEnabled = enabled; + if (!enabled) { + clearDebounce(); + destroySuggestion(); + } + }); + 
ipcMain.handle(IPC_CHANNELS.GET_AUTOCOMPLETE_ENABLED, () => autocompleteEnabled); +} + +export function registerAutocomplete(): void { + registerIpcHandlers(); + + if (!allPermissionsGranted()) { + console.log('[autocomplete] Permissions not granted — hook not started'); + return; + } + + if (!loadUiohook()) { + console.error('[autocomplete] Cannot start: uiohook-napi failed to load'); + return; + } + + uIOhook.on('keydown', onKeyDown); + try { + uIOhook.start(); + hookStarted = true; + console.log('[autocomplete] uIOhook started'); + } catch (err) { + console.error('[autocomplete] uIOhook.start() failed:', err); + } +} + +export function unregisterAutocomplete(): void { + clearDebounce(); + destroySuggestion(); + if (uIOhook && hookStarted) { + try { uIOhook.stop(); } catch { /* already stopped */ } + } +} diff --git a/surfsense_desktop/src/modules/platform.ts b/surfsense_desktop/src/modules/platform.ts index 37e126799..262866d07 100644 --- a/surfsense_desktop/src/modules/platform.ts +++ b/surfsense_desktop/src/modules/platform.ts @@ -53,3 +53,43 @@ export function checkAccessibilityPermission(): boolean { if (process.platform !== 'darwin') return true; return systemPreferences.isTrustedAccessibilityClient(true); } + +export function hasAccessibilityPermission(): boolean { + if (process.platform !== 'darwin') return true; + return systemPreferences.isTrustedAccessibilityClient(false); +} + +export interface FieldContent { + text: string; + cursorPosition: number; +} + +export function getFieldContent(): FieldContent | null { + if (process.platform !== 'darwin') return null; + + try { + const text = execSync( + 'osascript -e \'tell application "System Events" to get value of attribute "AXValue" of focused UI element of first application process whose frontmost is true\'', + { timeout: 500 } + ).toString().trim(); + + let cursorPosition = text.length; + try { + const rangeStr = execSync( + 'osascript -e \'tell application "System Events" to get value of attribute 
"AXSelectedTextRange" of focused UI element of first application process whose frontmost is true\'', + { timeout: 500 } + ).toString().trim(); + + const locationMatch = rangeStr.match(/location[:\s]*(\d+)/i); + if (locationMatch) { + cursorPosition = parseInt(locationMatch[1], 10); + } + } catch { + // Fall back to end of text + } + + return { text, cursorPosition }; + } catch { + return null; + } +} diff --git a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts index 069276489..956afcc46 100644 --- a/surfsense_desktop/src/preload.ts +++ b/surfsense_desktop/src/preload.ts @@ -26,4 +26,17 @@ contextBridge.exposeInMainWorld('electronAPI', { requestAccessibility: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_ACCESSIBILITY), requestInputMonitoring: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_INPUT_MONITORING), restartApp: () => ipcRenderer.invoke(IPC_CHANNELS.RESTART_APP), + // Autocomplete + onAutocompleteContext: (callback: (data: { text: string; cursorPosition: number; searchSpaceId?: string }) => void) => { + const listener = (_event: unknown, data: { text: string; cursorPosition: number; searchSpaceId?: string }) => callback(data); + ipcRenderer.on(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener); + return () => { + ipcRenderer.removeListener(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener); + }; + }, + acceptSuggestion: (text: string) => ipcRenderer.invoke(IPC_CHANNELS.ACCEPT_SUGGESTION, text), + dismissSuggestion: () => ipcRenderer.invoke(IPC_CHANNELS.DISMISS_SUGGESTION), + updateSuggestionText: (text: string) => ipcRenderer.invoke(IPC_CHANNELS.UPDATE_SUGGESTION_TEXT, text), + setAutocompleteEnabled: (enabled: boolean) => ipcRenderer.invoke(IPC_CHANNELS.SET_AUTOCOMPLETE_ENABLED, enabled), + getAutocompleteEnabled: () => ipcRenderer.invoke(IPC_CHANNELS.GET_AUTOCOMPLETE_ENABLED), }); From 6899134a20605b619e372564666a25eba5bb76fa Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 2 Apr 2026 14:37:26 +0200 Subject: [PATCH 042/202] feat: add autocomplete 
toggle in desktop settings --- .../components/DesktopContent.tsx | 79 +++++++++++++++++++ .../settings/user-settings-dialog.tsx | 7 +- 2 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx new file mode 100644 index 000000000..1522e153f --- /dev/null +++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/DesktopContent.tsx @@ -0,0 +1,79 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; +import { Label } from "@/components/ui/label"; +import { Switch } from "@/components/ui/switch"; +import { Spinner } from "@/components/ui/spinner"; + +export function DesktopContent() { + const [isElectron, setIsElectron] = useState(false); + const [loading, setLoading] = useState(true); + const [enabled, setEnabled] = useState(true); + + useEffect(() => { + if (!window.electronAPI) { + setLoading(false); + return; + } + setIsElectron(true); + + window.electronAPI.getAutocompleteEnabled().then((val) => { + setEnabled(val); + setLoading(false); + }); + }, []); + + if (!isElectron) { + return ( +
+

+ Desktop settings are only available in the SurfSense desktop app. +

+
+ ); + } + + if (loading) { + return ( +
+ +
+ ); + } + + const handleToggle = async (checked: boolean) => { + setEnabled(checked); + await window.electronAPI!.setAutocompleteEnabled(checked); + }; + + return ( +
+ + + Autocomplete + + Get inline writing suggestions powered by your knowledge base as you type in any app. + + + +
+
+ +

+ Show suggestions while typing in other applications. +

+
+ +
+
+
+
+ ); +} diff --git a/surfsense_web/components/settings/user-settings-dialog.tsx b/surfsense_web/components/settings/user-settings-dialog.tsx index 389ebc5fd..b74ff973b 100644 --- a/surfsense_web/components/settings/user-settings-dialog.tsx +++ b/surfsense_web/components/settings/user-settings-dialog.tsx @@ -1,13 +1,14 @@ "use client"; import { useAtom } from "jotai"; -import { Globe, KeyRound, Receipt, Sparkles, User } from "lucide-react"; +import { Globe, KeyRound, Monitor, Receipt, Sparkles, User } from "lucide-react"; import { useTranslations } from "next-intl"; import { ApiKeyContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ApiKeyContent"; import { CommunityPromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent"; import { ProfileContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ProfileContent"; import { PromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PromptsContent"; import { PurchaseHistoryContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PurchaseHistoryContent"; +import { DesktopContent } from "@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent"; import { userSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms"; import { SettingsDialog } from "@/components/settings/settings-dialog"; @@ -37,6 +38,9 @@ export function UserSettingsDialog() { label: "Purchase History", icon: , }, + ...(typeof window !== "undefined" && window.electronAPI + ? [{ value: "desktop", label: "Desktop", icon: }] + : []), ]; return ( @@ -54,6 +58,7 @@ export function UserSettingsDialog() { {state.initialTab === "prompts" && } {state.initialTab === "community-prompts" && } {state.initialTab === "purchases" && } + {state.initialTab === "desktop" && }
); From 40ade4889e23b18ed43eaaa39a0489483fa5bbdc Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 21:01:31 +0530 Subject: [PATCH 043/202] feat: add LOCAL_FOLDER_FILE document type and update document_versions table management --- ...d_local_folder_connector_and_versioning.py | 33 +++++++------------ 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/surfsense_backend/alembic/versions/117_add_local_folder_connector_and_versioning.py b/surfsense_backend/alembic/versions/117_add_local_folder_connector_and_versioning.py index e97a4787c..a9da3beb4 100644 --- a/surfsense_backend/alembic/versions/117_add_local_folder_connector_and_versioning.py +++ b/surfsense_backend/alembic/versions/117_add_local_folder_connector_and_versioning.py @@ -1,4 +1,4 @@ -"""Add local folder connector enums and document_versions table +"""Add LOCAL_FOLDER_FILE document type and document_versions table Revision ID: 117 Revises: 116 @@ -21,23 +21,6 @@ PUBLICATION_NAME = "zero_publication" def upgrade() -> None: conn = op.get_bind() - # Add LOCAL_FOLDER_CONNECTOR to searchsourceconnectortype enum - op.execute( - """ - DO $$ - BEGIN - IF NOT EXISTS ( - SELECT 1 FROM pg_type t - JOIN pg_enum e ON t.oid = e.enumtypid - WHERE t.typname = 'searchsourceconnectortype' AND e.enumlabel = 'LOCAL_FOLDER_CONNECTOR' - ) THEN - ALTER TYPE searchsourceconnectortype ADD VALUE 'LOCAL_FOLDER_CONNECTOR'; - END IF; - END - $$; - """ - ) - # Add LOCAL_FOLDER_FILE to documenttype enum op.execute( """ @@ -126,9 +109,17 @@ def downgrade() -> None: {"name": PUBLICATION_NAME}, ).fetchone() if pub_exists: - op.execute( - f"ALTER PUBLICATION {PUBLICATION_NAME} DROP TABLE IF EXISTS document_versions" - ) + already_in_pub = conn.execute( + sa.text( + "SELECT 1 FROM pg_publication_tables " + "WHERE pubname = :name AND tablename = 'document_versions'" + ), + {"name": PUBLICATION_NAME}, + ).fetchone() + if already_in_pub: + op.execute( + f"ALTER 
PUBLICATION {PUBLICATION_NAME} DROP TABLE document_versions" + ) op.execute("DROP INDEX IF EXISTS ix_document_versions_created_at") op.execute("DROP INDEX IF EXISTS ix_document_versions_document_id") From 1ef0d913e7471c7df6b03b94647064bae76abb39 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 22:19:55 +0530 Subject: [PATCH 044/202] refactor: remove Local Folder connector components and related configurations from the UI --- .../(manage)/components/DocumentsFilters.tsx | 43 ++- .../components/local-folder-connect-form.tsx | 272 ------------------ .../connect-forms/connector-benefits.ts | 8 - .../connector-popup/connect-forms/index.tsx | 3 - .../components/local-folder-config.tsx | 163 ----------- .../connector-configs/index.tsx | 3 - .../views/connector-connect-view.tsx | 1 - .../views/connector-edit-view.tsx | 7 +- .../views/indexing-configuration-view.tsx | 7 +- 9 files changed, 35 insertions(+), 472 deletions(-) delete mode 100644 surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/local-folder-connect-form.tsx delete mode 100644 surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/local-folder-config.tsx diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx index b85af13b7..fcd3a39da 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx @@ -1,6 +1,6 @@ "use client"; -import { FolderPlus, ListFilter, Search, Upload, X } from "lucide-react"; +import { Eye, FolderPlus, ListFilter, Search, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; import React, { useCallback, useMemo, useRef, useState } from 
"react"; import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; @@ -19,6 +19,7 @@ export function DocumentsFilters({ onToggleType, activeTypes, onCreateFolder, + onWatchFolder, }: { typeCounts: Partial>; onSearch: (v: string) => void; @@ -26,6 +27,7 @@ export function DocumentsFilters({ onToggleType: (type: DocumentTypeEnum, checked: boolean) => void; activeTypes: DocumentTypeEnum[]; onCreateFolder?: () => void; + onWatchFolder?: () => void; }) { const t = useTranslations("documents"); const id = React.useId(); @@ -214,17 +216,34 @@ export function DocumentsFilters({ )} - {/* Upload Button */} - + {/* Watch Folder Button (desktop only) */} + {onWatchFolder && ( + + + + + Watch folder + + )} + + {/* Upload Button */} +
); diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/local-folder-connect-form.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/local-folder-connect-form.tsx deleted file mode 100644 index 2e893c1c0..000000000 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/components/local-folder-connect-form.tsx +++ /dev/null @@ -1,272 +0,0 @@ -"use client"; - -import { zodResolver } from "@hookform/resolvers/zod"; -import { FolderSync, Info } from "lucide-react"; -import type { FC } from "react"; -import { useRef } from "react"; -import { useForm } from "react-hook-form"; -import * as z from "zod"; -import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"; -import { Button } from "@/components/ui/button"; -import { - Form, - FormControl, - FormDescription, - FormField, - FormItem, - FormLabel, - FormMessage, -} from "@/components/ui/form"; -import { Input } from "@/components/ui/input"; -import { EnumConnectorName } from "@/contracts/enums/connector"; -import { getConnectorBenefits } from "../connector-benefits"; -import type { ConnectFormProps } from "../index"; - -const localFolderFormSchema = z.object({ - name: z.string().min(3, { - message: "Connector name must be at least 3 characters.", - }), - folder_path: z.string().min(1, { - message: "Folder path is required.", - }), - folder_name: z.string().min(1, { - message: "Folder name is required.", - }), - exclude_patterns: z.string().optional(), - file_extensions: z.string().optional(), -}); - -type LocalFolderFormValues = z.infer; - -export const LocalFolderConnectForm: FC = ({ onSubmit, isSubmitting }) => { - const isSubmittingRef = useRef(false); - const isElectron = typeof window !== "undefined" && !!window.electronAPI; - - const form = useForm({ - resolver: zodResolver(localFolderFormSchema), - defaultValues: { - name: "Local Folder", - folder_path: "", - folder_name: "", - exclude_patterns: 
"node_modules,.git,.DS_Store", - file_extensions: "", - }, - }); - - const handleBrowse = async () => { - if (!isElectron) return; - const selected = await window.electronAPI!.selectFolder(); - if (selected) { - form.setValue("folder_path", selected); - const folderName = selected.split(/[\\/]/).pop() || "folder"; - if (!form.getValues("folder_name")) { - form.setValue("folder_name", folderName); - } - if (form.getValues("name") === "Local Folder") { - form.setValue("name", folderName); - } - } - }; - - const handleSubmit = async (values: LocalFolderFormValues) => { - if (isSubmittingRef.current || isSubmitting) return; - isSubmittingRef.current = true; - - try { - const excludePatterns = values.exclude_patterns - ? values.exclude_patterns - .split(",") - .map((p) => p.trim()) - .filter(Boolean) - : []; - - const fileExtensions = values.file_extensions - ? values.file_extensions - .split(",") - .map((e) => { - const ext = e.trim(); - return ext.startsWith(".") ? ext : `.${ext}`; - }) - .filter(Boolean) - : null; - - await onSubmit({ - name: values.name, - connector_type: EnumConnectorName.LOCAL_FOLDER_CONNECTOR, - config: { - folder_path: values.folder_path, - folder_name: values.folder_name, - exclude_patterns: excludePatterns, - file_extensions: fileExtensions, - }, - is_indexable: true, - is_active: true, - last_indexed_at: null, - periodic_indexing_enabled: false, - indexing_frequency_minutes: null, - next_scheduled_at: null, - }); - } finally { - isSubmittingRef.current = false; - } - }; - - return ( -
- - - Desktop App Required - - Real-time file watching is powered by the SurfSense desktop app. Files are - automatically synced whenever changes are detected. - - - -
-
- - ( - - Connector Name - - - - - - )} - /> - - ( - - Folder Path -
- - - - {isElectron && ( - - )} -
- - The absolute path to the folder to watch and sync. - - -
- )} - /> - - ( - - Display Name - - - - - A friendly name shown in the documents sidebar. - - - - )} - /> - - ( - - Exclude Patterns - - - - - Comma-separated patterns of directories/files to exclude. - - - - )} - /> - - ( - - File Extensions (optional) - - - - - Leave empty to index all supported files, or specify comma-separated extensions. - - - - )} - /> - - - -
- - {getConnectorBenefits(EnumConnectorName.LOCAL_FOLDER_CONNECTOR) && ( -
-

- What you get with Local Folder sync: -

-
    - {getConnectorBenefits(EnumConnectorName.LOCAL_FOLDER_CONNECTOR)?.map( - (benefit) =>
  • {benefit}
  • - )} -
-
- )} -
- ); -}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts index 40c6a7fdd..0dc093100 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/connector-benefits.ts @@ -111,14 +111,6 @@ export function getConnectorBenefits(connectorType: string): string[] | null { "Incremental sync - only changed files are re-indexed", "Full support for your vault's folder structure", ], - LOCAL_FOLDER_CONNECTOR: [ - "Watch local folders for real-time changes via the desktop app", - "Automatic change detection — only modified files are re-indexed", - "Version history with up to 20 snapshots per document", - "Mirrors your folder structure in the SurfSense sidebar", - "Supports any text-based file format", - "Works as a periodic sync fallback when the desktop app is not running", - ], }; return benefits[connectorType] || null; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/index.tsx b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/index.tsx index 116893399..b6d813748 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connect-forms/index.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connect-forms/index.tsx @@ -7,7 +7,6 @@ import { GithubConnectForm } from "./components/github-connect-form"; import { LinkupApiConnectForm } from "./components/linkup-api-connect-form"; import { LumaConnectForm } from "./components/luma-connect-form"; import { MCPConnectForm } from "./components/mcp-connect-form"; -import { LocalFolderConnectForm } from "./components/local-folder-connect-form"; import { ObsidianConnectForm } from "./components/obsidian-connect-form"; import { TavilyApiConnectForm } from "./components/tavily-api-connect-form"; @@ -59,8 +58,6 
@@ export function getConnectFormComponent(connectorType: string): ConnectFormCompo return MCPConnectForm; case "OBSIDIAN_CONNECTOR": return ObsidianConnectForm; - case "LOCAL_FOLDER_CONNECTOR": - return LocalFolderConnectForm; default: return null; } diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/local-folder-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/local-folder-config.tsx deleted file mode 100644 index cb4295079..000000000 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/local-folder-config.tsx +++ /dev/null @@ -1,163 +0,0 @@ -"use client"; - -import type { FC } from "react"; -import { useState } from "react"; -import { FolderSync } from "lucide-react"; -import { Button } from "@/components/ui/button"; -import { Input } from "@/components/ui/input"; -import { Label } from "@/components/ui/label"; -import type { ConnectorConfigProps } from "../index"; - -export const LocalFolderConfig: FC = ({ - connector, - onConfigChange, - onNameChange, -}) => { - const isElectron = typeof window !== "undefined" && !!window.electronAPI; - - const [folderPath, setFolderPath] = useState( - (connector.config?.folder_path as string) || "" - ); - const [folderName, setFolderName] = useState( - (connector.config?.folder_name as string) || "" - ); - const [excludePatterns, setExcludePatterns] = useState(() => { - const patterns = connector.config?.exclude_patterns; - if (Array.isArray(patterns)) { - return patterns.join(", "); - } - return (patterns as string) || "node_modules, .git, .DS_Store"; - }); - const [fileExtensions, setFileExtensions] = useState(() => { - const exts = connector.config?.file_extensions; - if (Array.isArray(exts)) { - return exts.join(", "); - } - return (exts as string) || ""; - }); - const [name, setName] = useState(connector.name || ""); - - const handleFolderPathChange = (value: string) => { - setFolderPath(value); 
- onConfigChange?.({ ...connector.config, folder_path: value }); - }; - - const handleFolderNameChange = (value: string) => { - setFolderName(value); - onConfigChange?.({ ...connector.config, folder_name: value }); - }; - - const handleExcludePatternsChange = (value: string) => { - setExcludePatterns(value); - const arr = value - .split(",") - .map((p) => p.trim()) - .filter(Boolean); - onConfigChange?.({ ...connector.config, exclude_patterns: arr }); - }; - - const handleFileExtensionsChange = (value: string) => { - setFileExtensions(value); - const arr = value - ? value - .split(",") - .map((e) => { - const ext = e.trim(); - return ext.startsWith(".") ? ext : `.${ext}`; - }) - .filter(Boolean) - : null; - onConfigChange?.({ ...connector.config, file_extensions: arr }); - }; - - const handleNameChange = (value: string) => { - setName(value); - onNameChange?.(value); - }; - - const handleBrowse = async () => { - if (!isElectron) return; - const selected = await window.electronAPI!.selectFolder(); - if (selected) { - handleFolderPathChange(selected); - const autoName = selected.split(/[\\/]/).pop() || "folder"; - if (!folderName) handleFolderNameChange(autoName); - } - }; - - return ( -
-
-
- - handleNameChange(e.target.value)} - placeholder="Local Folder" - className="border-slate-400/20 focus-visible:border-slate-400/40" - /> -
-
- -
-

Folder Configuration

- -
-
- -
- handleFolderPathChange(e.target.value)} - placeholder="/path/to/your/folder" - className="border-slate-400/20 focus-visible:border-slate-400/40 font-mono flex-1" - /> - {isElectron && ( - - )} -
-
- -
- - handleFolderNameChange(e.target.value)} - placeholder="My Notes" - className="border-slate-400/20 focus-visible:border-slate-400/40" - /> -
- -
- - handleExcludePatternsChange(e.target.value)} - placeholder="node_modules, .git, .DS_Store" - className="border-slate-400/20 focus-visible:border-slate-400/40 font-mono" - /> -

- Comma-separated patterns of directories/files to exclude. -

-
- -
- - handleFileExtensionsChange(e.target.value)} - placeholder=".md, .txt, .rst" - className="border-slate-400/20 focus-visible:border-slate-400/40 font-mono" - /> -

- Leave empty to index all supported files. -

-
-
-
-
- ); -}; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx index 3dc1891c8..a63435260 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/index.tsx @@ -19,7 +19,6 @@ import { JiraConfig } from "./components/jira-config"; import { LinkupApiConfig } from "./components/linkup-api-config"; import { LumaConfig } from "./components/luma-config"; import { MCPConfig } from "./components/mcp-config"; -import { LocalFolderConfig } from "./components/local-folder-config"; import { ObsidianConfig } from "./components/obsidian-config"; import { OneDriveConfig } from "./components/onedrive-config"; import { SlackConfig } from "./components/slack-config"; @@ -83,8 +82,6 @@ export function getConnectorConfigComponent( return MCPConfig; case "OBSIDIAN_CONNECTOR": return ObsidianConfig; - case "LOCAL_FOLDER_CONNECTOR": - return LocalFolderConfig; case "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": return ComposioDriveConfig; case "COMPOSIO_GMAIL_CONNECTOR": diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx index 0b6d0917a..596b98e93 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-connect-view.tsx @@ -20,7 +20,6 @@ const FORM_ID_MAP: Record = { CIRCLEBACK_CONNECTOR: "circleback-connect-form", MCP_CONNECTOR: "mcp-connect-form", OBSIDIAN_CONNECTOR: "obsidian-connect-form", - LOCAL_FOLDER_CONNECTOR: "local-folder-connect-form", }; interface ConnectorConnectViewProps { diff --git 
a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index dcedb4743..05d42adcb 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -278,8 +278,7 @@ export const ConnectorEditView: FC = ({ connector.connector_type !== "DROPBOX_CONNECTOR" && connector.connector_type !== "ONEDRIVE_CONNECTOR" && connector.connector_type !== "WEBCRAWLER_CONNECTOR" && - connector.connector_type !== "GITHUB_CONNECTOR" && - connector.connector_type !== "LOCAL_FOLDER_CONNECTOR" && ( + connector.connector_type !== "GITHUB_CONNECTOR" && ( = ({ /> )} - {/* Periodic sync - shown for all indexable connectors except Local Folder */} - {connector.connector_type !== "LOCAL_FOLDER_CONNECTOR" && - (() => { + {(() => { const isGoogleDrive = connector.connector_type === "GOOGLE_DRIVE_CONNECTOR"; const isComposioGoogleDrive = connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"; diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index 436ce7843..e583cbe17 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -164,8 +164,7 @@ export const IndexingConfigurationView: FC = ({ config.connectorType !== "DROPBOX_CONNECTOR" && config.connectorType !== "ONEDRIVE_CONNECTOR" && config.connectorType !== "WEBCRAWLER_CONNECTOR" && - config.connectorType !== "GITHUB_CONNECTOR" && - config.connectorType !== 
"LOCAL_FOLDER_CONNECTOR" && ( + config.connectorType !== "GITHUB_CONNECTOR" && ( = ({ /> )} - {/* Periodic sync - not shown for file-based connectors (Drive, Dropbox, OneDrive) or Local Folder in initial setup; configured in edit view instead */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && config.connectorType !== "DROPBOX_CONNECTOR" && - config.connectorType !== "ONEDRIVE_CONNECTOR" && - config.connectorType !== "LOCAL_FOLDER_CONNECTOR" && ( + config.connectorType !== "ONEDRIVE_CONNECTOR" && ( Date: Thu, 2 Apr 2026 22:20:11 +0530 Subject: [PATCH 045/202] feat: add renderer readiness signaling and update IPC channels for folder sync --- surfsense_desktop/src/ipc/channels.ts | 1 + surfsense_desktop/src/ipc/handlers.ts | 5 ++ .../src/modules/folder-watcher.ts | 61 +++++++++++++------ surfsense_desktop/src/preload.ts | 1 + 4 files changed, 48 insertions(+), 20 deletions(-) diff --git a/surfsense_desktop/src/ipc/channels.ts b/surfsense_desktop/src/ipc/channels.ts index 362d3362d..66788d90e 100644 --- a/surfsense_desktop/src/ipc/channels.ts +++ b/surfsense_desktop/src/ipc/channels.ts @@ -16,4 +16,5 @@ export const IPC_CHANNELS = { FOLDER_SYNC_WATCHER_READY: 'folder-sync:watcher-ready', FOLDER_SYNC_PAUSE: 'folder-sync:pause', FOLDER_SYNC_RESUME: 'folder-sync:resume', + FOLDER_SYNC_RENDERER_READY: 'folder-sync:renderer-ready', } as const; diff --git a/surfsense_desktop/src/ipc/handlers.ts b/surfsense_desktop/src/ipc/handlers.ts index 2baf957b0..19051e871 100644 --- a/surfsense_desktop/src/ipc/handlers.ts +++ b/surfsense_desktop/src/ipc/handlers.ts @@ -8,6 +8,7 @@ import { getWatcherStatus, pauseWatcher, resumeWatcher, + markRendererReady, } from '../modules/folder-watcher'; export function registerIpcHandlers(): void { @@ -44,4 +45,8 @@ export function registerIpcHandlers(): void { ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_PAUSE, () => pauseWatcher()); ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_RESUME, () 
=> resumeWatcher()); + + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_RENDERER_READY, () => { + markRendererReady(); + }); } diff --git a/surfsense_desktop/src/modules/folder-watcher.ts b/surfsense_desktop/src/modules/folder-watcher.ts index 072ae7b3f..81a835c22 100644 --- a/surfsense_desktop/src/modules/folder-watcher.ts +++ b/surfsense_desktop/src/modules/folder-watcher.ts @@ -9,7 +9,7 @@ export interface WatchedFolderConfig { name: string; excludePatterns: string[]; fileExtensions: string[] | null; - connectorId: number; + rootFolderId: number | null; searchSpaceId: number; active: boolean; } @@ -34,6 +34,25 @@ let watchers: Map = new Map(); */ const mtimeMaps: Map = new Map(); +let rendererReady = false; +const pendingEvents: any[] = []; + +export function markRendererReady() { + rendererReady = true; + for (const event of pendingEvents) { + sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, event); + } + pendingEvents.length = 0; +} + +function sendFileChangedEvent(data: any) { + if (rendererReady) { + sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, data); + } else { + pendingEvents.push(data); + } +} + async function getStore() { if (!store) { const { default: Store } = await import('electron-store'); @@ -83,7 +102,6 @@ function walkFolderMtimes(config: WatchedFolderConfig): MtimeMap { for (const entry of entries) { const name = entry.name; - // Skip dotfiles/dotdirs and excluded names if (name.startsWith('.') || excludes.has(name)) continue; const full = path.join(dir, name); @@ -131,7 +149,6 @@ async function startWatcher(config: WatchedFolderConfig) { return; } - // Load persisted mtime map into memory before starting the watcher const ms = await getMtimeStore(); const storedMap: MtimeMap = ms.get(config.path) ?? 
{}; mtimeMaps.set(config.path, { ...storedMap }); @@ -156,45 +173,49 @@ async function startWatcher(config: WatchedFolderConfig) { watcher.on('ready', () => { ready = true; - // Detect offline changes by diffing current filesystem against stored mtime map const currentMap = walkFolderMtimes(config); const storedSnapshot = loadMtimeMap(config.path); const now = Date.now(); + // Track which files are unchanged so we can selectively update the mtime map + const unchangedMap: MtimeMap = {}; + for (const [rel, currentMtime] of Object.entries(currentMap)) { const storedMtime = storedSnapshot[rel]; if (storedMtime === undefined) { - // New file added while app was closed - sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, { - connectorId: config.connectorId, + sendFileChangedEvent({ + rootFolderId: config.rootFolderId, searchSpaceId: config.searchSpaceId, folderPath: config.path, + folderName: config.name, relativePath: rel, fullPath: path.join(config.path, rel), action: 'add', timestamp: now, }); } else if (Math.abs(currentMtime - storedMtime) >= MTIME_TOLERANCE_S * 1000) { - // File modified while app was closed - sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, { - connectorId: config.connectorId, + sendFileChangedEvent({ + rootFolderId: config.rootFolderId, searchSpaceId: config.searchSpaceId, folderPath: config.path, + folderName: config.name, relativePath: rel, fullPath: path.join(config.path, rel), action: 'change', timestamp: now, }); + } else { + unchangedMap[rel] = currentMtime; } } for (const rel of Object.keys(storedSnapshot)) { if (!(rel in currentMap)) { - // File deleted while app was closed - sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, { - connectorId: config.connectorId, + sendFileChangedEvent({ + rootFolderId: config.rootFolderId, searchSpaceId: config.searchSpaceId, folderPath: config.path, + folderName: config.name, relativePath: rel, fullPath: path.join(config.path, rel), action: 'unlink', @@ -203,12 +224,13 @@ async function 
startWatcher(config: WatchedFolderConfig) { } } - // Replace stored map with current filesystem state - mtimeMaps.set(config.path, currentMap); + // Only update the mtime map for unchanged files; changed files keep their + // stored mtime so they'll be re-detected if the app crashes before indexing. + mtimeMaps.set(config.path, unchangedMap); persistMtimeMap(config.path); sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_WATCHER_READY, { - connectorId: config.connectorId, + rootFolderId: config.rootFolderId, folderPath: config.path, }); }); @@ -226,7 +248,6 @@ async function startWatcher(config: WatchedFolderConfig) { if (!config.fileExtensions.includes(ext)) return; } - // Keep mtime map in sync with live changes const map = mtimeMaps.get(config.path); if (map) { if (action === 'unlink') { @@ -241,10 +262,11 @@ async function startWatcher(config: WatchedFolderConfig) { persistMtimeMap(config.path); } - sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, { - connectorId: config.connectorId, + sendFileChangedEvent({ + rootFolderId: config.rootFolderId, searchSpaceId: config.searchSpaceId, folderPath: config.path, + folderName: config.name, relativePath, fullPath: filePath, action, @@ -311,7 +333,6 @@ export async function removeWatchedFolder( stopWatcher(folderPath); - // Clean up persisted mtime map for this folder mtimeMaps.delete(folderPath); const ms = await getMtimeStore(); ms.delete(folderPath); diff --git a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts index 8f65aa633..7c190db10 100644 --- a/surfsense_desktop/src/preload.ts +++ b/surfsense_desktop/src/preload.ts @@ -44,4 +44,5 @@ contextBridge.exposeInMainWorld('electronAPI', { }, pauseWatcher: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_PAUSE), resumeWatcher: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_RESUME), + signalRendererReady: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_RENDERER_READY), }); From 493d720b891cf6ef478223d2645ba9e4b9504ab6 Mon Sep 17 00:00:00 2001 From: 
Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 22:21:01 +0530 Subject: [PATCH 046/202] refactor: remove Local Folder connector references and enhance folder management features --- .../constants/connector-constants.ts | 8 - .../hooks/use-connector-dialog.ts | 28 --- .../utils/connector-document-mapping.ts | 1 - .../components/documents/FolderNode.tsx | 139 ++++++++----- .../components/documents/FolderTreeView.tsx | 9 + .../components/editor-panel/editor-panel.tsx | 18 +- .../layout/ui/sidebar/DocumentsSidebar.tsx | 186 ++++++++++++++---- 7 files changed, 257 insertions(+), 132 deletions(-) diff --git a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts index 3f7d90cd8..2e92f637b 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/constants/connector-constants.ts @@ -184,14 +184,6 @@ export const OTHER_CONNECTORS = [ connectorType: EnumConnectorName.OBSIDIAN_CONNECTOR, selfHostedOnly: true, }, - { - id: "local-folder-connector", - title: "Local Folder", - description: "Watch and sync local folders (desktop only)", - connectorType: EnumConnectorName.LOCAL_FOLDER_CONNECTOR, - selfHostedOnly: true, - desktopOnly: true, - }, ] as const; // Composio Connectors - Individual entries for each supported toolkit diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 2404b8eb5..6543bbd72 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -586,23 +586,6 @@ export const useConnectorDialog = () => { }, }); - // Register folder watcher in 
Electron for real-time sync - if ( - currentConnectorType === EnumConnectorName.LOCAL_FOLDER_CONNECTOR && - window.electronAPI?.addWatchedFolder - ) { - const cfg = connector.config || {}; - await window.electronAPI.addWatchedFolder({ - path: cfg.folder_path as string, - name: cfg.folder_name as string, - excludePatterns: (cfg.exclude_patterns as string[]) || [], - fileExtensions: (cfg.file_extensions as string[] | null) ?? null, - connectorId: connector.id, - searchSpaceId: Number(searchSpaceId), - active: true, - }); - } - const successMessage = currentConnectorType === "MCP_CONNECTOR" ? `${connector.name} added successfully` @@ -1207,17 +1190,6 @@ export const useConnectorDialog = () => { id: editingConnector.id, }); - // Unregister folder watcher in Electron when removing a Local Folder connector - if ( - editingConnector.connector_type === EnumConnectorName.LOCAL_FOLDER_CONNECTOR && - window.electronAPI?.removeWatchedFolder && - editingConnector.config?.folder_path - ) { - await window.electronAPI.removeWatchedFolder( - editingConnector.config.folder_path as string - ); - } - // Track connector deleted event trackConnectorDeleted( Number(searchSpaceId), diff --git a/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts b/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts index dd5978002..f924bb15f 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/utils/connector-document-mapping.ts @@ -29,7 +29,6 @@ export const CONNECTOR_TO_DOCUMENT_TYPE: Record = { BOOKSTACK_CONNECTOR: "BOOKSTACK_CONNECTOR", CIRCLEBACK_CONNECTOR: "CIRCLEBACK", OBSIDIAN_CONNECTOR: "OBSIDIAN_CONNECTOR", - LOCAL_FOLDER_CONNECTOR: "LOCAL_FOLDER_FILE", // Special mappings (connector type differs from document type) GOOGLE_DRIVE_CONNECTOR: "GOOGLE_DRIVE_FILE", diff --git 
a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx index 6a36f724f..1521c06fe 100644 --- a/surfsense_web/components/documents/FolderNode.tsx +++ b/surfsense_web/components/documents/FolderNode.tsx @@ -3,12 +3,15 @@ import { ChevronDown, ChevronRight, + Eye, + EyeOff, Folder, FolderOpen, FolderPlus, MoreHorizontal, Move, PenLine, + RefreshCw, Trash2, } from "lucide-react"; import React, { useCallback, useEffect, useRef, useState } from "react"; @@ -70,6 +73,9 @@ interface FolderNodeProps { disabledDropIds?: Set; contextMenuOpen?: boolean; onContextMenuOpenChange?: (open: boolean) => void; + isWatched?: boolean; + onRescan?: (folder: FolderDisplay) => void; + onStopWatching?: (folder: FolderDisplay) => void; } function getDropZone( @@ -107,6 +113,9 @@ export const FolderNode = React.memo(function FolderNode({ disabledDropIds, contextMenuOpen, onContextMenuOpenChange, + isWatched, + onRescan, + onStopWatching, }: FolderNodeProps) { const [renameValue, setRenameValue] = useState(folder.name); const inputRef = useRef(null); @@ -307,73 +316,107 @@ export const FolderNode = React.memo(function FolderNode({ - + + {isWatched && onRescan && ( { e.stopPropagation(); - onCreateSubfolder(folder.id); + onRescan(folder); }} > - - New subfolder + + Re-scan + )} + {isWatched && onStopWatching && ( { e.stopPropagation(); - startRename(); + onStopWatching(folder); }} > - - Rename + + Stop watching - { - e.stopPropagation(); - onMove(folder); - }} - > - - Move to... - - { - e.stopPropagation(); - onDelete(folder); - }} - > - - Delete - - + )} + { + e.stopPropagation(); + onCreateSubfolder(folder.id); + }} + > + + New subfolder + + { + e.stopPropagation(); + startRename(); + }} + > + + Rename + + { + e.stopPropagation(); + onMove(folder); + }} + > + + Move to... + + { + e.stopPropagation(); + onDelete(folder); + }} + > + + Delete + + )}
- {!isRenaming && contextMenuOpen && ( - - onCreateSubfolder(folder.id)}> - - New subfolder + {!isRenaming && contextMenuOpen && ( + + {isWatched && onRescan && ( + onRescan(folder)}> + + Re-scan - startRename()}> - - Rename + )} + {isWatched && onStopWatching && ( + onStopWatching(folder)}> + + Stop watching - onMove(folder)}> - - Move to... - - onDelete(folder)} - > - - Delete - - - )} + )} + onCreateSubfolder(folder.id)}> + + New subfolder + + startRename()}> + + Rename + + onMove(folder)}> + + Move to... + + onDelete(folder)} + > + + Delete + + + )} ); }); diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx index 7695923e3..5945edccb 100644 --- a/surfsense_web/components/documents/FolderTreeView.tsx +++ b/surfsense_web/components/documents/FolderTreeView.tsx @@ -40,6 +40,9 @@ interface FolderTreeViewProps { targetFolderId: number | null ) => void; onReorderFolder?: (folderId: number, beforePos: string | null, afterPos: string | null) => void; + watchedFolderIds?: Set; + onRescanFolder?: (folder: FolderDisplay) => void; + onStopWatchingFolder?: (folder: FolderDisplay) => void; } function groupBy(items: T[], keyFn: (item: T) => string | number): Record { @@ -73,6 +76,9 @@ export function FolderTreeView({ searchQuery, onDropIntoFolder, onReorderFolder, + watchedFolderIds, + onRescanFolder, + onStopWatchingFolder, }: FolderTreeViewProps) { const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? "root"), [folders]); @@ -204,6 +210,9 @@ export function FolderTreeView({ siblingPositions={siblingPositions} contextMenuOpen={openContextMenuId === `folder-${f.id}`} onContextMenuOpenChange={(open) => setOpenContextMenuId(open ? 
`folder-${f.id}` : null)} + isWatched={watchedFolderIds?.has(f.id)} + onRescan={onRescanFolder} + onStopWatching={onStopWatchingFolder} /> ); diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx index 802a5ffc3..a1195ef33 100644 --- a/surfsense_web/components/editor-panel/editor-panel.tsx +++ b/surfsense_web/components/editor-panel/editor-panel.tsx @@ -6,6 +6,7 @@ import dynamic from "next/dynamic"; import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom"; +import { VersionHistoryButton } from "@/components/documents/version-history"; import { MarkdownViewer } from "@/components/markdown-viewer"; import { Button } from "@/components/ui/button"; import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer"; @@ -180,12 +181,16 @@ export function EditorPanelContent({ return ( <>
-
-

{displayTitle}

- {isEditableType && editedMarkdown !== null && ( -

Unsaved changes

- )} -
+
+

{displayTitle}

+ {isEditableType && editedMarkdown !== null && ( +

Unsaved changes

+ )} +
+
+ {editorDoc?.document_type && ( + + )} {onClose && ( )}
+
{isLoading ? ( diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index d880524bd..202d170d9 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -40,6 +40,7 @@ import { getConnectorIcon } from "@/contracts/enums/connectorIcons"; import type { DocumentTypeEnum } from "@/contracts/types/document.types"; import { useDebouncedValue } from "@/hooks/use-debounced-value"; import { useMediaQuery } from "@/hooks/use-media-query"; +import { documentsApiService } from "@/lib/apis/documents-api.service"; import { foldersApiService } from "@/lib/apis/folders-api.service"; import { authenticatedFetch } from "@/lib/auth-utils"; import { queries } from "@/zero/queries/index"; @@ -92,6 +93,24 @@ export function DocumentsSidebar({ const [search, setSearch] = useState(""); const debouncedSearch = useDebouncedValue(search, 250); const [activeTypes, setActiveTypes] = useState([]); + const [watchedFolderIds, setWatchedFolderIds] = useState>(new Set()); + + useEffect(() => { + const api = typeof window !== "undefined" ? 
window.electronAPI : null; + if (!api?.getWatchedFolders) return; + + async function loadWatchedIds() { + const folders = await api!.getWatchedFolders(); + const ids = new Set( + folders + .filter((f) => f.rootFolderId != null) + .map((f) => f.rootFolderId as number) + ); + setWatchedFolderIds(ids); + } + + loadWatchedIds(); + }, []); const { mutateAsync: deleteDocumentMutation } = useAtomValue(deleteDocumentMutationAtom); const [sidebarDocs, setSidebarDocs] = useAtom(sidebarSelectedDocumentsAtom); @@ -223,6 +242,87 @@ export function DocumentsSidebar({ [createFolderParentId, searchSpaceId, setExpandedFolderMap] ); + const isElectron = typeof window !== "undefined" && !!window.electronAPI; + + const handleWatchFolder = useCallback(async () => { + const api = window.electronAPI; + if (!api) return; + + const folderPath = await api.selectFolder(); + if (!folderPath) return; + + const folderName = folderPath.split("/").pop() || folderPath.split("\\").pop() || folderPath; + + try { + const result = await documentsApiService.folderIndex(searchSpaceId, { + folder_path: folderPath, + folder_name: folderName, + search_space_id: searchSpaceId, + }); + + const rootFolderId = (result as { root_folder_id?: number })?.root_folder_id ?? 
null; + + await api.addWatchedFolder({ + path: folderPath, + name: folderName, + excludePatterns: [".git", "node_modules", "__pycache__", ".DS_Store", ".obsidian", ".trash"], + fileExtensions: null, + rootFolderId, + searchSpaceId, + active: true, + }); + + toast.success(`Watching folder: ${folderName}`); + } catch (err) { + toast.error((err as Error)?.message || "Failed to watch folder"); + } + }, [searchSpaceId]); + + const handleRescanFolder = useCallback( + async (folder: FolderDisplay) => { + const api = window.electronAPI; + if (!api) return; + + const watchedFolders = await api.getWatchedFolders(); + const matched = watchedFolders.find((wf) => wf.rootFolderId === folder.id); + if (!matched) { + toast.error("This folder is not being watched"); + return; + } + + try { + await documentsApiService.folderIndex(searchSpaceId, { + folder_path: matched.path, + folder_name: matched.name, + search_space_id: searchSpaceId, + root_folder_id: folder.id, + }); + toast.success(`Re-scanning folder: ${matched.name}`); + } catch (err) { + toast.error((err as Error)?.message || "Failed to re-scan folder"); + } + }, + [searchSpaceId] + ); + + const handleStopWatching = useCallback( + async (folder: FolderDisplay) => { + const api = window.electronAPI; + if (!api) return; + + const watchedFolders = await api.getWatchedFolders(); + const matched = watchedFolders.find((wf) => wf.rootFolderId === folder.id); + if (!matched) { + toast.error("This folder is not being watched"); + return; + } + + await api.removeWatchedFolder(matched.path); + toast.success(`Stopped watching: ${matched.name}`); + }, + [] + ); + const handleRenameFolder = useCallback(async (folder: FolderDisplay, newName: string) => { try { await foldersApiService.updateFolder(folder.id, { name: newName }); @@ -641,14 +741,15 @@ export function DocumentsSidebar({
- handleCreateFolder(null)} - /> + handleCreateFolder(null)} + onWatchFolder={isElectron ? handleWatchFolder : undefined} + />
{deletableSelectedIds.length > 0 && ( @@ -666,39 +767,42 @@ export function DocumentsSidebar({ )} { - openEditorPanel({ - documentId: doc.id, - searchSpaceId, - title: doc.title, - }); - }} - onEditDocument={(doc) => { - openEditorPanel({ - documentId: doc.id, - searchSpaceId, - title: doc.title, - }); - }} - onDeleteDocument={(doc) => handleDeleteDocument(doc.id)} - onMoveDocument={handleMoveDocument} - onExportDocument={handleExportDocument} - activeTypes={activeTypes} - onDropIntoFolder={handleDropIntoFolder} - onReorderFolder={handleReorderFolder} - /> + folders={treeFolders} + documents={searchFilteredDocuments} + expandedIds={expandedIds} + onToggleExpand={toggleFolderExpand} + mentionedDocIds={mentionedDocIds} + onToggleChatMention={handleToggleChatMention} + onToggleFolderSelect={handleToggleFolderSelect} + onRenameFolder={handleRenameFolder} + onDeleteFolder={handleDeleteFolder} + onMoveFolder={handleMoveFolder} + onCreateFolder={handleCreateFolder} + searchQuery={debouncedSearch.trim() || undefined} + onPreviewDocument={(doc) => { + openEditorPanel({ + documentId: doc.id, + searchSpaceId, + title: doc.title, + }); + }} + onEditDocument={(doc) => { + openEditorPanel({ + documentId: doc.id, + searchSpaceId, + title: doc.title, + }); + }} + onDeleteDocument={(doc) => handleDeleteDocument(doc.id)} + onMoveDocument={handleMoveDocument} + onExportDocument={handleExportDocument} + activeTypes={activeTypes} + onDropIntoFolder={handleDropIntoFolder} + onReorderFolder={handleReorderFolder} + watchedFolderIds={watchedFolderIds} + onRescanFolder={handleRescanFolder} + onStopWatchingFolder={handleStopWatching} + />
Date: Thu, 2 Apr 2026 22:21:16 +0530 Subject: [PATCH 047/202] refactor: completely remove Local Folder connector references and update folder sync logic --- surfsense_web/contracts/enums/connector.ts | 1 - .../contracts/enums/connectorIcons.tsx | 3 - .../contracts/types/connector.types.ts | 1 - surfsense_web/hooks/use-folder-sync.ts | 62 ++++++++++++++----- .../lib/apis/connectors-api.service.ts | 11 ---- .../lib/apis/documents-api.service.ts | 8 +++ surfsense_web/lib/connectors/utils.ts | 1 - surfsense_web/types/window.d.ts | 8 ++- 8 files changed, 60 insertions(+), 35 deletions(-) diff --git a/surfsense_web/contracts/enums/connector.ts b/surfsense_web/contracts/enums/connector.ts index ecf96d88e..501f5d9a3 100644 --- a/surfsense_web/contracts/enums/connector.ts +++ b/surfsense_web/contracts/enums/connector.ts @@ -25,7 +25,6 @@ export enum EnumConnectorName { YOUTUBE_CONNECTOR = "YOUTUBE_CONNECTOR", CIRCLEBACK_CONNECTOR = "CIRCLEBACK_CONNECTOR", OBSIDIAN_CONNECTOR = "OBSIDIAN_CONNECTOR", - LOCAL_FOLDER_CONNECTOR = "LOCAL_FOLDER_CONNECTOR", DROPBOX_CONNECTOR = "DROPBOX_CONNECTOR", MCP_CONNECTOR = "MCP_CONNECTOR", COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", diff --git a/surfsense_web/contracts/enums/connectorIcons.tsx b/surfsense_web/contracts/enums/connectorIcons.tsx index f7378b74b..2e609b060 100644 --- a/surfsense_web/contracts/enums/connectorIcons.tsx +++ b/surfsense_web/contracts/enums/connectorIcons.tsx @@ -3,7 +3,6 @@ import { BookOpen, File, FileText, - FolderSync, Globe, Microscope, Search, @@ -76,8 +75,6 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas return Circleback; case EnumConnectorName.MCP_CONNECTOR: return MCP; - case EnumConnectorName.LOCAL_FOLDER_CONNECTOR: - return ; case EnumConnectorName.OBSIDIAN_CONNECTOR: return Obsidian; case EnumConnectorName.COMPOSIO_GOOGLE_DRIVE_CONNECTOR: diff --git a/surfsense_web/contracts/types/connector.types.ts 
b/surfsense_web/contracts/types/connector.types.ts index 269941375..b83e05dcc 100644 --- a/surfsense_web/contracts/types/connector.types.ts +++ b/surfsense_web/contracts/types/connector.types.ts @@ -30,7 +30,6 @@ export const searchSourceConnectorTypeEnum = z.enum([ "DROPBOX_CONNECTOR", "MCP_CONNECTOR", "OBSIDIAN_CONNECTOR", - "LOCAL_FOLDER_CONNECTOR", "COMPOSIO_GOOGLE_DRIVE_CONNECTOR", "COMPOSIO_GMAIL_CONNECTOR", "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR", diff --git a/surfsense_web/hooks/use-folder-sync.ts b/surfsense_web/hooks/use-folder-sync.ts index a35faf98f..fcfb2814e 100644 --- a/surfsense_web/hooks/use-folder-sync.ts +++ b/surfsense_web/hooks/use-folder-sync.ts @@ -1,41 +1,73 @@ "use client"; import { useEffect, useRef } from "react"; -import { connectorsApiService } from "@/lib/apis/connectors-api.service"; +import { documentsApiService } from "@/lib/apis/documents-api.service"; + +interface FileChangedEvent { + rootFolderId: number | null; + searchSpaceId: number; + folderPath: string; + folderName: string; + relativePath: string; + fullPath: string; + action: string; + timestamp: number; +} const DEBOUNCE_MS = 2000; export function useFolderSync() { - const pendingRef = useRef>>(new Map()); + const queueRef = useRef([]); + const processingRef = useRef(false); + const debounceTimers = useRef>>(new Map()); + + async function processQueue() { + if (processingRef.current) return; + processingRef.current = true; + while (queueRef.current.length > 0) { + const event = queueRef.current.shift()!; + try { + await documentsApiService.folderIndexFile(event.searchSpaceId, { + folder_path: event.folderPath, + folder_name: event.folderName, + search_space_id: event.searchSpaceId, + target_file_path: event.fullPath, + }); + } catch (err) { + console.error("[FolderSync] Failed to trigger re-index:", err); + } + } + processingRef.current = false; + } useEffect(() => { const api = typeof window !== "undefined" ? 
window.electronAPI : null; if (!api?.onFileChanged) return; - const cleanup = api.onFileChanged((event) => { - const key = `${event.connectorId}:${event.fullPath}`; + // Signal to main process that the renderer is ready to receive events + api.signalRendererReady?.(); - const existing = pendingRef.current.get(key); + const cleanup = api.onFileChanged((event: FileChangedEvent) => { + const key = `${event.folderPath}:${event.fullPath}`; + + const existing = debounceTimers.current.get(key); if (existing) clearTimeout(existing); - const timeout = setTimeout(async () => { - pendingRef.current.delete(key); - try { - await connectorsApiService.indexFile(event.connectorId, event.fullPath); - } catch (err) { - console.error("[FolderSync] Failed to trigger re-index:", err); - } + const timeout = setTimeout(() => { + debounceTimers.current.delete(key); + queueRef.current.push(event); + processQueue(); }, DEBOUNCE_MS); - pendingRef.current.set(key, timeout); + debounceTimers.current.set(key, timeout); }); return () => { cleanup(); - for (const timeout of pendingRef.current.values()) { + for (const timeout of debounceTimers.current.values()) { clearTimeout(timeout); } - pendingRef.current.clear(); + debounceTimers.current.clear(); }; }, []); } diff --git a/surfsense_web/lib/apis/connectors-api.service.ts b/surfsense_web/lib/apis/connectors-api.service.ts index f2722df70..7b94b3746 100644 --- a/surfsense_web/lib/apis/connectors-api.service.ts +++ b/surfsense_web/lib/apis/connectors-api.service.ts @@ -405,17 +405,6 @@ class ConnectorsApiService { ); }; - // ============================================================================= - // Local Folder Connector Methods - // ============================================================================= - - indexFile = async (connectorId: number, filePath: string) => { - return baseApiService.post( - `/api/v1/search-source-connectors/${connectorId}/index-file`, - undefined, - { body: { file_path: filePath } } - ); - }; } export type 
{ SlackChannel, DiscordChannel }; diff --git a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts index d4a80f8a0..c77cd6848 100644 --- a/surfsense_web/lib/apis/documents-api.service.ts +++ b/surfsense_web/lib/apis/documents-api.service.ts @@ -395,6 +395,14 @@ class DocumentsApiService { ); }; + folderIndex = async (searchSpaceId: number, body: { folder_path: string; folder_name: string; search_space_id: number; exclude_patterns?: string[]; file_extensions?: string[]; root_folder_id?: number; enable_summary?: boolean }) => { + return baseApiService.post(`/api/v1/documents/folder-index`, undefined, { body }); + }; + + folderIndexFile = async (searchSpaceId: number, body: { folder_path: string; folder_name: string; search_space_id: number; target_file_path: string; enable_summary?: boolean }) => { + return baseApiService.post(`/api/v1/documents/folder-index-file`, undefined, { body }); + }; + /** * Delete a document */ diff --git a/surfsense_web/lib/connectors/utils.ts b/surfsense_web/lib/connectors/utils.ts index 6ce78be67..90f7f5d21 100644 --- a/surfsense_web/lib/connectors/utils.ts +++ b/surfsense_web/lib/connectors/utils.ts @@ -30,7 +30,6 @@ export const getConnectorTypeDisplay = (type: string): string => { YOUTUBE_CONNECTOR: "YouTube", CIRCLEBACK_CONNECTOR: "Circleback", OBSIDIAN_CONNECTOR: "Obsidian", - LOCAL_FOLDER_CONNECTOR: "Local Folder", DROPBOX_CONNECTOR: "Dropbox", MCP_CONNECTOR: "MCP Server", }; diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts index 921449b41..b399664d6 100644 --- a/surfsense_web/types/window.d.ts +++ b/surfsense_web/types/window.d.ts @@ -5,15 +5,16 @@ interface WatchedFolderConfig { name: string; excludePatterns: string[]; fileExtensions: string[] | null; - connectorId: number; + rootFolderId: number | null; searchSpaceId: number; active: boolean; } interface FolderSyncFileChangedEvent { - connectorId: number; + rootFolderId: number | null; searchSpaceId: 
number; folderPath: string; + folderName: string; relativePath: string; fullPath: string; action: "add" | "change" | "unlink"; @@ -21,7 +22,7 @@ interface FolderSyncFileChangedEvent { } interface FolderSyncWatcherReadyEvent { - connectorId: number; + rootFolderId: number | null; folderPath: string; } @@ -49,6 +50,7 @@ interface ElectronAPI { onWatcherReady: (callback: (data: FolderSyncWatcherReadyEvent) => void) => () => void; pauseWatcher: () => Promise; resumeWatcher: () => Promise; + signalRendererReady: () => Promise; } declare global { From 22ee5c99cc9a656a3c5f0afae9c100874144e1b6 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 22:21:31 +0530 Subject: [PATCH 048/202] refactor: remove Local Folder connector and related tasks, implement new folder indexing endpoints --- surfsense_backend/app/db.py | 1 - .../app/routes/documents_routes.py | 143 ++++++++++++++++ .../routes/search_source_connectors_routes.py | 144 ---------------- .../app/tasks/celery_tasks/connector_tasks.py | 46 ------ .../app/tasks/celery_tasks/document_tasks.py | 66 ++++++++ .../app/tasks/connector_indexers/__init__.py | 2 - .../local_folder_indexer.py | 155 ++++++------------ .../tests/integration/conftest.py | 19 --- .../test_local_folder_pipeline.py | 126 +++++++------- 9 files changed, 326 insertions(+), 376 deletions(-) diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 25045e84a..1a4d3ea06 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -110,7 +110,6 @@ class SearchSourceConnectorType(StrEnum): COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR" COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" - LOCAL_FOLDER_CONNECTOR = "LOCAL_FOLDER_CONNECTOR" class PodcastStatus(StrEnum): diff --git a/surfsense_backend/app/routes/documents_routes.py 
b/surfsense_backend/app/routes/documents_routes.py index 2d999eae3..d7974f9ff 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -2,6 +2,7 @@ import asyncio from fastapi import APIRouter, Depends, Form, HTTPException, UploadFile +from pydantic import BaseModel as PydanticBaseModel from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.orm import selectinload @@ -11,6 +12,7 @@ from app.db import ( Document, DocumentType, DocumentVersion, + Folder, Permission, SearchSpace, SearchSpaceMembership, @@ -1258,3 +1260,144 @@ async def restore_document_version( "document_id": document_id, "restored_version": version_number, } + + +# ===== Local folder indexing endpoints ===== + + +class FolderIndexRequest(PydanticBaseModel): + folder_path: str + folder_name: str + search_space_id: int + exclude_patterns: list[str] | None = None + file_extensions: list[str] | None = None + root_folder_id: int | None = None + enable_summary: bool = False + + +class FolderIndexFileRequest(PydanticBaseModel): + folder_path: str + folder_name: str + search_space_id: int + target_file_path: str + enable_summary: bool = False + + +@router.post("/documents/folder-index") +async def folder_index( + request: FolderIndexRequest, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Full-scan index of a local folder. Creates the root Folder row synchronously + and dispatches the heavy indexing work to a Celery task. + Returns the root_folder_id so the desktop can persist it. 
+ """ + from app.config import config as app_config + + if not app_config.is_self_hosted(): + raise HTTPException( + status_code=400, + detail="Local folder indexing is only available in self-hosted mode", + ) + + await check_permission( + session, + user, + request.search_space_id, + Permission.DOCUMENTS_CREATE.value, + "You don't have permission to create documents in this search space", + ) + + root_folder_id = request.root_folder_id + if root_folder_id: + existing = ( + await session.execute( + select(Folder).where(Folder.id == root_folder_id) + ) + ).scalar_one_or_none() + if not existing: + root_folder_id = None + + if not root_folder_id: + root_folder = Folder( + name=request.folder_name, + search_space_id=request.search_space_id, + created_by_id=str(user.id), + position="a0", + ) + session.add(root_folder) + await session.flush() + root_folder_id = root_folder.id + await session.commit() + + from app.tasks.celery_tasks.document_tasks import index_local_folder_task + + index_local_folder_task.delay( + search_space_id=request.search_space_id, + user_id=str(user.id), + folder_path=request.folder_path, + folder_name=request.folder_name, + exclude_patterns=request.exclude_patterns, + file_extensions=request.file_extensions, + root_folder_id=root_folder_id, + enable_summary=request.enable_summary, + ) + + return { + "message": "Folder indexing started", + "status": "processing", + "root_folder_id": root_folder_id, + } + + +@router.post("/documents/folder-index-file") +async def folder_index_file( + request: FolderIndexFileRequest, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Index a single file within a watched folder (chokidar trigger). + Validates that target_file_path is under folder_path. 
+ """ + from app.config import config as app_config + + if not app_config.is_self_hosted(): + raise HTTPException( + status_code=400, + detail="Local folder indexing is only available in self-hosted mode", + ) + + await check_permission( + session, + user, + request.search_space_id, + Permission.DOCUMENTS_CREATE.value, + "You don't have permission to create documents in this search space", + ) + + from pathlib import Path + + try: + Path(request.target_file_path).relative_to(request.folder_path) + except ValueError: + raise HTTPException( + status_code=400, + detail="target_file_path must be inside folder_path", + ) + + from app.tasks.celery_tasks.document_tasks import index_local_folder_task + + index_local_folder_task.delay( + search_space_id=request.search_space_id, + user_id=str(user.id), + folder_path=request.folder_path, + folder_name=request.folder_name, + target_file_path=request.target_file_path, + enable_summary=request.enable_summary, + ) + + return { + "message": "File indexing started", + "status": "processing", + } diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index 5ea88c418..f49ba2d5d 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -1170,24 +1170,6 @@ async def index_connector_content( ) response_message = "Obsidian vault indexing started in the background." 
- elif connector.connector_type == SearchSourceConnectorType.LOCAL_FOLDER_CONNECTOR: - from app.config import config as app_config - from app.tasks.celery_tasks.connector_tasks import index_local_folder_task - - if not app_config.is_self_hosted(): - raise HTTPException( - status_code=400, - detail="Local folder connector is only available in self-hosted mode", - ) - - logger.info( - f"Triggering local folder indexing for connector {connector_id} into search space {search_space_id}" - ) - index_local_folder_task.delay( - connector_id, search_space_id, str(user.id), indexing_from, indexing_to - ) - response_message = "Local folder indexing started in the background." - elif ( connector.connector_type == SearchSourceConnectorType.COMPOSIO_GOOGLE_DRIVE_CONNECTOR @@ -1320,76 +1302,6 @@ async def index_connector_content( ) from e -class IndexFileRequest(BaseModel): - file_path: str = Field(..., description="Absolute path to the file to index") - - -@router.post( - "/search-source-connectors/{connector_id}/index-file", - response_model=dict[str, Any], -) -async def index_single_file( - connector_id: int, - body: IndexFileRequest, - session: AsyncSession = Depends(get_async_session), - user: User = Depends(current_active_user), -): - """Index a single file from a local folder connector (chokidar real-time trigger).""" - from app.config import config as app_config - from app.tasks.celery_tasks.connector_tasks import index_local_folder_task - - if not app_config.is_self_hosted(): - raise HTTPException( - status_code=400, - detail="Local folder connector is only available in self-hosted mode", - ) - - result = await session.execute( - select(SearchSourceConnector).filter( - SearchSourceConnector.id == connector_id, - SearchSourceConnector.connector_type - == SearchSourceConnectorType.LOCAL_FOLDER_CONNECTOR, - ) - ) - connector = result.scalars().first() - if not connector: - raise HTTPException(status_code=404, detail="Local folder connector not found") - - await 
check_permission(session, user, connector.search_space_id, Permission.CONNECTORS_UPDATE.value) - - folder_path = connector.config.get("folder_path", "") - - # Security: resolve symlinks and verify the file is inside folder_path - try: - resolved_file = os.path.realpath(body.file_path) - resolved_folder = os.path.realpath(folder_path) - if not resolved_file.startswith(resolved_folder + os.sep) and resolved_file != resolved_folder: - raise HTTPException( - status_code=403, - detail="File path is outside the configured folder", - ) - except (OSError, ValueError): - raise HTTPException( - status_code=403, - detail="Invalid file path", - ) - - index_local_folder_task.delay( - connector_id, - connector.search_space_id, - str(user.id), - None, - None, - target_file_path=resolved_file, - ) - - return { - "message": "Single file indexing started", - "connector_id": connector_id, - "file_path": body.file_path, - } - - async def _update_connector_timestamp_by_id(session: AsyncSession, connector_id: int): """ Update the last_indexed_at timestamp for a connector by its ID. 
@@ -3166,62 +3078,6 @@ async def run_obsidian_indexing( ) -async def run_local_folder_indexing_with_new_session( - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str, - end_date: str, - target_file_path: str | None = None, -): - """Wrapper to run local folder indexing with its own database session.""" - logger.info( - f"Background task started: Indexing local folder connector {connector_id} into space {search_space_id}" - ) - async with async_session_maker() as session: - await run_local_folder_indexing( - session, connector_id, search_space_id, user_id, start_date, end_date, - target_file_path=target_file_path, - ) - logger.info(f"Background task finished: Indexing local folder connector {connector_id}") - - -async def run_local_folder_indexing( - session: AsyncSession, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str, - end_date: str, - target_file_path: str | None = None, -): - """Background task to run local folder indexing.""" - from app.tasks.connector_indexers import index_local_folder - - await _run_indexing_with_notifications( - session=session, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - start_date=start_date, - end_date=end_date, - indexing_function=lambda session, connector_id, search_space_id, user_id, - start_date, end_date, update_last_indexed, on_heartbeat_callback: index_local_folder( - session=session, - connector_id=connector_id, - search_space_id=search_space_id, - user_id=user_id, - start_date=start_date, - end_date=end_date, - update_last_indexed=update_last_indexed, - on_heartbeat_callback=on_heartbeat_callback, - target_file_path=target_file_path, - ), - update_timestamp_func=_update_connector_timestamp_by_id, - supports_heartbeat_callback=True, - ) - - async def run_composio_indexing_with_new_session( connector_id: int, search_space_id: int, diff --git a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py 
b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py index 9ff578ad2..57475c9fd 100644 --- a/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/connector_tasks.py @@ -926,52 +926,6 @@ async def _index_obsidian_vault( ) -@celery_app.task(name="index_local_folder", bind=True) -def index_local_folder_task( - self, - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str = None, - end_date: str = None, - target_file_path: str = None, -): - """Celery task to index a local folder.""" - import asyncio - - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - - try: - loop.run_until_complete( - _index_local_folder( - connector_id, search_space_id, user_id, start_date, end_date, target_file_path - ) - ) - finally: - loop.close() - - -async def _index_local_folder( - connector_id: int, - search_space_id: int, - user_id: str, - start_date: str = None, - end_date: str = None, - target_file_path: str = None, -): - """Index local folder with new session.""" - from app.routes.search_source_connectors_routes import ( - run_local_folder_indexing, - ) - - async with get_celery_session_maker()() as session: - await run_local_folder_indexing( - session, connector_id, search_space_id, user_id, start_date, end_date, - target_file_path=target_file_path, - ) - - @celery_app.task(name="index_composio_connector", bind=True) def index_composio_connector_task( self, diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index 662b41f2a..110f3deee 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -10,6 +10,7 @@ from app.config import config from app.services.notification_service import NotificationService from app.services.task_logging_service import TaskLoggingService from app.tasks.celery_tasks import get_celery_session_maker 
+from app.tasks.connector_indexers.local_folder_indexer import index_local_folder from app.tasks.document_processors import ( add_extension_received_document, add_youtube_video_document, @@ -1243,3 +1244,68 @@ async def _process_circleback_meeting( heartbeat_task.cancel() if notification: _stop_heartbeat(notification.id) + + +# ===== Local folder indexing task ===== + + +@celery_app.task(name="index_local_folder", bind=True) +def index_local_folder_task( + self, + search_space_id: int, + user_id: str, + folder_path: str, + folder_name: str, + exclude_patterns: list[str] | None = None, + file_extensions: list[str] | None = None, + root_folder_id: int | None = None, + enable_summary: bool = False, + target_file_path: str | None = None, +): + """Celery task to index a local folder. Config is passed directly — no connector row.""" + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + loop.run_until_complete( + _index_local_folder_async( + search_space_id=search_space_id, + user_id=user_id, + folder_path=folder_path, + folder_name=folder_name, + exclude_patterns=exclude_patterns, + file_extensions=file_extensions, + root_folder_id=root_folder_id, + enable_summary=enable_summary, + target_file_path=target_file_path, + ) + ) + finally: + loop.close() + + +async def _index_local_folder_async( + search_space_id: int, + user_id: str, + folder_path: str, + folder_name: str, + exclude_patterns: list[str] | None = None, + file_extensions: list[str] | None = None, + root_folder_id: int | None = None, + enable_summary: bool = False, + target_file_path: str | None = None, +): + """Run local folder indexing with a fresh DB session.""" + async with get_celery_session_maker()() as session: + await index_local_folder( + session=session, + search_space_id=search_space_id, + user_id=user_id, + folder_path=folder_path, + folder_name=folder_name, + exclude_patterns=exclude_patterns, + file_extensions=file_extensions, + root_folder_id=root_folder_id, + 
enable_summary=enable_summary, + target_file_path=target_file_path, + ) diff --git a/surfsense_backend/app/tasks/connector_indexers/__init__.py b/surfsense_backend/app/tasks/connector_indexers/__init__.py index 8e4ad69e5..1b032d54a 100644 --- a/surfsense_backend/app/tasks/connector_indexers/__init__.py +++ b/surfsense_backend/app/tasks/connector_indexers/__init__.py @@ -44,7 +44,6 @@ from .jira_indexer import index_jira_issues from .linear_indexer import index_linear_issues # Documentation and knowledge management -from .local_folder_indexer import index_local_folder from .luma_indexer import index_luma_events from .notion_indexer import index_notion_pages from .obsidian_indexer import index_obsidian_vault @@ -75,5 +74,4 @@ __all__ = [ # noqa: RUF022 # Communication platforms "index_slack_messages", "index_google_gmail_messages", - "index_local_folder", ] diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index fc7fdaf66..591914625 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -1,5 +1,5 @@ """ -Local folder connector indexer. +Local folder indexer. Indexes files from a local folder on disk. Supports: - Full-scan mode (startup reconciliation / manual trigger) @@ -8,7 +8,9 @@ Indexes files from a local folder on disk. Supports: - Document versioning via create_version_snapshot - ETL-based file parsing for binary formats (PDF, DOCX, images, audio, etc.) -Electron-only: all change detection is driven by chokidar in the desktop app. +Desktop-only: all change detection is driven by chokidar in the desktop app. +Config (folder_path, exclude_patterns, etc.) is passed in from the caller — +no connector row is read. 
""" import os @@ -17,10 +19,9 @@ from collections.abc import Awaitable, Callable from datetime import UTC, datetime from pathlib import Path -from sqlalchemy import delete, select +from sqlalchemy import select from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm.attributes import flag_modified from app.config import config from app.db import ( @@ -28,7 +29,6 @@ from app.db import ( DocumentStatus, DocumentType, Folder, - SearchSourceConnectorType, ) from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService @@ -45,11 +45,9 @@ from .base import ( build_document_metadata_string, check_document_by_unique_identifier, check_duplicate_document_by_hash, - get_connector_by_id, get_current_timestamp, logger, safe_set_chunks, - update_connector_last_indexed, ) PLAINTEXT_EXTENSIONS = frozenset({ @@ -131,12 +129,10 @@ def scan_folder( for dirpath, dirnames, filenames in os.walk(root): rel_dir = Path(dirpath).relative_to(root) - # Prune excluded directories in-place so os.walk skips them dirnames[:] = [ d for d in dirnames if d not in exclude_patterns ] - # Check if the current directory itself is excluded if any(part in exclude_patterns for part in rel_dir.parts): continue @@ -232,20 +228,18 @@ async def _mirror_folder_structure( folder_name: str, search_space_id: int, user_id: str, - connector_config: dict, - connector, + root_folder_id: int | None = None, exclude_patterns: list[str] | None = None, -) -> dict[str, int]: +) -> tuple[dict[str, int], int]: """Mirror the local filesystem directory structure into DB Folder rows. - Returns a mapping of relative_dir_path -> folder_id. - The empty string key ("") maps to the root folder. + Returns (mapping, root_folder_id) where mapping is + relative_dir_path -> folder_id. The empty string key maps to the root folder. 
""" root = Path(folder_path) if exclude_patterns is None: exclude_patterns = [] - # Collect all subdirectory paths relative to root subdirs: list[str] = [] for dirpath, dirnames, _ in os.walk(root): dirnames[:] = [d for d in dirnames if d not in exclude_patterns] @@ -256,13 +250,10 @@ async def _mirror_folder_structure( if rel_str: subdirs.append(rel_str) - # Sort by depth so parents are created before children subdirs.sort(key=lambda p: p.count(os.sep)) mapping: dict[str, int] = {} - # Get or create root folder - root_folder_id = connector_config.get("root_folder_id") if root_folder_id: existing = ( await session.execute( @@ -284,12 +275,8 @@ async def _mirror_folder_structure( session.add(root_folder) await session.flush() mapping[""] = root_folder.id - # Persist root_folder_id into connector config - connector_config["root_folder_id"] = root_folder.id - connector.config = {**connector.config, "root_folder_id": root_folder.id} - flag_modified(connector, "config") + root_folder_id = root_folder.id - # Create/reuse subdirectory Folder rows for rel_dir in subdirs: dir_parts = Path(rel_dir).parts dir_name = dir_parts[-1] @@ -322,7 +309,7 @@ async def _mirror_folder_structure( mapping[rel_dir] = new_folder.id await session.flush() - return mapping + return mapping, root_folder_id async def _cleanup_empty_folders( @@ -332,16 +319,11 @@ async def _cleanup_empty_folders( existing_dirs_on_disk: set[str], folder_mapping: dict[str, int], ) -> None: - """Delete Folder rows that are empty (no docs, no children) and no longer on disk. + """Delete Folder rows that are empty (no docs, no children) and no longer on disk.""" + from sqlalchemy import delete as sa_delete - Queries ALL folders under this search space (not just the current mapping) - so that stale folders from previous syncs are also cleaned up. 
- """ - # Build a reverse mapping from folder_id → rel_dir for known dirs id_to_rel: dict[int, str] = {fid: rel for rel, fid in folder_mapping.items() if rel} - # Also find any folders in the DB that are children of the root but NOT - # in the current mapping (stale from a previous sync). all_folders = ( await session.execute( select(Folder).where( @@ -351,7 +333,6 @@ async def _cleanup_empty_folders( ) ).scalars().all() - # Build candidates: folders not on disk that we might delete candidates: list[Folder] = [] for folder in all_folders: rel = id_to_rel.get(folder.id) @@ -359,8 +340,6 @@ async def _cleanup_empty_folders( continue candidates.append(folder) - # Sort deepest first (by name depth heuristic — folders with no children first) - # Repeat until no more deletions happen (cascading empty parents) changed = True while changed: changed = False @@ -384,57 +363,46 @@ async def _cleanup_empty_folders( remaining.append(folder) continue - await session.execute(delete(Folder).where(Folder.id == folder.id)) + await session.execute(sa_delete(Folder).where(Folder.id == folder.id)) changed = True candidates = remaining async def index_local_folder( session: AsyncSession, - connector_id: int, search_space_id: int, user_id: str, - start_date: str | None = None, - end_date: str | None = None, - update_last_indexed: bool = True, - on_heartbeat_callback: HeartbeatCallbackType | None = None, + folder_path: str, + folder_name: str, + exclude_patterns: list[str] | None = None, + file_extensions: list[str] | None = None, + root_folder_id: int | None = None, + enable_summary: bool = False, target_file_path: str | None = None, -) -> tuple[int, int, str | None]: + on_heartbeat_callback: HeartbeatCallbackType | None = None, +) -> tuple[int, int, int | None, str | None]: """Index files from a local folder. Supports two modes: - Full scan (target_file_path=None): walks entire folder, handles new/changed/deleted files. - Single-file (target_file_path set): processes only that file. 
- Returns (indexed_count, skipped_count, error_or_warning_message). + Returns (indexed_count, skipped_count, root_folder_id, error_or_warning_message). """ task_logger = TaskLoggingService(session, search_space_id) log_entry = await task_logger.log_task_start( task_name="local_folder_indexing", - source="connector_indexing_task", - message=f"Starting local folder indexing for connector {connector_id}", + source="local_folder_indexing_task", + message=f"Starting local folder indexing for {folder_name}", metadata={ - "connector_id": connector_id, + "folder_path": folder_path, "user_id": str(user_id), "target_file_path": target_file_path, }, ) try: - connector = await get_connector_by_id( - session, connector_id, SearchSourceConnectorType.LOCAL_FOLDER_CONNECTOR - ) - if not connector: - await task_logger.log_task_failure( - log_entry, - f"Connector {connector_id} not found", - "Connector not found", - {}, - ) - return 0, 0, f"Connector {connector_id} not found" - - folder_path = connector.config.get("folder_path") if not folder_path or not os.path.exists(folder_path): await task_logger.log_task_failure( log_entry, @@ -442,59 +410,54 @@ async def index_local_folder( "Folder not found", {}, ) - return 0, 0, f"Folder path missing or does not exist: {folder_path}" + return 0, 0, root_folder_id, f"Folder path missing or does not exist: {folder_path}" - folder_name = connector.config.get("folder_name") or os.path.basename(folder_path) - exclude_patterns = connector.config.get("exclude_patterns", DEFAULT_EXCLUDE_PATTERNS) - file_extensions = connector.config.get("file_extensions") # None = all + if exclude_patterns is None: + exclude_patterns = DEFAULT_EXCLUDE_PATTERNS # ==================================================================== # SINGLE-FILE MODE # ==================================================================== if target_file_path: - return await _index_single_file( + indexed, skipped, err = await _index_single_file( session=session, - connector=connector, - 
connector_id=connector_id, search_space_id=search_space_id, user_id=user_id, folder_path=folder_path, folder_name=folder_name, target_file_path=target_file_path, + enable_summary=enable_summary, task_logger=task_logger, log_entry=log_entry, - update_last_indexed=update_last_indexed, ) + return indexed, skipped, root_folder_id, err # ==================================================================== # FULL-SCAN MODE # ==================================================================== - # Phase 0: Mirror folder structure await task_logger.log_task_progress( log_entry, "Mirroring folder structure", {"stage": "folder_mirror"} ) - folder_mapping = await _mirror_folder_structure( + folder_mapping, root_folder_id = await _mirror_folder_structure( session=session, folder_path=folder_path, folder_name=folder_name, search_space_id=search_space_id, user_id=user_id, - connector_config=connector.config, - connector=connector, + root_folder_id=root_folder_id, exclude_patterns=exclude_patterns, ) await session.flush() - # Scan files on disk try: files = scan_folder(folder_path, file_extensions, exclude_patterns) except Exception as e: await task_logger.log_task_failure( log_entry, f"Failed to scan folder: {e}", "Scan error", {} ) - return 0, 0, f"Failed to scan folder: {e}" + return 0, 0, root_folder_id, f"Failed to scan folder: {e}" logger.info(f"Found {len(files)} files in folder") @@ -530,7 +493,6 @@ async def index_local_folder( ) if existing_document: - # Check mtime first (cheap) stored_mtime = (existing_document.document_metadata or {}).get("mtime") current_mtime = file_info["modified_at"].timestamp() @@ -542,7 +504,6 @@ async def index_local_folder( skipped_count += 1 continue - # mtime differs — read file and check content hash try: content, content_hash = await _compute_file_content_hash( file_path_abs, file_info["relative_path"], search_space_id @@ -553,7 +514,6 @@ async def index_local_folder( continue if existing_document.content_hash == content_hash: - # Content 
same, just update mtime in metadata meta = dict(existing_document.document_metadata or {}) meta["mtime"] = current_mtime existing_document.document_metadata = meta @@ -564,7 +524,6 @@ async def index_local_folder( skipped_count += 1 continue - # Content actually changed — snapshot version, queue for re-index await create_version_snapshot(session, existing_document) files_to_process.append( @@ -581,7 +540,6 @@ async def index_local_folder( ) continue - # New document — read content try: content, content_hash = await _compute_file_content_hash( file_path_abs, file_info["relative_path"], search_space_id @@ -595,7 +553,6 @@ async def index_local_folder( skipped_count += 1 continue - # Check for duplicate content from another connector with session.no_autoflush: dup = await check_duplicate_document_by_hash(session, content_hash) if dup: @@ -603,7 +560,6 @@ async def index_local_folder( skipped_count += 1 continue - # Determine folder_id for this file parent_dir = str(Path(relative_path).parent) if parent_dir == ".": parent_dir = "" @@ -616,17 +572,16 @@ async def index_local_folder( document_metadata={ "folder_name": folder_name, "file_path": relative_path, - "connector_id": connector_id, "mtime": file_info["modified_at"].timestamp(), }, content="Pending...", - content_hash=unique_identifier_hash, # Temp unique — updated in phase 2 + content_hash=unique_identifier_hash, unique_identifier_hash=unique_identifier_hash, embedding=None, status=DocumentStatus.pending(), updated_at=get_current_timestamp(), created_by_id=user_id, - connector_id=connector_id, + connector_id=None, folder_id=folder_id, ) session.add(document) @@ -655,16 +610,17 @@ async def index_local_folder( # ================================================================ # PHASE 1.5: Delete documents no longer on disk # ================================================================ - all_connector_docs = ( + all_folder_docs = ( await session.execute( select(Document).where( - Document.connector_id == 
connector_id, Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == search_space_id, + Document.folder_id.in_(list(folder_mapping.values())), ) ) ).scalars().all() - for doc in all_connector_docs: + for doc in all_folder_docs: if doc.unique_identifier_hash not in seen_unique_hashes: await session.delete(doc) @@ -709,7 +665,7 @@ async def index_local_folder( document_string = build_document_metadata_string(metadata_sections) summary_content = "" - if long_context_llm and connector.enable_summary: + if long_context_llm and enable_summary: doc_meta = { "folder_name": folder_name, "file_path": relative_path, @@ -721,7 +677,6 @@ async def index_local_folder( embedding = embed_text(document_string) chunks = await create_document_chunks(document_string) - # Determine folder_id parent_dir = str(Path(relative_path).parent) if parent_dir == ".": parent_dir = "" @@ -735,7 +690,6 @@ async def index_local_folder( document.document_metadata = { "folder_name": folder_name, "file_path": relative_path, - "connector_id": connector_id, "summary": summary_content, "mtime": file_info["modified_at"].timestamp(), } @@ -782,8 +736,6 @@ async def index_local_folder( session, root_fid, search_space_id, existing_dirs, folder_mapping ) - await update_connector_last_indexed(session, connector, update_last_indexed) - try: await session.commit() except Exception as e: @@ -802,7 +754,7 @@ async def index_local_folder( await task_logger.log_task_success( log_entry, - f"Completed local folder indexing for connector {connector_id}", + f"Completed local folder indexing for {folder_name}", { "indexed": indexed_count, "skipped": skipped_count, @@ -811,7 +763,7 @@ async def index_local_folder( }, ) - return indexed_count, skipped_count, warning_message + return indexed_count, skipped_count, root_folder_id, warning_message except SQLAlchemyError as e: logger.exception(f"Database error during local folder indexing: {e}") @@ -819,34 +771,31 @@ async def index_local_folder( 
await task_logger.log_task_failure( log_entry, f"DB error: {e}", "Database error", {} ) - return 0, 0, f"Database error: {e}" + return 0, 0, root_folder_id, f"Database error: {e}" except Exception as e: logger.exception(f"Error during local folder indexing: {e}") await task_logger.log_task_failure( log_entry, f"Error: {e}", "Unexpected error", {} ) - return 0, 0, str(e) + return 0, 0, root_folder_id, str(e) async def _index_single_file( session: AsyncSession, - connector, - connector_id: int, search_space_id: int, user_id: str, folder_path: str, folder_name: str, target_file_path: str, + enable_summary: bool, task_logger, log_entry, - update_last_indexed: bool = True, ) -> tuple[int, int, str | None]: """Process a single file (chokidar real-time trigger).""" try: full_path = Path(target_file_path) if not full_path.exists(): - # File was deleted — find and remove the document rel = str(full_path.relative_to(folder_path)) unique_id = f"{folder_name}:{rel}" uid_hash = generate_unique_identifier_hash( @@ -880,7 +829,6 @@ async def _index_single_file( if existing: if existing.content_hash == content_hash: - # Update mtime mtime = full_path.stat().st_mtime meta = dict(existing.document_metadata or {}) meta["mtime"] = mtime @@ -888,10 +836,8 @@ async def _index_single_file( await session.commit() return 0, 1, None - # Content changed — snapshot + re-index await create_version_snapshot(session, existing) - # Get LLM long_context_llm = await get_user_long_context_llm( session, user_id, search_space_id ) @@ -906,7 +852,7 @@ async def _index_single_file( document_string = build_document_metadata_string(metadata_sections) summary_content = "" - if long_context_llm and connector.enable_summary: + if long_context_llm and enable_summary: summary_content, _ = await generate_document_summary( document_string, long_context_llm, {"folder_name": folder_name, "file_path": rel_path} ) @@ -917,7 +863,6 @@ async def _index_single_file( doc_metadata = { "folder_name": folder_name, 
"file_path": rel_path, - "connector_id": connector_id, "summary": summary_content, "mtime": mtime, } @@ -946,16 +891,14 @@ async def _index_single_file( status=DocumentStatus.ready(), updated_at=get_current_timestamp(), created_by_id=user_id, - connector_id=connector_id, + connector_id=None, ) session.add(document) - # Set chunks await session.flush() for chunk in chunks: chunk.document_id = document.id session.add_all(chunks) - await update_connector_last_indexed(session, connector, update_last_indexed) await session.commit() await task_logger.log_task_success( diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py index 840246e2f..9c91011ae 100644 --- a/surfsense_backend/tests/integration/conftest.py +++ b/surfsense_backend/tests/integration/conftest.py @@ -168,22 +168,3 @@ def make_connector_document(db_connector, db_user): return _make -@pytest_asyncio.fixture -async def db_local_folder_connector( - db_session: AsyncSession, db_user: User, db_search_space: SearchSpace, tmp_path -) -> SearchSourceConnector: - connector = SearchSourceConnector( - name="Test Local Folder", - connector_type=SearchSourceConnectorType.LOCAL_FOLDER_CONNECTOR, - config={ - "folder_path": str(tmp_path), - "folder_name": "test-folder", - "exclude_patterns": [], - "file_extensions": None, - }, - search_space_id=db_search_space.id, - user_id=db_user.id, - ) - db_session.add(connector) - await db_session.flush() - return connector diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py index 988905f8f..e46d59a67 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py @@ -14,7 +14,6 @@ from app.db import ( DocumentType, DocumentVersion, Folder, - SearchSourceConnector, SearchSpace, 
User, ) @@ -72,7 +71,6 @@ class TestFullIndexer: async def test_i1_new_file_indexed( self, db_session: AsyncSession, - db_local_folder_connector: SearchSourceConnector, db_user: User, db_search_space: SearchSpace, tmp_path: Path, @@ -82,11 +80,12 @@ class TestFullIndexer: (tmp_path / "note.md").write_text("# Hello World\n\nContent here.") - count, skipped, err = await index_local_folder( + count, skipped, root_folder_id, err = await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", ) assert err is None @@ -95,7 +94,8 @@ class TestFullIndexer: docs = ( await db_session.execute( select(Document).where( - Document.connector_id == db_local_folder_connector.id + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, ) ) ).scalars().all() @@ -112,7 +112,6 @@ class TestFullIndexer: async def test_i2_unchanged_skipped( self, db_session: AsyncSession, - db_local_folder_connector: SearchSourceConnector, db_user: User, db_search_space: SearchSpace, tmp_path: Path, @@ -122,27 +121,31 @@ class TestFullIndexer: (tmp_path / "note.md").write_text("# Hello\n\nSame content.") - count1, _, _ = await index_local_folder( + count1, _, root_folder_id, _ = await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", ) assert count1 == 1 - # Second run — unchanged - count2, _, _ = await index_local_folder( + # Second run — unchanged, pass root_folder_id from first run + count2, _, _, _ = await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + root_folder_id=root_folder_id, ) assert 
count2 == 0 total = ( await db_session.execute( select(func.count()).select_from(Document).where( - Document.connector_id == db_local_folder_connector.id + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, ) ) ).scalar_one() @@ -157,7 +160,6 @@ class TestFullIndexer: async def test_i3_changed_reindexed( self, db_session: AsyncSession, - db_local_folder_connector: SearchSourceConnector, db_user: User, db_search_space: SearchSpace, tmp_path: Path, @@ -168,11 +170,12 @@ class TestFullIndexer: f = tmp_path / "note.md" f.write_text("# Version 1\n\nOriginal.") - await index_local_folder( + _, _, root_folder_id, _ = await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", ) # Modify @@ -180,11 +183,13 @@ class TestFullIndexer: # Touch mtime to ensure it's detected as different os.utime(f, (f.stat().st_atime + 10, f.stat().st_mtime + 10)) - count, _, _ = await index_local_folder( + count, _, _, _ = await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + root_folder_id=root_folder_id, ) assert count == 1 @@ -192,7 +197,8 @@ class TestFullIndexer: versions = ( await db_session.execute( select(DocumentVersion).join(Document).where( - Document.connector_id == db_local_folder_connector.id + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, ) ) ).scalars().all() @@ -207,7 +213,6 @@ class TestFullIndexer: async def test_i4_deleted_removed( self, db_session: AsyncSession, - db_local_folder_connector: SearchSourceConnector, db_user: User, db_search_space: SearchSpace, tmp_path: Path, @@ -218,17 +223,19 @@ class TestFullIndexer: f = tmp_path / "to_delete.md" 
f.write_text("# Delete me") - await index_local_folder( + _, _, root_folder_id, _ = await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", ) docs_before = ( await db_session.execute( select(func.count()).select_from(Document).where( - Document.connector_id == db_local_folder_connector.id + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, ) ) ).scalar_one() @@ -238,15 +245,18 @@ class TestFullIndexer: await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + root_folder_id=root_folder_id, ) docs_after = ( await db_session.execute( select(func.count()).select_from(Document).where( - Document.connector_id == db_local_folder_connector.id + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, ) ) ).scalar_one() @@ -261,7 +271,6 @@ class TestFullIndexer: async def test_i5_single_file_mode( self, db_session: AsyncSession, - db_local_folder_connector: SearchSourceConnector, db_user: User, db_search_space: SearchSpace, tmp_path: Path, @@ -273,11 +282,12 @@ class TestFullIndexer: (tmp_path / "b.md").write_text("File B") (tmp_path / "c.md").write_text("File C") - count, _, _ = await index_local_folder( + count, _, _, _ = await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", target_file_path=str(tmp_path / "b.md"), ) assert count == 1 @@ -285,12 +295,13 @@ class TestFullIndexer: docs = ( await db_session.execute( select(Document).where( - Document.connector_id == db_local_folder_connector.id + 
Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, ) ) ).scalars().all() assert len(docs) == 1 - assert docs[0].title == "b" + assert docs[0].title == "b.md" # ==================================================================== @@ -309,30 +320,27 @@ class TestFolderMirroring: async def test_f1_root_folder_created( self, db_session: AsyncSession, - db_local_folder_connector: SearchSourceConnector, db_user: User, db_search_space: SearchSpace, tmp_path: Path, ): - """F1: First sync creates a root Folder and stores root_folder_id.""" + """F1: First sync creates a root Folder and returns root_folder_id.""" from app.tasks.connector_indexers.local_folder_indexer import index_local_folder (tmp_path / "root.md").write_text("Root file") - await index_local_folder( + _, _, root_folder_id, _ = await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", ) - # Refresh connector - await db_session.refresh(db_local_folder_connector) - root_id = db_local_folder_connector.config.get("root_folder_id") - assert root_id is not None + assert root_folder_id is not None root_folder = ( - await db_session.execute(select(Folder).where(Folder.id == root_id)) + await db_session.execute(select(Folder).where(Folder.id == root_folder_id)) ).scalar_one() assert root_folder.name == "test-folder" @@ -345,7 +353,6 @@ class TestFolderMirroring: async def test_f2_nested_folder_rows( self, db_session: AsyncSession, - db_local_folder_connector: SearchSourceConnector, db_user: User, db_search_space: SearchSpace, tmp_path: Path, @@ -362,9 +369,10 @@ class TestFolderMirroring: await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", ) folders = ( @@ 
-394,7 +402,6 @@ class TestFolderMirroring: async def test_f3_resync_reuses_folders( self, db_session: AsyncSession, - db_local_folder_connector: SearchSourceConnector, db_user: User, db_search_space: SearchSpace, tmp_path: Path, @@ -406,11 +413,12 @@ class TestFolderMirroring: sub.mkdir() (sub / "file.md").write_text("content") - await index_local_folder( + _, _, root_folder_id, _ = await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", ) folders_before = ( @@ -420,12 +428,14 @@ class TestFolderMirroring: ).scalars().all() ids_before = {f.id for f in folders_before} - # Re-sync + # Re-sync with root_folder_id from first run await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + root_folder_id=root_folder_id, ) folders_after = ( @@ -446,7 +456,6 @@ class TestFolderMirroring: async def test_f4_folder_id_assigned( self, db_session: AsyncSession, - db_local_folder_connector: SearchSourceConnector, db_user: User, db_search_space: SearchSpace, tmp_path: Path, @@ -459,17 +468,19 @@ class TestFolderMirroring: (daily / "today.md").write_text("today note") (tmp_path / "root.md").write_text("root note") - await index_local_folder( + _, _, root_folder_id, _ = await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", ) docs = ( await db_session.execute( select(Document).where( - Document.connector_id == db_local_folder_connector.id + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, ) ) ).scalars().all() @@ -486,9 +497,7 @@ class TestFolderMirroring: assert 
today_doc.folder_id == daily_folder.id # Root doc should be in the root folder - await db_session.refresh(db_local_folder_connector) - root_fid = db_local_folder_connector.config.get("root_folder_id") - assert root_doc.folder_id == root_fid + assert root_doc.folder_id == root_folder_id @pytest.mark.usefixtures( "patched_self_hosted", @@ -499,7 +508,6 @@ class TestFolderMirroring: async def test_f5_empty_folder_cleanup( self, db_session: AsyncSession, - db_local_folder_connector: SearchSourceConnector, db_user: User, db_search_space: SearchSpace, tmp_path: Path, @@ -515,11 +523,12 @@ class TestFolderMirroring: (daily / "today.md").write_text("today") (weekly / "review.md").write_text("review") - await index_local_folder( + _, _, root_folder_id, _ = await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", ) # Verify weekly folder exists @@ -535,9 +544,11 @@ class TestFolderMirroring: await index_local_folder( session=db_session, - connector_id=db_local_folder_connector.id, search_space_id=db_search_space.id, user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + root_folder_id=root_folder_id, ) # weekly Folder should be gone (empty, dir removed) @@ -570,7 +581,6 @@ class TestPipelineIntegration: async def test_p1_local_folder_file_through_pipeline( self, db_session: AsyncSession, - db_local_folder_connector: SearchSourceConnector, db_user: User, db_search_space: SearchSpace, mocker, @@ -585,7 +595,7 @@ class TestPipelineIntegration: unique_id="test-folder:test.md", document_type=DocumentType.LOCAL_FOLDER_FILE, search_space_id=db_search_space.id, - connector_id=db_local_folder_connector.id, + connector_id=None, created_by_id=str(db_user.id), ) From caf2525ab5d32ffbb6db0c96a5e4109996a24030 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: 
Thu, 2 Apr 2026 22:29:07 +0530 Subject: [PATCH 049/202] fix: update folder ID collection logic to include deleted directories and adjust test cases for document titles --- .../connector_indexers/local_folder_indexer.py | 15 ++++++++++++++- .../test_local_folder_pipeline.py | 4 ++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index 591914625..93c6649a2 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -610,12 +610,25 @@ async def index_local_folder( # ================================================================ # PHASE 1.5: Delete documents no longer on disk # ================================================================ + # Collect ALL folder IDs under this root (including folders that no + # longer exist on disk but still have rows in the DB) so we catch + # documents in deleted directories too. 
+ all_root_folder_ids = set(folder_mapping.values()) + all_db_folders = ( + await session.execute( + select(Folder.id).where( + Folder.search_space_id == search_space_id, + ) + ) + ).scalars().all() + all_root_folder_ids.update(all_db_folders) + all_folder_docs = ( await session.execute( select(Document).where( Document.document_type == DocumentType.LOCAL_FOLDER_FILE, Document.search_space_id == search_space_id, - Document.folder_id.in_(list(folder_mapping.values())), + Document.folder_id.in_(list(all_root_folder_ids)), ) ) ).scalars().all() diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py index e46d59a67..34efad789 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py @@ -485,8 +485,8 @@ class TestFolderMirroring: ) ).scalars().all() - today_doc = next(d for d in docs if d.title == "today") - root_doc = next(d for d in docs if d.title == "root") + today_doc = next(d for d in docs if d.title == "today.md") + root_doc = next(d for d in docs if d.title == "root.md") daily_folder = ( await db_session.execute( From c27d24a117633aac32de889b12f153239b58a832 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 22:41:45 +0530 Subject: [PATCH 050/202] feat: enhance folder indexing by adding root folder ID support and implement folder creation and cleanup logic --- .../app/routes/documents_routes.py | 2 + .../local_folder_indexer.py | 102 ++++++++++++++ .../test_local_folder_pipeline.py | 130 ++++++++++++++++++ surfsense_web/hooks/use-folder-sync.ts | 1 + .../lib/apis/documents-api.service.ts | 2 +- 5 files changed, 236 insertions(+), 1 deletion(-) diff --git a/surfsense_backend/app/routes/documents_routes.py 
b/surfsense_backend/app/routes/documents_routes.py index d7974f9ff..05221b192 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1280,6 +1280,7 @@ class FolderIndexFileRequest(PydanticBaseModel): folder_name: str search_space_id: int target_file_path: str + root_folder_id: int | None = None enable_summary: bool = False @@ -1394,6 +1395,7 @@ async def folder_index_file( folder_path=request.folder_path, folder_name=request.folder_name, target_file_path=request.target_file_path, + root_folder_id=request.root_folder_id, enable_summary=request.enable_summary, ) diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index 93c6649a2..3d4ddc19e 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -312,6 +312,92 @@ async def _mirror_folder_structure( return mapping, root_folder_id +async def _resolve_folder_for_file( + session: AsyncSession, + rel_path: str, + root_folder_id: int, + search_space_id: int, + user_id: str, +) -> int: + """Given a file's relative path, ensure all parent Folder rows exist and + return the folder_id for the file's immediate parent directory. + + For a file at "notes/daily/today.md", this ensures Folder rows exist for + "notes" and "notes/daily", and returns the id of "notes/daily". + For a file at "readme.md" (root level), returns root_folder_id. 
+ """ + parent_dir = str(Path(rel_path).parent) + if parent_dir == ".": + return root_folder_id + + parts = Path(parent_dir).parts + current_parent_id = root_folder_id + + for part in parts: + existing = ( + await session.execute( + select(Folder).where( + Folder.name == part, + Folder.parent_id == current_parent_id, + Folder.search_space_id == search_space_id, + ) + ) + ).scalar_one_or_none() + + if existing: + current_parent_id = existing.id + else: + new_folder = Folder( + name=part, + parent_id=current_parent_id, + search_space_id=search_space_id, + created_by_id=user_id, + position="a0", + ) + session.add(new_folder) + await session.flush() + current_parent_id = new_folder.id + + return current_parent_id + + +async def _cleanup_empty_folder_chain( + session: AsyncSession, + folder_id: int, + root_folder_id: int, +) -> None: + """Walk up from folder_id toward root, deleting empty folders (no docs, no + children). Stops at root_folder_id which is never deleted.""" + current_id = folder_id + while current_id and current_id != root_folder_id: + has_doc = ( + await session.execute( + select(Document.id).where(Document.folder_id == current_id).limit(1) + ) + ).scalar_one_or_none() + if has_doc is not None: + break + + has_child = ( + await session.execute( + select(Folder.id).where(Folder.parent_id == current_id).limit(1) + ) + ).scalar_one_or_none() + if has_child is not None: + break + + folder = ( + await session.execute(select(Folder).where(Folder.id == current_id)) + ).scalar_one_or_none() + if not folder: + break + + parent_id = folder.parent_id + await session.delete(folder) + await session.flush() + current_id = parent_id + + async def _cleanup_empty_folders( session: AsyncSession, root_folder_id: int, @@ -427,6 +513,7 @@ async def index_local_folder( folder_name=folder_name, target_file_path=target_file_path, enable_summary=enable_summary, + root_folder_id=root_folder_id, task_logger=task_logger, log_entry=log_entry, ) @@ -802,6 +889,7 @@ async def 
_index_single_file( folder_name: str, target_file_path: str, enable_summary: bool, + root_folder_id: int | None, task_logger, log_entry, ) -> tuple[int, int, str | None]: @@ -816,7 +904,13 @@ async def _index_single_file( ) existing = await check_document_by_unique_identifier(session, uid_hash) if existing: + deleted_folder_id = existing.folder_id await session.delete(existing) + await session.flush() + if deleted_folder_id and root_folder_id: + await _cleanup_empty_folder_chain( + session, deleted_folder_id, root_folder_id + ) await session.commit() return 0, 0, None return 0, 0, None @@ -880,6 +974,12 @@ async def _index_single_file( "mtime": mtime, } + folder_id = None + if root_folder_id: + folder_id = await _resolve_folder_for_file( + session, rel_path, root_folder_id, search_space_id, user_id + ) + if existing: existing.title = title existing.content = document_string @@ -887,6 +987,7 @@ async def _index_single_file( existing.source_markdown = content existing.embedding = embedding existing.document_metadata = doc_metadata + existing.folder_id = folder_id await safe_set_chunks(session, existing, chunks) existing.updated_at = get_current_timestamp() existing.status = DocumentStatus.ready() @@ -905,6 +1006,7 @@ async def _index_single_file( updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=None, + folder_id=folder_id, ) session.add(document) await session.flush() diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py index 34efad789..110aa6caf 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py @@ -567,6 +567,136 @@ class TestFolderMirroring: ).scalar_one_or_none() assert daily_after is not None + @pytest.mark.usefixtures( + "patched_self_hosted", + "patched_embed_for_indexer", + 
"patched_chunks_for_indexer", + "patched_summary_for_indexer", + ) + async def test_f6_single_file_creates_subfolder( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """F6: Single-file mode creates missing Folder rows and assigns correct folder_id.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "root.md").write_text("root") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + sub = tmp_path / "notes" / "daily" + sub.mkdir(parents=True) + (sub / "new.md").write_text("new note in subfolder") + + count, _, _, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_path=str(sub / "new.md"), + root_folder_id=root_folder_id, + ) + assert count == 1 + + doc = ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.title == "new.md", + ) + ) + ).scalar_one() + + daily_folder = ( + await db_session.execute( + select(Folder).where(Folder.name == "daily") + ) + ).scalar_one() + + assert doc.folder_id == daily_folder.id + assert daily_folder.parent_id is not None + + notes_folder = ( + await db_session.execute( + select(Folder).where(Folder.name == "notes") + ) + ).scalar_one() + assert daily_folder.parent_id == notes_folder.id + assert notes_folder.parent_id == root_folder_id + + @pytest.mark.usefixtures( + "patched_self_hosted", + "patched_embed_for_indexer", + "patched_chunks_for_indexer", + "patched_summary_for_indexer", + ) + async def test_f7_single_file_delete_cleans_empty_folders( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """F7: 
Deleting the only file in a subfolder via single-file mode removes empty Folder rows.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + sub = tmp_path / "notes" / "ephemeral" + sub.mkdir(parents=True) + (sub / "temp.md").write_text("temporary") + (tmp_path / "keep.md").write_text("keep this") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + eph_folder = ( + await db_session.execute( + select(Folder).where(Folder.name == "ephemeral") + ) + ).scalar_one_or_none() + assert eph_folder is not None + + target = sub / "temp.md" + target.unlink() + + await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_path=str(target), + root_folder_id=root_folder_id, + ) + + eph_after = ( + await db_session.execute( + select(Folder).where(Folder.name == "ephemeral") + ) + ).scalar_one_or_none() + assert eph_after is None + + notes_after = ( + await db_session.execute( + select(Folder).where(Folder.name == "notes") + ) + ).scalar_one_or_none() + assert notes_after is None + # ==================================================================== # Tier 5: Pipeline Integration (P1) diff --git a/surfsense_web/hooks/use-folder-sync.ts b/surfsense_web/hooks/use-folder-sync.ts index fcfb2814e..f051b7df6 100644 --- a/surfsense_web/hooks/use-folder-sync.ts +++ b/surfsense_web/hooks/use-folder-sync.ts @@ -32,6 +32,7 @@ export function useFolderSync() { folder_name: event.folderName, search_space_id: event.searchSpaceId, target_file_path: event.fullPath, + root_folder_id: event.rootFolderId, }); } catch (err) { console.error("[FolderSync] Failed to trigger re-index:", err); diff --git a/surfsense_web/lib/apis/documents-api.service.ts 
b/surfsense_web/lib/apis/documents-api.service.ts index c77cd6848..a8e3831d4 100644 --- a/surfsense_web/lib/apis/documents-api.service.ts +++ b/surfsense_web/lib/apis/documents-api.service.ts @@ -399,7 +399,7 @@ class DocumentsApiService { return baseApiService.post(`/api/v1/documents/folder-index`, undefined, { body }); }; - folderIndexFile = async (searchSpaceId: number, body: { folder_path: string; folder_name: string; search_space_id: number; target_file_path: string; enable_summary?: boolean }) => { + folderIndexFile = async (searchSpaceId: number, body: { folder_path: string; folder_name: string; search_space_id: number; target_file_path: string; root_folder_id?: number | null; enable_summary?: boolean }) => { return baseApiService.post(`/api/v1/documents/folder-index-file`, undefined, { body }); }; From 53df393cf7ca300e9eb79f14429bb94857bde492 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 23:28:23 +0530 Subject: [PATCH 051/202] refactor: streamline local folder indexing logic by removing unused imports, enhancing content hashing, and improving document creation process --- .../local_folder_indexer.py | 415 +++++++----------- .../test_local_folder_pipeline.py | 139 +----- 2 files changed, 174 insertions(+), 380 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index 3d4ddc19e..a3281eaea 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -14,7 +14,6 @@ no connector row is read. 
""" import os -import time from collections.abc import Awaitable, Callable from datetime import UTC, datetime from pathlib import Path @@ -30,24 +29,16 @@ from app.db import ( DocumentType, Folder, ) +from app.indexing_pipeline.connector_document import ConnectorDocument +from app.indexing_pipeline.document_hashing import compute_identifier_hash +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService -from app.utils.document_converters import ( - create_document_chunks, - embed_text, - generate_content_hash, - generate_document_summary, - generate_unique_identifier_hash, -) from app.utils.document_versioning import create_version_snapshot from .base import ( - build_document_metadata_string, check_document_by_unique_identifier, - check_duplicate_document_by_hash, - get_current_timestamp, logger, - safe_set_chunks, ) PLAINTEXT_EXTENSIONS = frozenset({ @@ -89,7 +80,6 @@ def _needs_etl(filename: str) -> bool: return not _is_plaintext_file(filename) and not _is_audio_file(filename) HeartbeatCallbackType = Callable[[int], Awaitable[None]] -HEARTBEAT_INTERVAL_SECONDS = 30 DEFAULT_EXCLUDE_PATTERNS = [ ".git", @@ -210,6 +200,16 @@ async def _read_file_content(file_path: str, filename: str) -> str: return await _parse_file_to_markdown(file_path, filename) +def _content_hash(content: str, search_space_id: int) -> str: + """SHA-256 hash of content scoped to a search space. + + Matches the format used by ``compute_content_hash`` in the unified + pipeline so that dedup checks are consistent. + """ + import hashlib + return hashlib.sha256(f"{search_space_id}:{content}".encode("utf-8")).hexdigest() + + async def _compute_file_content_hash( file_path: str, filename: str, search_space_id: int, ) -> tuple[str, str]: @@ -218,8 +218,7 @@ async def _compute_file_content_hash( Returns (content_text, content_hash). 
""" content = await _read_file_content(file_path, filename) - content_hash = generate_content_hash(content, search_space_id) - return content, content_hash + return content, _content_hash(content, search_space_id) async def _mirror_folder_structure( @@ -454,6 +453,40 @@ async def _cleanup_empty_folders( candidates = remaining +def _build_connector_doc( + title: str, + content: str, + relative_path: str, + folder_name: str, + *, + search_space_id: int, + user_id: str, + enable_summary: bool, +) -> ConnectorDocument: + """Build a ConnectorDocument from a local file's extracted content.""" + unique_id = f"{folder_name}:{relative_path}" + metadata = { + "folder_name": folder_name, + "file_path": relative_path, + "document_type": "Local Folder File", + "connector_type": "Local Folder", + } + fallback_summary = f"File: {title}\n\n{content[:4000]}" + + return ConnectorDocument( + title=title, + source_markdown=content, + unique_id=unique_id, + document_type=DocumentType.LOCAL_FOLDER_FILE, + search_space_id=search_space_id, + connector_id=None, + created_by_id=user_id, + should_summarize=enable_summary, + fallback_summary=fallback_summary, + metadata=metadata, + ) + + async def index_local_folder( session: AsyncSession, search_space_id: int, @@ -551,15 +584,13 @@ async def index_local_folder( indexed_count = 0 skipped_count = 0 failed_count = 0 - duplicate_count = 0 - - last_heartbeat_time = time.time() # ================================================================ - # PHASE 1: Analyze all files, create pending documents + # PHASE 1: Pre-filter files (mtime / content-hash), version changed # ================================================================ - files_to_process: list[dict] = [] - new_documents_created = False + connector_docs: list[ConnectorDocument] = [] + # Maps unique_id -> (relative_path, mtime) for post-pipeline folder_id assignment + file_meta_map: dict[str, dict] = {} seen_unique_hashes: set[str] = set() for file_info in files: @@ -568,8 +599,8 @@ 
async def index_local_folder( file_path_abs = file_info["path"] unique_identifier = f"{folder_name}:{relative_path}" - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.LOCAL_FOLDER_FILE, + unique_identifier_hash = compute_identifier_hash( + DocumentType.LOCAL_FOLDER_FILE.value, unique_identifier, search_space_id, ) @@ -612,94 +643,42 @@ async def index_local_folder( continue await create_version_snapshot(session, existing_document) + else: + try: + content, content_hash = await _compute_file_content_hash( + file_path_abs, file_info["relative_path"], search_space_id + ) + except Exception as read_err: + logger.warning(f"Could not read {file_path_abs}: {read_err}") + skipped_count += 1 + continue - files_to_process.append( - { - "document": existing_document, - "is_new": False, - "file_info": file_info, - "content": content, - "content_hash": content_hash, - "unique_identifier_hash": unique_identifier_hash, - "relative_path": relative_path, - "title": file_info["name"], - } - ) - continue + if not content.strip(): + skipped_count += 1 + continue - try: - content, content_hash = await _compute_file_content_hash( - file_path_abs, file_info["relative_path"], search_space_id - ) - except Exception as read_err: - logger.warning(f"Could not read {file_path_abs}: {read_err}") - skipped_count += 1 - continue - - if not content.strip(): - skipped_count += 1 - continue - - with session.no_autoflush: - dup = await check_duplicate_document_by_hash(session, content_hash) - if dup: - duplicate_count += 1 - skipped_count += 1 - continue - - parent_dir = str(Path(relative_path).parent) - if parent_dir == ".": - parent_dir = "" - folder_id = folder_mapping.get(parent_dir, folder_mapping.get("")) - - document = Document( - search_space_id=search_space_id, + doc = _build_connector_doc( title=file_info["name"], - document_type=DocumentType.LOCAL_FOLDER_FILE, - document_metadata={ - "folder_name": folder_name, - "file_path": relative_path, - "mtime": 
file_info["modified_at"].timestamp(), - }, - content="Pending...", - content_hash=unique_identifier_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=None, - status=DocumentStatus.pending(), - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=None, - folder_id=folder_id, - ) - session.add(document) - new_documents_created = True - - files_to_process.append( - { - "document": document, - "is_new": True, - "file_info": file_info, - "content": content, - "content_hash": content_hash, - "unique_identifier_hash": unique_identifier_hash, - "relative_path": relative_path, - "title": file_info["name"], - } + content=content, + relative_path=relative_path, + folder_name=folder_name, + search_space_id=search_space_id, + user_id=user_id, + enable_summary=enable_summary, ) + connector_docs.append(doc) + file_meta_map[unique_identifier] = { + "relative_path": relative_path, + "mtime": file_info["modified_at"].timestamp(), + } except Exception as e: logger.exception(f"Phase 1 error for {file_info.get('path')}: {e}") failed_count += 1 - if new_documents_created: - await session.commit() - # ================================================================ # PHASE 1.5: Delete documents no longer on disk # ================================================================ - # Collect ALL folder IDs under this root (including folders that no - # longer exist on disk but still have rows in the DB) so we catch - # documents in deleted directories too. 
all_root_folder_ids = set(folder_mapping.values()) all_db_folders = ( await session.execute( @@ -727,98 +706,51 @@ async def index_local_folder( await session.flush() # ================================================================ - # PHASE 2: Process each document + # PHASE 2: Index via unified pipeline # ================================================================ - long_context_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) + if connector_docs: + from app.indexing_pipeline.document_hashing import ( + compute_unique_identifier_hash, + ) - for item in files_to_process: - if on_heartbeat_callback: - current_time = time.time() - if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + pipeline = IndexingPipelineService(session) + doc_map = { + compute_unique_identifier_hash(cd): cd for cd in connector_docs + } + documents = await pipeline.prepare_for_indexing(connector_docs) + + llm = await get_user_long_context_llm(session, user_id, search_space_id) + + for document in documents: + connector_doc = doc_map.get(document.unique_identifier_hash) + if connector_doc is None: + failed_count += 1 + continue + + result = await pipeline.index(document, connector_doc, llm) + + if DocumentStatus.is_state(result.status, DocumentStatus.READY): + indexed_count += 1 + + # Assign folder_id and mtime post-pipeline + rel_path = (connector_doc.metadata or {}).get("file_path", "") + parent_dir = str(Path(rel_path).parent) if rel_path else "" + if parent_dir == ".": + parent_dir = "" + fid = folder_mapping.get(parent_dir, folder_mapping.get("")) + + unique_id = connector_doc.unique_id + mtime_info = file_meta_map.get(unique_id, {}) + + result.folder_id = fid + doc_meta = dict(result.document_metadata or {}) + doc_meta["mtime"] = mtime_info.get("mtime") + result.document_metadata = doc_meta + else: + failed_count += 1 + + if on_heartbeat_callback and indexed_count % 5 == 0: await on_heartbeat_callback(indexed_count) - 
last_heartbeat_time = current_time - - document = item["document"] - try: - document.status = DocumentStatus.processing() - await session.commit() - - title = item["title"] - relative_path = item["relative_path"] - content = item["content"] - content_hash = item["content_hash"] - file_info = item["file_info"] - - metadata_sections = [ - ( - "METADATA", - [ - f"Title: {title}", - f"Folder: {folder_name}", - f"Path: {relative_path}", - ], - ), - ("CONTENT", [content]), - ] - document_string = build_document_metadata_string(metadata_sections) - - summary_content = "" - if long_context_llm and enable_summary: - doc_meta = { - "folder_name": folder_name, - "file_path": relative_path, - } - summary_content, _ = await generate_document_summary( - document_string, long_context_llm, doc_meta - ) - - embedding = embed_text(document_string) - chunks = await create_document_chunks(document_string) - - parent_dir = str(Path(relative_path).parent) - if parent_dir == ".": - parent_dir = "" - folder_id = folder_mapping.get(parent_dir, folder_mapping.get("")) - - document.title = title - document.content = document_string - document.content_hash = content_hash - document.source_markdown = content - document.embedding = embedding - document.document_metadata = { - "folder_name": folder_name, - "file_path": relative_path, - "summary": summary_content, - "mtime": file_info["modified_at"].timestamp(), - } - document.folder_id = folder_id - await safe_set_chunks(session, document, chunks) - document.updated_at = get_current_timestamp() - document.status = DocumentStatus.ready() - - indexed_count += 1 - - if indexed_count % 10 == 0: - await session.commit() - - except Exception as e: - logger.exception(f"Phase 2 error for {item.get('relative_path')}: {e}") - try: - await session.rollback() - except Exception: - pass - try: - document.status = DocumentStatus.failed(str(e)[:500]) - document.updated_at = get_current_timestamp() - await session.commit() - except Exception: - try: - await 
session.rollback() - except Exception: - pass - failed_count += 1 # Cleanup empty folders existing_dirs = set() @@ -846,8 +778,6 @@ async def index_local_folder( raise warning_parts = [] - if duplicate_count > 0: - warning_parts.append(f"{duplicate_count} duplicate") if failed_count > 0: warning_parts.append(f"{failed_count} failed") warning_message = ", ".join(warning_parts) if warning_parts else None @@ -859,7 +789,6 @@ async def index_local_folder( "indexed": indexed_count, "skipped": skipped_count, "failed": failed_count, - "duplicates": duplicate_count, }, ) @@ -899,8 +828,8 @@ async def _index_single_file( if not full_path.exists(): rel = str(full_path.relative_to(folder_path)) unique_id = f"{folder_name}:{rel}" - uid_hash = generate_unique_identifier_hash( - DocumentType.LOCAL_FOLDER_FILE, unique_id, search_space_id + uid_hash = compute_identifier_hash( + DocumentType.LOCAL_FOLDER_FILE.value, unique_id, search_space_id ) existing = await check_document_by_unique_identifier(session, uid_hash) if existing: @@ -918,8 +847,8 @@ async def _index_single_file( rel_path = str(full_path.relative_to(folder_path)) unique_id = f"{folder_name}:{rel_path}" - uid_hash = generate_unique_identifier_hash( - DocumentType.LOCAL_FOLDER_FILE, unique_id, search_space_id + uid_hash = compute_identifier_hash( + DocumentType.LOCAL_FOLDER_FILE.value, unique_id, search_space_id ) try: @@ -945,83 +874,51 @@ async def _index_single_file( await create_version_snapshot(session, existing) - long_context_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - title = full_path.name mtime = full_path.stat().st_mtime - metadata_sections = [ - ("METADATA", [f"Title: {title}", f"Folder: {folder_name}", f"Path: {rel_path}"]), - ("CONTENT", [content]), - ] - document_string = build_document_metadata_string(metadata_sections) + connector_doc = _build_connector_doc( + title=full_path.name, + content=content, + relative_path=rel_path, + folder_name=folder_name, + 
search_space_id=search_space_id, + user_id=user_id, + enable_summary=enable_summary, + ) - summary_content = "" - if long_context_llm and enable_summary: - summary_content, _ = await generate_document_summary( - document_string, long_context_llm, {"folder_name": folder_name, "file_path": rel_path} - ) + pipeline = IndexingPipelineService(session) + llm = await get_user_long_context_llm(session, user_id, search_space_id) + documents = await pipeline.prepare_for_indexing([connector_doc]) - embedding = embed_text(document_string) - chunks = await create_document_chunks(document_string) + if not documents: + return 0, 1, None - doc_metadata = { - "folder_name": folder_name, - "file_path": rel_path, - "summary": summary_content, - "mtime": mtime, - } + db_doc = documents[0] + await pipeline.index(db_doc, connector_doc, llm) + # Post-pipeline: assign folder_id and mtime + await session.refresh(db_doc) folder_id = None if root_folder_id: folder_id = await _resolve_folder_for_file( session, rel_path, root_folder_id, search_space_id, user_id ) - - if existing: - existing.title = title - existing.content = document_string - existing.content_hash = content_hash - existing.source_markdown = content - existing.embedding = embedding - existing.document_metadata = doc_metadata - existing.folder_id = folder_id - await safe_set_chunks(session, existing, chunks) - existing.updated_at = get_current_timestamp() - existing.status = DocumentStatus.ready() - else: - document = Document( - search_space_id=search_space_id, - title=title, - document_type=DocumentType.LOCAL_FOLDER_FILE, - document_metadata=doc_metadata, - content=document_string, - content_hash=content_hash, - unique_identifier_hash=uid_hash, - source_markdown=content, - embedding=embedding, - status=DocumentStatus.ready(), - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=None, - folder_id=folder_id, - ) - session.add(document) - await session.flush() - for chunk in chunks: - chunk.document_id = 
document.id - session.add_all(chunks) - + db_doc.folder_id = folder_id + doc_meta = dict(db_doc.document_metadata or {}) + doc_meta["mtime"] = mtime + db_doc.document_metadata = doc_meta await session.commit() - await task_logger.log_task_success( - log_entry, - f"Single file indexed: {rel_path}", - {"file": rel_path}, - ) - return 1, 0, None + indexed = 1 if DocumentStatus.is_state(db_doc.status, DocumentStatus.READY) else 0 + failed_msg = None if indexed else "Indexing failed" + + if indexed: + await task_logger.log_task_success( + log_entry, + f"Single file indexed: {rel_path}", + {"file": rel_path}, + ) + return indexed, 0 if indexed else 1, failed_msg except Exception as e: logger.exception(f"Error indexing single file {target_file_path}: {e}") diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py index 110aa6caf..154cc6e0e 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py @@ -1,8 +1,7 @@ -"""Integration tests for local folder indexer — Tier 3 (I1-I5), Tier 4 (F1-F5), Tier 5 (P1).""" +"""Integration tests for local folder indexer — Tier 3 (I1-I5), Tier 4 (F1-F7), Tier 5 (P1).""" import os from pathlib import Path -from unittest.mock import AsyncMock, MagicMock import pytest from sqlalchemy import func, select @@ -18,41 +17,11 @@ from app.db import ( User, ) -import app.tasks.connector_indexers.local_folder_indexer as _lfi_mod - pytestmark = pytest.mark.integration - -@pytest.fixture -def patched_self_hosted(monkeypatch): - _cfg = type("_Cfg", (), {"is_self_hosted": staticmethod(lambda: True)})() - monkeypatch.setattr(_lfi_mod, "config", _cfg) - - -@pytest.fixture -def patched_embed_for_indexer(monkeypatch): - from app.config import config as app_config - dim = 
app_config.embedding_model_instance.dimension - mock = MagicMock(return_value=[0.1] * dim) - monkeypatch.setattr(_lfi_mod, "embed_text", mock) - return mock - - -@pytest.fixture -def patched_chunks_for_indexer(monkeypatch): - from app.db import Chunk - from app.config import config as app_config - dim = app_config.embedding_model_instance.dimension - - async def mock_create_chunks(text): - return [Chunk(content="chunk", embedding=[0.1] * dim)] - - monkeypatch.setattr(_lfi_mod, "create_document_chunks", mock_create_chunks) - - -@pytest.fixture -def patched_summary_for_indexer(monkeypatch): - monkeypatch.setattr(_lfi_mod, "get_user_long_context_llm", AsyncMock(return_value=None)) +UNIFIED_FIXTURES = ( + "patched_summarize", "patched_embed_texts", "patched_chunk_text", +) # ==================================================================== @@ -62,12 +31,7 @@ def patched_summary_for_indexer(monkeypatch): class TestFullIndexer: - @pytest.mark.usefixtures( - "patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_i1_new_file_indexed( self, db_session: AsyncSession, @@ -103,12 +67,7 @@ class TestFullIndexer: assert docs[0].document_type == DocumentType.LOCAL_FOLDER_FILE assert DocumentStatus.is_state(docs[0].status, DocumentStatus.READY) - @pytest.mark.usefixtures( - "patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_i2_unchanged_skipped( self, db_session: AsyncSession, @@ -130,7 +89,6 @@ class TestFullIndexer: ) assert count1 == 1 - # Second run — unchanged, pass root_folder_id from first run count2, _, _, _ = await index_local_folder( session=db_session, search_space_id=db_search_space.id, @@ -151,12 +109,7 @@ class TestFullIndexer: ).scalar_one() assert total == 1 - @pytest.mark.usefixtures( - 
"patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_i3_changed_reindexed( self, db_session: AsyncSession, @@ -178,9 +131,7 @@ class TestFullIndexer: folder_name="test-folder", ) - # Modify f.write_text("# Version 2\n\nUpdated.") - # Touch mtime to ensure it's detected as different os.utime(f, (f.stat().st_atime + 10, f.stat().st_mtime + 10)) count, _, _, _ = await index_local_folder( @@ -193,7 +144,6 @@ class TestFullIndexer: ) assert count == 1 - # Should have a version snapshot versions = ( await db_session.execute( select(DocumentVersion).join(Document).where( @@ -204,12 +154,7 @@ class TestFullIndexer: ).scalars().all() assert len(versions) >= 1 - @pytest.mark.usefixtures( - "patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_i4_deleted_removed( self, db_session: AsyncSession, @@ -262,12 +207,7 @@ class TestFullIndexer: ).scalar_one() assert docs_after == 0 - @pytest.mark.usefixtures( - "patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_i5_single_file_mode( self, db_session: AsyncSession, @@ -305,18 +245,13 @@ class TestFullIndexer: # ==================================================================== -# Tier 4: Folder Mirroring (F1-F5) +# Tier 4: Folder Mirroring (F1-F7) # ==================================================================== class TestFolderMirroring: - @pytest.mark.usefixtures( - "patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_f1_root_folder_created( self, db_session: AsyncSession, @@ -344,12 +279,7 @@ class 
TestFolderMirroring: ).scalar_one() assert root_folder.name == "test-folder" - @pytest.mark.usefixtures( - "patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_f2_nested_folder_rows( self, db_session: AsyncSession, @@ -393,12 +323,7 @@ class TestFolderMirroring: assert daily_folder.parent_id == notes_folder.id assert weekly_folder.parent_id == notes_folder.id - @pytest.mark.usefixtures( - "patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_f3_resync_reuses_folders( self, db_session: AsyncSession, @@ -428,7 +353,6 @@ class TestFolderMirroring: ).scalars().all() ids_before = {f.id for f in folders_before} - # Re-sync with root_folder_id from first run await index_local_folder( session=db_session, search_space_id=db_search_space.id, @@ -447,12 +371,7 @@ class TestFolderMirroring: assert ids_before == ids_after - @pytest.mark.usefixtures( - "patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_f4_folder_id_assigned( self, db_session: AsyncSession, @@ -496,15 +415,9 @@ class TestFolderMirroring: assert today_doc.folder_id == daily_folder.id - # Root doc should be in the root folder assert root_doc.folder_id == root_folder_id - @pytest.mark.usefixtures( - "patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_f5_empty_folder_cleanup( self, db_session: AsyncSession, @@ -531,7 +444,6 @@ class TestFolderMirroring: folder_name="test-folder", ) - # Verify weekly folder exists weekly_folder = ( await db_session.execute( select(Folder).where(Folder.name == "weekly") 
@@ -539,7 +451,6 @@ class TestFolderMirroring: ).scalar_one_or_none() assert weekly_folder is not None - # Delete weekly directory + its file shutil.rmtree(weekly) await index_local_folder( @@ -551,7 +462,6 @@ class TestFolderMirroring: root_folder_id=root_folder_id, ) - # weekly Folder should be gone (empty, dir removed) weekly_after = ( await db_session.execute( select(Folder).where(Folder.name == "weekly") @@ -559,7 +469,6 @@ class TestFolderMirroring: ).scalar_one_or_none() assert weekly_after is None - # daily should still exist daily_after = ( await db_session.execute( select(Folder).where(Folder.name == "daily") @@ -567,12 +476,7 @@ class TestFolderMirroring: ).scalar_one_or_none() assert daily_after is not None - @pytest.mark.usefixtures( - "patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_f6_single_file_creates_subfolder( self, db_session: AsyncSession, @@ -634,12 +538,7 @@ class TestFolderMirroring: assert daily_folder.parent_id == notes_folder.id assert notes_folder.parent_id == root_folder_id - @pytest.mark.usefixtures( - "patched_self_hosted", - "patched_embed_for_indexer", - "patched_chunks_for_indexer", - "patched_summary_for_indexer", - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_f7_single_file_delete_cleans_empty_folders( self, db_session: AsyncSession, @@ -705,9 +604,7 @@ class TestFolderMirroring: class TestPipelineIntegration: - @pytest.mark.usefixtures( - "patched_summarize", "patched_embed_texts", "patched_chunk_text" - ) + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_p1_local_folder_file_through_pipeline( self, db_session: AsyncSession, From 25358fddcf17ff41c2f02a534bf0218eef96701f Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 2 Apr 2026 23:46:21 +0530 Subject: [PATCH 052/202] feat: implement local folder 
synchronization and versioning with new metadata handling and document_versions table --- ...7_add_local_folder_sync_and_versioning.py} | 25 +++++- surfsense_backend/app/db.py | 1 + .../app/routes/documents_routes.py | 42 ++++++++++ .../app/routes/folders_routes.py | 27 +++++++ surfsense_backend/app/schemas/folders.py | 3 + .../components/documents/FolderNode.tsx | 24 ++++-- .../components/documents/FolderTreeView.tsx | 11 ++- .../layout/ui/sidebar/DocumentsSidebar.tsx | 79 +++++++++++++++++-- surfsense_web/contracts/types/folder.types.ts | 1 + .../lib/apis/documents-api.service.ts | 5 ++ surfsense_web/lib/apis/folders-api.service.ts | 4 + 11 files changed, 205 insertions(+), 17 deletions(-) rename surfsense_backend/alembic/versions/{117_add_local_folder_connector_and_versioning.py => 117_add_local_folder_sync_and_versioning.py} (82%) diff --git a/surfsense_backend/alembic/versions/117_add_local_folder_connector_and_versioning.py b/surfsense_backend/alembic/versions/117_add_local_folder_sync_and_versioning.py similarity index 82% rename from surfsense_backend/alembic/versions/117_add_local_folder_connector_and_versioning.py rename to surfsense_backend/alembic/versions/117_add_local_folder_sync_and_versioning.py index a9da3beb4..e322a608d 100644 --- a/surfsense_backend/alembic/versions/117_add_local_folder_connector_and_versioning.py +++ b/surfsense_backend/alembic/versions/117_add_local_folder_sync_and_versioning.py @@ -1,4 +1,4 @@ -"""Add LOCAL_FOLDER_FILE document type and document_versions table +"""Add LOCAL_FOLDER_FILE document type, folder metadata, and document_versions table Revision ID: 117 Revises: 116 @@ -38,6 +38,19 @@ def upgrade() -> None: """ ) + # Add JSONB metadata column to folders table + col_exists = conn.execute( + sa.text( + "SELECT 1 FROM information_schema.columns " + "WHERE table_name = 'folders' AND column_name = 'metadata'" + ) + ).fetchone() + if not col_exists: + op.add_column( + "folders", + sa.Column("metadata", 
sa.dialects.postgresql.JSONB, nullable=True), + ) + # Create document_versions table table_exists = conn.execute( sa.text( @@ -124,3 +137,13 @@ def downgrade() -> None: op.execute("DROP INDEX IF EXISTS ix_document_versions_created_at") op.execute("DROP INDEX IF EXISTS ix_document_versions_document_id") op.execute("DROP TABLE IF EXISTS document_versions") + + # Drop metadata column from folders + col_exists = conn.execute( + sa.text( + "SELECT 1 FROM information_schema.columns " + "WHERE table_name = 'folders' AND column_name = 'metadata'" + ) + ).fetchone() + if col_exists: + op.drop_column("folders", "metadata") diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 1a4d3ea06..077b7daa6 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -956,6 +956,7 @@ class Folder(BaseModel, TimestampMixin): onupdate=lambda: datetime.now(UTC), index=True, ) + folder_metadata = Column("metadata", JSONB, nullable=True) parent = relationship("Folder", remote_side="Folder.id", backref="children") search_space = relationship("SearchSpace", back_populates="folders") diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 05221b192..81bbb1477 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1310,6 +1310,13 @@ async def folder_index( "You don't have permission to create documents in this search space", ) + watched_metadata = { + "watched": True, + "folder_path": request.folder_path, + "exclude_patterns": request.exclude_patterns, + "file_extensions": request.file_extensions, + } + root_folder_id = request.root_folder_id if root_folder_id: existing = ( @@ -1319,6 +1326,9 @@ async def folder_index( ).scalar_one_or_none() if not existing: root_folder_id = None + else: + existing.folder_metadata = watched_metadata + await session.commit() if not root_folder_id: root_folder = Folder( @@ -1326,6 +1336,7 @@ async 
def folder_index( search_space_id=request.search_space_id, created_by_id=str(user.id), position="a0", + folder_metadata=watched_metadata, ) session.add(root_folder) await session.flush() @@ -1403,3 +1414,34 @@ async def folder_index_file( "message": "File indexing started", "status": "processing", } + + +@router.get("/documents/watched-folders", response_model=list["FolderRead"]) +async def get_watched_folders( + search_space_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Return root folders that are marked as watched (metadata->>'watched' = 'true').""" + from app.schemas import FolderRead # noqa: F811 + + await check_permission( + session, + user, + search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to read documents in this search space", + ) + + folders = ( + await session.execute( + select(Folder).where( + Folder.search_space_id == search_space_id, + Folder.parent_id.is_(None), + Folder.folder_metadata.isnot(None), + Folder.folder_metadata["watched"].astext == "true", + ) + ) + ).scalars().all() + + return folders diff --git a/surfsense_backend/app/routes/folders_routes.py b/surfsense_backend/app/routes/folders_routes.py index d688e692a..6e524d4a4 100644 --- a/surfsense_backend/app/routes/folders_routes.py +++ b/surfsense_backend/app/routes/folders_routes.py @@ -192,6 +192,33 @@ async def get_folder_breadcrumb( ) from e +@router.patch("/folders/{folder_id}/watched") +async def stop_watching_folder( + folder_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Clear the watched flag from a folder's metadata.""" + folder = await session.get(Folder, folder_id) + if not folder: + raise HTTPException(status_code=404, detail="Folder not found") + + await check_permission( + session, + user, + folder.search_space_id, + Permission.DOCUMENTS_UPDATE.value, + "You don't have permission to update folders in this 
search space", + ) + + if folder.folder_metadata and isinstance(folder.folder_metadata, dict): + updated = {**folder.folder_metadata, "watched": False} + folder.folder_metadata = updated + await session.commit() + + return {"message": "Folder watch status updated"} + + @router.put("/folders/{folder_id}", response_model=FolderRead) async def update_folder( folder_id: int, diff --git a/surfsense_backend/app/schemas/folders.py b/surfsense_backend/app/schemas/folders.py index 263817182..e8bdf3821 100644 --- a/surfsense_backend/app/schemas/folders.py +++ b/surfsense_backend/app/schemas/folders.py @@ -3,6 +3,8 @@ from datetime import datetime from uuid import UUID +from typing import Any + from pydantic import BaseModel, ConfigDict, Field @@ -34,6 +36,7 @@ class FolderRead(BaseModel): created_by_id: UUID | None created_at: datetime updated_at: datetime + metadata: dict[str, Any] | None = Field(default=None, validation_alias="folder_metadata") model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx index 1521c06fe..6780bd1e5 100644 --- a/surfsense_web/components/documents/FolderNode.tsx +++ b/surfsense_web/components/documents/FolderNode.tsx @@ -76,6 +76,7 @@ interface FolderNodeProps { isWatched?: boolean; onRescan?: (folder: FolderDisplay) => void; onStopWatching?: (folder: FolderDisplay) => void; + onViewMetadata?: (folder: FolderDisplay) => void; } function getDropZone( @@ -116,6 +117,7 @@ export const FolderNode = React.memo(function FolderNode({ isWatched, onRescan, onStopWatching, + onViewMetadata, }: FolderNodeProps) { const [renameValue, setRenameValue] = useState(folder.name); const inputRef = useRef(null); @@ -251,13 +253,21 @@ export const FolderNode = React.memo(function FolderNode({ isOver && !canDrop && "cursor-not-allowed" )} style={{ paddingLeft: `${depth * 16 + 4}px` }} - onClick={() => onToggleExpand(folder.id)} - onKeyDown={(e) => { - if (e.key 
=== "Enter" || e.key === " ") { - e.preventDefault(); - onToggleExpand(folder.id); - } - }} + onClick={(e) => { + if ((e.ctrlKey || e.metaKey) && onViewMetadata) { + e.preventDefault(); + e.stopPropagation(); + onViewMetadata(folder); + return; + } + onToggleExpand(folder.id); + }} + onKeyDown={(e) => { + if (e.key === "Enter" || e.key === " ") { + e.preventDefault(); + onToggleExpand(folder.id); + } + }} onDoubleClick={(e) => { e.stopPropagation(); startRename(); diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx index 5945edccb..f34b9a0c2 100644 --- a/surfsense_web/components/documents/FolderTreeView.tsx +++ b/surfsense_web/components/documents/FolderTreeView.tsx @@ -43,6 +43,7 @@ interface FolderTreeViewProps { watchedFolderIds?: Set; onRescanFolder?: (folder: FolderDisplay) => void; onStopWatchingFolder?: (folder: FolderDisplay) => void; + onViewFolderMetadata?: (folder: FolderDisplay) => void; } function groupBy(items: T[], keyFn: (item: T) => string | number): Record { @@ -79,6 +80,7 @@ export function FolderTreeView({ watchedFolderIds, onRescanFolder, onStopWatchingFolder, + onViewFolderMetadata, }: FolderTreeViewProps) { const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? "root"), [folders]); @@ -210,10 +212,11 @@ export function FolderTreeView({ siblingPositions={siblingPositions} contextMenuOpen={openContextMenuId === `folder-${f.id}`} onContextMenuOpenChange={(open) => setOpenContextMenuId(open ? 
`folder-${f.id}` : null)} - isWatched={watchedFolderIds?.has(f.id)} - onRescan={onRescanFolder} - onStopWatching={onStopWatchingFolder} - /> + isWatched={watchedFolderIds?.has(f.id)} + onRescan={onRescanFolder} + onStopWatching={onStopWatchingFolder} + onViewMetadata={onViewFolderMetadata} + /> ); if (isExpanded) { diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 202d170d9..f9d32bf98 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -21,6 +21,7 @@ import type { DocumentNodeDoc } from "@/components/documents/DocumentNode"; import type { FolderDisplay } from "@/components/documents/FolderNode"; import { FolderPickerDialog } from "@/components/documents/FolderPickerDialog"; import { FolderTreeView } from "@/components/documents/FolderTreeView"; +import { JsonMetadataViewer } from "@/components/json-metadata-viewer"; import { EXPORT_FILE_EXTENSIONS } from "@/components/shared/ExportMenuItems"; import { AlertDialog, @@ -95,12 +96,46 @@ export function DocumentsSidebar({ const [activeTypes, setActiveTypes] = useState([]); const [watchedFolderIds, setWatchedFolderIds] = useState>(new Set()); + const [metadataFolder, setMetadataFolder] = useState(null); + const [metadataJson, setMetadataJson] = useState | null>(null); + const [metadataLoading, setMetadataLoading] = useState(false); + useEffect(() => { const api = typeof window !== "undefined" ? 
window.electronAPI : null; if (!api?.getWatchedFolders) return; async function loadWatchedIds() { const folders = await api!.getWatchedFolders(); + + if (folders.length === 0) { + try { + const backendFolders = await documentsApiService.getWatchedFolders(searchSpaceId); + for (const bf of backendFolders) { + const meta = bf.metadata as Record | null; + if (!meta?.watched || !meta.folder_path) continue; + await api!.addWatchedFolder({ + path: meta.folder_path as string, + name: bf.name, + rootFolderId: bf.id, + searchSpaceId: bf.search_space_id, + excludePatterns: (meta.exclude_patterns as string[]) ?? [], + fileExtensions: (meta.file_extensions as string[] | null) ?? null, + active: true, + }); + } + const recovered = await api!.getWatchedFolders(); + const ids = new Set( + recovered + .filter((f) => f.rootFolderId != null) + .map((f) => f.rootFolderId as number) + ); + setWatchedFolderIds(ids); + return; + } catch (err) { + console.error("[DocumentsSidebar] Recovery from backend failed:", err); + } + } + const ids = new Set( folders .filter((f) => f.rootFolderId != null) @@ -110,7 +145,7 @@ export function DocumentsSidebar({ } loadWatchedIds(); - }, []); + }, [searchSpaceId]); const { mutateAsync: deleteDocumentMutation } = useAtomValue(deleteDocumentMutationAtom); const [sidebarDocs, setSidebarDocs] = useAtom(sidebarSelectedDocumentsAtom); @@ -318,11 +353,30 @@ export function DocumentsSidebar({ } await api.removeWatchedFolder(matched.path); + try { + await foldersApiService.stopWatching(folder.id); + } catch (err) { + console.error("[DocumentsSidebar] Failed to clear watched metadata:", err); + } toast.success(`Stopped watching: ${matched.name}`); }, [] ); + const handleViewFolderMetadata = useCallback(async (folder: FolderDisplay) => { + setMetadataFolder(folder); + setMetadataLoading(true); + try { + const fullFolder = await foldersApiService.getFolder(folder.id); + setMetadataJson((fullFolder.metadata as Record) ?? 
{}); + } catch (err) { + console.error("[DocumentsSidebar] Failed to fetch folder metadata:", err); + setMetadataJson({ error: "Failed to load folder metadata" }); + } finally { + setMetadataLoading(false); + } + }, []); + const handleRenameFolder = useCallback(async (folder: FolderDisplay, newName: string) => { try { await foldersApiService.updateFolder(folder.id, { name: newName }); @@ -801,11 +855,26 @@ export function DocumentsSidebar({ onReorderFolder={handleReorderFolder} watchedFolderIds={watchedFolderIds} onRescanFolder={handleRescanFolder} - onStopWatchingFolder={handleStopWatching} - /> -
+ onStopWatchingFolder={handleStopWatching} + onViewFolderMetadata={handleViewFolderMetadata} + /> +
- { + if (!open) { + setMetadataFolder(null); + setMetadataJson(null); + setMetadataLoading(false); + } + }} + /> + + { + return baseApiService.get(`/api/v1/documents/watched-folders?search_space_id=${searchSpaceId}`, folderListResponse); + }; + /** * Delete a document */ diff --git a/surfsense_web/lib/apis/folders-api.service.ts b/surfsense_web/lib/apis/folders-api.service.ts index 99d9ad774..2e535d615 100644 --- a/surfsense_web/lib/apis/folders-api.service.ts +++ b/surfsense_web/lib/apis/folders-api.service.ts @@ -85,6 +85,10 @@ class FoldersApiService { return baseApiService.delete(`/api/v1/folders/${folderId}`, folderDeleteResponse); }; + stopWatching = async (folderId: number) => { + return baseApiService.patch(`/api/v1/folders/${folderId}/watched`, undefined); + }; + moveDocument = async (documentId: number, request: DocumentMoveRequest) => { const parsed = documentMoveRequest.safeParse(request); if (!parsed.success) { From 9c1d9357c4e3b0fe5eb25f737069d6494cea2188 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 2 Apr 2026 20:19:16 +0200 Subject: [PATCH 053/202] refactor: fix dynamic tooltip resizing and split autocomplete into SPR modules --- surfsense_desktop/electron-builder.yml | 12 +- surfsense_desktop/package.json | 1 + surfsense_desktop/pnpm-lock.yaml | 3 + surfsense_desktop/scripts/build-electron.mjs | 2 +- surfsense_desktop/src/main.ts | 3 +- .../index.ts} | 181 ++++++++---------- .../modules/autocomplete/keystroke-buffer.ts | 76 ++++++++ .../modules/autocomplete/suggestion-window.ts | 103 ++++++++++ surfsense_desktop/src/modules/platform.ts | 49 ----- .../app/desktop/permissions/page.tsx | 13 +- surfsense_web/app/desktop/suggestion/page.tsx | 6 +- .../app/desktop/suggestion/suggestion.css | 70 ++++--- 12 files changed, 326 insertions(+), 193 deletions(-) rename surfsense_desktop/src/modules/{autocomplete.ts => autocomplete/index.ts} (55%) create mode 100644 surfsense_desktop/src/modules/autocomplete/keystroke-buffer.ts create mode 100644 
surfsense_desktop/src/modules/autocomplete/suggestion-window.ts diff --git a/surfsense_desktop/electron-builder.yml b/surfsense_desktop/electron-builder.yml index 74c69d223..115b69c8e 100644 --- a/surfsense_desktop/electron-builder.yml +++ b/surfsense_desktop/electron-builder.yml @@ -10,13 +10,13 @@ files: - dist/**/* - "!node_modules" - node_modules/uiohook-napi/**/* - - "!node_modules/uiohook-napi/build" - "!node_modules/uiohook-napi/src" - "!node_modules/uiohook-napi/libuiohook" - "!node_modules/uiohook-napi/binding.gyp" - node_modules/node-gyp-build/**/* + - node_modules/bindings/**/* + - node_modules/file-uri-to-path/**/* - node_modules/node-mac-permissions/**/* - - "!node_modules/node-mac-permissions/build" - "!node_modules/node-mac-permissions/src" - "!node_modules/node-mac-permissions/binding.gyp" - "!src" @@ -41,13 +41,19 @@ asarUnpack: - "**/*.node" - "node_modules/uiohook-napi/**/*" - "node_modules/node-gyp-build/**/*" + - "node_modules/bindings/**/*" + - "node_modules/file-uri-to-path/**/*" - "node_modules/node-mac-permissions/**/*" mac: icon: assets/icon.icns category: public.app-category.productivity artifactName: "${productName}-${version}-${arch}.${ext}" - hardenedRuntime: true + hardenedRuntime: false gatekeeperAssess: false + extendInfo: + NSInputMonitoringUsageDescription: "SurfSense uses input monitoring to provide system-wide autocomplete suggestions as you type." + NSAccessibilityUsageDescription: "SurfSense uses accessibility features to read text fields and insert suggestions." + NSAppleEventsUsageDescription: "SurfSense uses Apple Events to read text from the active application and insert autocomplete suggestions." 
target: - target: dmg arch: [x64, arm64] diff --git a/surfsense_desktop/package.json b/surfsense_desktop/package.json index a2e452b7c..01a63b265 100644 --- a/surfsense_desktop/package.json +++ b/surfsense_desktop/package.json @@ -29,6 +29,7 @@ "wait-on": "^9.0.4" }, "dependencies": { + "bindings": "^1.5.0", "electron-updater": "^6.8.3", "get-port-please": "^3.2.0", "node-mac-permissions": "^2.5.0", diff --git a/surfsense_desktop/pnpm-lock.yaml b/surfsense_desktop/pnpm-lock.yaml index 82bad9456..d0b453d31 100644 --- a/surfsense_desktop/pnpm-lock.yaml +++ b/surfsense_desktop/pnpm-lock.yaml @@ -8,6 +8,9 @@ importers: .: dependencies: + bindings: + specifier: ^1.5.0 + version: 1.5.0 electron-updater: specifier: ^6.8.3 version: 6.8.3 diff --git a/surfsense_desktop/scripts/build-electron.mjs b/surfsense_desktop/scripts/build-electron.mjs index 83d941dd2..c2869ec46 100644 --- a/surfsense_desktop/scripts/build-electron.mjs +++ b/surfsense_desktop/scripts/build-electron.mjs @@ -104,7 +104,7 @@ async function buildElectron() { bundle: true, platform: 'node', target: 'node18', - external: ['electron', 'uiohook-napi', 'node-mac-permissions'], + external: ['electron', 'uiohook-napi', 'node-mac-permissions', 'bindings', 'file-uri-to-path'], sourcemap: true, minify: false, define: { diff --git a/surfsense_desktop/src/main.ts b/surfsense_desktop/src/main.ts index 9623be82e..c96453c6d 100644 --- a/surfsense_desktop/src/main.ts +++ b/surfsense_desktop/src/main.ts @@ -19,7 +19,8 @@ if (!setupDeepLinks()) { registerIpcHandlers(); function getInitialPath(): string { - if (process.platform === 'darwin' && !allPermissionsGranted()) { + const granted = allPermissionsGranted(); + if (process.platform === 'darwin' && !granted) { return '/desktop/permissions'; } return '/dashboard'; diff --git a/surfsense_desktop/src/modules/autocomplete.ts b/surfsense_desktop/src/modules/autocomplete/index.ts similarity index 55% rename from surfsense_desktop/src/modules/autocomplete.ts rename to 
surfsense_desktop/src/modules/autocomplete/index.ts index 2b877723f..2ea37d051 100644 --- a/surfsense_desktop/src/modules/autocomplete.ts +++ b/surfsense_desktop/src/modules/autocomplete/index.ts @@ -1,20 +1,19 @@ -import { BrowserWindow, clipboard, ipcMain, screen, shell } from 'electron'; -import path from 'path'; -import { IPC_CHANNELS } from '../ipc/channels'; -import { allPermissionsGranted } from './permissions'; -import { getFieldContent, getFrontmostApp, hasAccessibilityPermission, simulatePaste } from './platform'; -import { getServerPort } from './server'; -import { getMainWindow } from './window'; +import { clipboard, ipcMain, screen } from 'electron'; +import { IPC_CHANNELS } from '../../ipc/channels'; +import { getFrontmostApp, hasAccessibilityPermission, simulatePaste } from '../platform'; +import { getMainWindow } from '../window'; +import { + appendToBuffer, buildKeycodeMap, getBuffer, getBufferTrimmed, + getLastTrackedApp, removeLastChar, resetBuffer, resolveChar, setLastTrackedApp, +} from './keystroke-buffer'; +import { createSuggestionWindow, destroySuggestion, getSuggestionWindow } from './suggestion-window'; const DEBOUNCE_MS = 600; -const TOOLTIP_WIDTH = 420; -const TOOLTIP_HEIGHT = 140; let uIOhook: any = null; let UiohookKey: any = {}; let IGNORED_KEYCODES: Set = new Set(); -let suggestionWindow: BrowserWindow | null = null; let debounceTimer: ReturnType | null = null; let hookStarted = false; let autocompleteEnabled = true; @@ -38,12 +37,8 @@ function loadUiohook(): boolean { UiohookKey.F5, UiohookKey.F6, UiohookKey.F7, UiohookKey.F8, UiohookKey.F9, UiohookKey.F10, UiohookKey.F11, UiohookKey.F12, UiohookKey.PrintScreen, - UiohookKey.Insert, UiohookKey.Delete, - UiohookKey.Home, UiohookKey.End, - UiohookKey.PageUp, UiohookKey.PageDown, - UiohookKey.ArrowUp, UiohookKey.ArrowDown, - UiohookKey.ArrowLeft, UiohookKey.ArrowRight, ]); + buildKeycodeMap(); console.log('[autocomplete] uiohook-napi loaded'); return true; } catch (err) { @@ -52,70 
+47,6 @@ function loadUiohook(): boolean { } } -function destroySuggestion(): void { - if (suggestionWindow && !suggestionWindow.isDestroyed()) { - suggestionWindow.close(); - } - suggestionWindow = null; -} - -function clampToScreen(x: number, y: number, w: number, h: number): { x: number; y: number } { - const display = screen.getDisplayNearestPoint({ x, y }); - const { x: dx, y: dy, width: dw, height: dh } = display.workArea; - return { - x: Math.max(dx, Math.min(x, dx + dw - w)), - y: Math.max(dy, Math.min(y, dy + dh - h)), - }; -} - -function createSuggestionWindow(x: number, y: number): BrowserWindow { - destroySuggestion(); - - const pos = clampToScreen(x, y + 20, TOOLTIP_WIDTH, TOOLTIP_HEIGHT); - - suggestionWindow = new BrowserWindow({ - width: TOOLTIP_WIDTH, - height: TOOLTIP_HEIGHT, - x: pos.x, - y: pos.y, - frame: false, - transparent: true, - focusable: false, - alwaysOnTop: true, - skipTaskbar: true, - resizable: false, - hasShadow: true, - type: 'panel', - webPreferences: { - preload: path.join(__dirname, 'preload.js'), - contextIsolation: true, - nodeIntegration: false, - sandbox: true, - }, - show: false, - }); - - suggestionWindow.loadURL(`http://localhost:${getServerPort()}/desktop/suggestion?t=${Date.now()}`); - - suggestionWindow.once('ready-to-show', () => { - suggestionWindow?.showInactive(); - }); - - suggestionWindow.webContents.setWindowOpenHandler(({ url }) => { - if (url.startsWith('http://localhost')) { - return { action: 'allow' }; - } - shell.openExternal(url); - return { action: 'deny' }; - }); - - suggestionWindow.on('closed', () => { - suggestionWindow = null; - }); - - return suggestionWindow; -} - function clearDebounce(): void { if (debounceTimer) { clearTimeout(debounceTimer); @@ -128,10 +59,24 @@ function isSurfSenseWindow(): boolean { return app === 'Electron' || app === 'SurfSense' || app === 'surfsense-desktop'; } -function onKeyDown(event: { keycode: number; ctrlKey?: boolean; metaKey?: boolean; altKey?: boolean }): void { 
+function onKeyDown(event: { + keycode: number; + shiftKey?: boolean; + ctrlKey?: boolean; + metaKey?: boolean; + altKey?: boolean; +}): void { if (!autocompleteEnabled) return; - if (event.keycode === UiohookKey.Tab && suggestionWindow && !suggestionWindow.isDestroyed()) { + const currentApp = getFrontmostApp(); + if (currentApp !== getLastTrackedApp()) { + resetBuffer(); + setLastTrackedApp(currentApp); + } + + const win = getSuggestionWindow(); + + if (event.keycode === UiohookKey.Tab && win && !win.isDestroyed()) { if (pendingSuggestionText) { acceptAndInject(pendingSuggestionText); } @@ -139,7 +84,7 @@ function onKeyDown(event: { keycode: number; ctrlKey?: boolean; metaKey?: boolea } if (event.keycode === UiohookKey.Escape) { - if (suggestionWindow && !suggestionWindow.isDestroyed()) { + if (win && !win.isDestroyed()) { destroySuggestion(); pendingSuggestionText = ''; } @@ -147,11 +92,41 @@ function onKeyDown(event: { keycode: number; ctrlKey?: boolean; metaKey?: boolea return; } - if (IGNORED_KEYCODES.has(event.keycode)) return; - if (event.ctrlKey || event.metaKey || event.altKey) return; - if (isSurfSenseWindow()) return; + if (currentApp === 'Electron' || currentApp === 'SurfSense' || currentApp === 'surfsense-desktop') { + return; + } - if (suggestionWindow && !suggestionWindow.isDestroyed()) { + if (event.ctrlKey || event.metaKey || event.altKey) { + resetBuffer(); + clearDebounce(); + return; + } + + if (event.keycode === UiohookKey.Backspace) { + removeLastChar(); + } else if (event.keycode === UiohookKey.Delete) { + // forward delete doesn't affect our trailing buffer + } else if (event.keycode === UiohookKey.Enter) { + appendToBuffer('\n'); + } else if (event.keycode === UiohookKey.Space) { + appendToBuffer(' '); + } else if ( + event.keycode === UiohookKey.ArrowLeft || event.keycode === UiohookKey.ArrowRight || + event.keycode === UiohookKey.ArrowUp || event.keycode === UiohookKey.ArrowDown || + event.keycode === UiohookKey.Home || event.keycode === 
UiohookKey.End || + event.keycode === UiohookKey.PageUp || event.keycode === UiohookKey.PageDown + ) { + resetBuffer(); + clearDebounce(); + return; + } else if (IGNORED_KEYCODES.has(event.keycode)) { + return; + } else { + const ch = resolveChar(event.keycode, !!event.shiftKey); + if (ch) appendToBuffer(ch); + } + + if (win && !win.isDestroyed()) { destroySuggestion(); } @@ -161,13 +136,16 @@ function onKeyDown(event: { keycode: number; ctrlKey?: boolean; metaKey?: boolea }, DEBOUNCE_MS); } +function onMouseClick(): void { + resetBuffer(); +} + async function triggerAutocomplete(): Promise { if (!hasAccessibilityPermission()) return; if (isSurfSenseWindow()) return; - const fieldContent = getFieldContent(); - if (!fieldContent || !fieldContent.text.trim()) return; - if (fieldContent.text.trim().length < 5) return; + const text = getBufferTrimmed(); + if (!text || text.length < 5) return; sourceApp = getFrontmostApp(); savedClipboard = clipboard.readText(); @@ -186,13 +164,16 @@ async function triggerAutocomplete(): Promise { } win.webContents.once('did-finish-load', () => { - if (suggestionWindow && !suggestionWindow.isDestroyed()) { - suggestionWindow.webContents.send(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, { - text: fieldContent.text, - cursorPosition: fieldContent.cursorPosition, - searchSpaceId, - }); - } + const sw = getSuggestionWindow(); + setTimeout(() => { + if (sw && !sw.isDestroyed()) { + sw.webContents.send(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, { + text: getBuffer(), + cursorPosition: getBuffer().length, + searchSpaceId, + }); + } + }, 300); }); } @@ -209,6 +190,7 @@ async function acceptAndInject(text: string): Promise { simulatePaste(); await new Promise((r) => setTimeout(r, 100)); clipboard.writeText(savedClipboard); + appendToBuffer(text); } catch { clipboard.writeText(savedClipboard); } @@ -238,21 +220,16 @@ function registerIpcHandlers(): void { export function registerAutocomplete(): void { registerIpcHandlers(); - if (!allPermissionsGranted()) { - 
console.log('[autocomplete] Permissions not granted — hook not started'); - return; - } - if (!loadUiohook()) { console.error('[autocomplete] Cannot start: uiohook-napi failed to load'); return; } uIOhook.on('keydown', onKeyDown); + uIOhook.on('click', onMouseClick); try { uIOhook.start(); hookStarted = true; - console.log('[autocomplete] uIOhook started'); } catch (err) { console.error('[autocomplete] uIOhook.start() failed:', err); } diff --git a/surfsense_desktop/src/modules/autocomplete/keystroke-buffer.ts b/surfsense_desktop/src/modules/autocomplete/keystroke-buffer.ts new file mode 100644 index 000000000..ca232d307 --- /dev/null +++ b/surfsense_desktop/src/modules/autocomplete/keystroke-buffer.ts @@ -0,0 +1,76 @@ +const MAX_BUFFER_LENGTH = 4000; +const KEYCODE_TO_CHAR: Record = {}; + +let keystrokeBuffer = ''; +let lastTrackedApp = ''; + +export function buildKeycodeMap(): void { + const letters: [string, number][] = [ + ['q', 16], ['w', 17], ['e', 18], ['r', 19], ['t', 20], + ['y', 21], ['u', 22], ['i', 23], ['o', 24], ['p', 25], + ['a', 30], ['s', 31], ['d', 32], ['f', 33], ['g', 34], + ['h', 35], ['j', 36], ['k', 37], ['l', 38], + ['z', 44], ['x', 45], ['c', 46], ['v', 47], + ['b', 48], ['n', 49], ['m', 50], + ]; + for (const [ch, code] of letters) { + KEYCODE_TO_CHAR[code] = [ch, ch.toUpperCase()]; + } + + const digits: [string, string, number][] = [ + ['1', '!', 2], ['2', '@', 3], ['3', '#', 4], ['4', '$', 5], + ['5', '%', 6], ['6', '^', 7], ['7', '&', 8], ['8', '*', 9], + ['9', '(', 10], ['0', ')', 11], + ]; + for (const [norm, shifted, code] of digits) { + KEYCODE_TO_CHAR[code] = [norm, shifted]; + } + + const punctuation: [string, string, number][] = [ + [';', ':', 39], ['=', '+', 13], [',', '<', 51], ['-', '_', 12], + ['.', '>', 52], ['/', '?', 53], ['`', '~', 41], ['[', '{', 26], + ['\\', '|', 43], [']', '}', 27], ["'", '"', 40], + ]; + for (const [norm, shifted, code] of punctuation) { + KEYCODE_TO_CHAR[code] = [norm, shifted]; + } +} + +export 
function resetBuffer(): void { + keystrokeBuffer = ''; +} + +export function appendToBuffer(char: string): void { + keystrokeBuffer += char; + if (keystrokeBuffer.length > MAX_BUFFER_LENGTH) { + keystrokeBuffer = keystrokeBuffer.slice(-MAX_BUFFER_LENGTH); + } +} + +export function removeLastChar(): void { + if (keystrokeBuffer.length > 0) { + keystrokeBuffer = keystrokeBuffer.slice(0, -1); + } +} + +export function getBuffer(): string { + return keystrokeBuffer; +} + +export function getBufferTrimmed(): string { + return keystrokeBuffer.trim(); +} + +export function getLastTrackedApp(): string { + return lastTrackedApp; +} + +export function setLastTrackedApp(app: string): void { + lastTrackedApp = app; +} + +export function resolveChar(keycode: number, shift: boolean): string | null { + const mapping = KEYCODE_TO_CHAR[keycode]; + if (!mapping) return null; + return shift ? mapping[1] : mapping[0]; +} diff --git a/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts b/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts new file mode 100644 index 000000000..f03930cf6 --- /dev/null +++ b/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts @@ -0,0 +1,103 @@ +import { BrowserWindow, screen, shell } from 'electron'; +import path from 'path'; +import { getServerPort } from '../server'; + +const TOOLTIP_WIDTH = 420; +const TOOLTIP_HEIGHT = 38; +const MAX_HEIGHT = 400; + +let suggestionWindow: BrowserWindow | null = null; +let resizeTimer: ReturnType | null = null; + +function clampToScreen(x: number, y: number, w: number, h: number): { x: number; y: number } { + const display = screen.getDisplayNearestPoint({ x, y }); + const { x: dx, y: dy, width: dw, height: dh } = display.workArea; + return { + x: Math.max(dx, Math.min(x, dx + dw - w)), + y: Math.max(dy, Math.min(y, dy + dh - h)), + }; +} + +function stopResizePolling(): void { + if (resizeTimer) { clearInterval(resizeTimer); resizeTimer = null; } +} + +function 
startResizePolling(win: BrowserWindow): void { + stopResizePolling(); + let lastH = 0; + resizeTimer = setInterval(async () => { + if (!win || win.isDestroyed()) { stopResizePolling(); return; } + try { + const h: number = await win.webContents.executeJavaScript( + `document.body.scrollHeight` + ); + if (h > 0 && h !== lastH) { + lastH = h; + const clamped = Math.min(h, MAX_HEIGHT); + const bounds = win.getBounds(); + win.setBounds({ x: bounds.x, y: bounds.y, width: TOOLTIP_WIDTH, height: clamped }); + } + } catch {} + }, 150); +} + +export function getSuggestionWindow(): BrowserWindow | null { + return suggestionWindow; +} + +export function destroySuggestion(): void { + stopResizePolling(); + if (suggestionWindow && !suggestionWindow.isDestroyed()) { + suggestionWindow.close(); + } + suggestionWindow = null; +} + +export function createSuggestionWindow(x: number, y: number): BrowserWindow { + destroySuggestion(); + + const pos = clampToScreen(x, y + 20, TOOLTIP_WIDTH, TOOLTIP_HEIGHT); + + suggestionWindow = new BrowserWindow({ + width: TOOLTIP_WIDTH, + height: TOOLTIP_HEIGHT, + x: pos.x, + y: pos.y, + frame: false, + transparent: true, + focusable: false, + alwaysOnTop: true, + skipTaskbar: true, + hasShadow: true, + type: 'panel', + webPreferences: { + preload: path.join(__dirname, '..', 'preload.js'), + contextIsolation: true, + nodeIntegration: false, + sandbox: true, + }, + show: false, + }); + + suggestionWindow.loadURL(`http://localhost:${getServerPort()}/desktop/suggestion?t=${Date.now()}`); + + suggestionWindow.once('ready-to-show', () => { + suggestionWindow?.showInactive(); + if (suggestionWindow) startResizePolling(suggestionWindow); + }); + + suggestionWindow.webContents.setWindowOpenHandler(({ url }) => { + if (url.startsWith('http://localhost')) { + return { action: 'allow' }; + } + shell.openExternal(url); + return { action: 'deny' }; + }); + + suggestionWindow.on('closed', () => { + stopResizePolling(); + suggestionWindow = null; + }); + + return 
suggestionWindow; +} diff --git a/surfsense_desktop/src/modules/platform.ts b/surfsense_desktop/src/modules/platform.ts index 262866d07..1ab0c38fb 100644 --- a/surfsense_desktop/src/modules/platform.ts +++ b/surfsense_desktop/src/modules/platform.ts @@ -19,20 +19,6 @@ export function getFrontmostApp(): string { return ''; } -export function getSelectedText(): string { - try { - if (process.platform === 'darwin') { - return execSync( - 'osascript -e \'tell application "System Events" to get value of attribute "AXSelectedText" of focused UI element of first application process whose frontmost is true\'' - ).toString().trim(); - } - // Windows: no reliable accessibility API for selected text across apps - } catch { - return ''; - } - return ''; -} - export function simulateCopy(): void { if (process.platform === 'darwin') { execSync('osascript -e \'tell application "System Events" to keystroke "c" using command down\''); @@ -58,38 +44,3 @@ export function hasAccessibilityPermission(): boolean { if (process.platform !== 'darwin') return true; return systemPreferences.isTrustedAccessibilityClient(false); } - -export interface FieldContent { - text: string; - cursorPosition: number; -} - -export function getFieldContent(): FieldContent | null { - if (process.platform !== 'darwin') return null; - - try { - const text = execSync( - 'osascript -e \'tell application "System Events" to get value of attribute "AXValue" of focused UI element of first application process whose frontmost is true\'', - { timeout: 500 } - ).toString().trim(); - - let cursorPosition = text.length; - try { - const rangeStr = execSync( - 'osascript -e \'tell application "System Events" to get value of attribute "AXSelectedTextRange" of focused UI element of first application process whose frontmost is true\'', - { timeout: 500 } - ).toString().trim(); - - const locationMatch = rangeStr.match(/location[:\s]*(\d+)/i); - if (locationMatch) { - cursorPosition = parseInt(locationMatch[1], 10); - } - } 
catch { - // Fall back to end of text - } - - return { text, cursorPosition }; - } catch { - return null; - } -} diff --git a/surfsense_web/app/desktop/permissions/page.tsx b/surfsense_web/app/desktop/permissions/page.tsx index 2bcdc42df..8bde63357 100644 --- a/surfsense_web/app/desktop/permissions/page.tsx +++ b/surfsense_web/app/desktop/permissions/page.tsx @@ -169,11 +169,14 @@ export default function DesktopPermissionsPage() { > Open System Settings - {status === "denied" && ( -

- Toggle SurfSense on in System Settings to continue. -

- )} + {status === "denied" && ( +

+ Toggle SurfSense on in System Settings to continue. +

+ )} +

+ If SurfSense doesn't appear in the list, click + and select it from Applications. +

)}
diff --git a/surfsense_web/app/desktop/suggestion/page.tsx b/surfsense_web/app/desktop/suggestion/page.tsx index 14dfab3af..69a19e3f1 100644 --- a/surfsense_web/app/desktop/suggestion/page.tsx +++ b/surfsense_web/app/desktop/suggestion/page.tsx @@ -151,9 +151,9 @@ export default function SuggestionPage() {

{suggestion}

- Tab accept - · - Esc dismiss + Tab accept + + Esc dismiss
); diff --git a/surfsense_web/app/desktop/suggestion/suggestion.css b/surfsense_web/app/desktop/suggestion/suggestion.css index e9471e7f8..0d3332103 100644 --- a/surfsense_web/app/desktop/suggestion/suggestion.css +++ b/surfsense_web/app/desktop/suggestion/suggestion.css @@ -1,8 +1,16 @@ +html, body { + margin: 0 !important; + padding: 0 !important; + background: transparent !important; + overflow: hidden !important; + height: auto !important; + width: 100% !important; +} + .suggestion-body { margin: 0; padding: 0; background: transparent; - overflow: hidden; font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; -webkit-font-smoothing: antialiased; user-select: none; @@ -10,69 +18,73 @@ } .suggestion-tooltip { - background: rgba(30, 30, 30, 0.95); - backdrop-filter: blur(12px); - -webkit-backdrop-filter: blur(12px); - border: 1px solid rgba(255, 255, 255, 0.1); - border-radius: 10px; - padding: 10px 14px; + background: #1e1e1e; + border: 1px solid #3c3c3c; + border-radius: 8px; + padding: 8px 12px; margin: 4px; max-width: 400px; - box-shadow: 0 8px 32px rgba(0, 0, 0, 0.4), - 0 2px 8px rgba(0, 0, 0, 0.2); + box-shadow: 0 4px 16px rgba(0, 0, 0, 0.5); } .suggestion-text { - color: rgba(255, 255, 255, 0.9); + color: #d4d4d4; font-size: 13px; - line-height: 1.5; - margin: 0 0 8px 0; + line-height: 1.45; + margin: 0 0 6px 0; word-wrap: break-word; white-space: pre-wrap; } .suggestion-hint { - color: rgba(255, 255, 255, 0.4); + color: #666; font-size: 11px; display: flex; align-items: center; - gap: 4px; + gap: 6px; + border-top: 1px solid #2a2a2a; + padding-top: 6px; } -.suggestion-key { - background: rgba(255, 255, 255, 0.1); - border: 1px solid rgba(255, 255, 255, 0.15); +.suggestion-hint kbd { + background: #2a2a2a; + border: 1px solid #3c3c3c; border-radius: 3px; - padding: 1px 5px; + padding: 0 4px; + font-family: inherit; font-size: 10px; - font-weight: 500; - color: rgba(255, 255, 255, 0.6); + font-weight: 600; + color: #999; + 
line-height: 18px; } .suggestion-separator { - margin: 0 2px; + width: 1px; + height: 10px; + background: #333; } .suggestion-error { - border-color: rgba(255, 80, 80, 0.3); + border-color: #5c2626; } .suggestion-error-text { - color: rgba(255, 120, 120, 0.9); + color: #f48771; font-size: 12px; } .suggestion-loading { display: flex; - gap: 4px; - padding: 4px 0; + gap: 5px; + padding: 2px 0; + justify-content: center; } .suggestion-dot { - width: 5px; - height: 5px; + width: 4px; + height: 4px; border-radius: 50%; - background: rgba(255, 255, 255, 0.4); + background: #666; animation: suggestion-pulse 1.2s infinite ease-in-out; } @@ -91,6 +103,6 @@ } 40% { opacity: 1; - transform: scale(1); + transform: scale(1.1); } } From 3e68d4aa3ed04c87f155a4da08a6610251755f74 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Thu, 2 Apr 2026 20:38:09 +0200 Subject: [PATCH 054/202] refactor: extract autocomplete service and fix tooltip screen-edge positioning --- .../app/routes/autocomplete_routes.py | 109 +---------------- .../app/services/autocomplete_service.py | 110 ++++++++++++++++++ .../modules/autocomplete/suggestion-window.ts | 27 +++-- 3 files changed, 130 insertions(+), 116 deletions(-) create mode 100644 surfsense_backend/app/services/autocomplete_service.py diff --git a/surfsense_backend/app/routes/autocomplete_routes.py b/surfsense_backend/app/routes/autocomplete_routes.py index 9a285a723..68c56d0e0 100644 --- a/surfsense_backend/app/routes/autocomplete_routes.py +++ b/surfsense_backend/app/routes/autocomplete_routes.py @@ -1,118 +1,14 @@ -import logging -from typing import AsyncGenerator - from fastapi import APIRouter, Depends, Query from fastapi.responses import StreamingResponse -from langchain_core.messages import HumanMessage, SystemMessage from sqlalchemy.ext.asyncio import AsyncSession from app.db import User, get_async_session -from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever -from app.services.llm_service import get_agent_llm +from 
app.services.autocomplete_service import stream_autocomplete from app.services.new_streaming_service import VercelStreamingService from app.users import current_active_user -logger = logging.getLogger(__name__) - router = APIRouter(prefix="/autocomplete", tags=["autocomplete"]) -AUTOCOMPLETE_SYSTEM_PROMPT = """You are an inline text autocomplete engine. Your job is to complete the user's text naturally. - -Rules: -- Output ONLY the continuation text. Do NOT repeat what the user already typed. -- Keep completions concise: 1-3 sentences maximum. -- Match the user's tone, style, and language. -- If knowledge base context is provided, use it to make the completion factually accurate and personalized. -- Do NOT add quotes, explanations, or meta-commentary. -- Do NOT start with a space unless grammatically required. -- If you cannot produce a useful completion, output nothing.""" - -KB_CONTEXT_TEMPLATE = """ -Relevant knowledge base context (use this to personalize the completion): ---- -{kb_context} ---- -""" - - -async def _stream_autocomplete( - text: str, - cursor_position: int, - search_space_id: int, - session: AsyncSession, -) -> AsyncGenerator[str, None]: - """Stream an autocomplete response with KB context.""" - streaming_service = VercelStreamingService() - - try: - # Text before cursor is what we're completing - text_before_cursor = text[:cursor_position] if cursor_position >= 0 else text - - if not text_before_cursor.strip(): - yield streaming_service.format_message_start() - yield streaming_service.format_finish() - yield streaming_service.format_done() - return - - # Fast KB lookup: vector-only search, top 3 chunks, no planner LLM - kb_context = "" - try: - retriever = ChucksHybridSearchRetriever(session) - chunks = await retriever.vector_search( - query_text=text_before_cursor[-200:], # last 200 chars for relevance - top_k=3, - search_space_id=search_space_id, - ) - if chunks: - kb_snippets = [] - for chunk in chunks: - content = getattr(chunk, "content", 
None) or getattr(chunk, "chunk_text", "") - if content: - kb_snippets.append(content[:300]) - if kb_snippets: - kb_context = KB_CONTEXT_TEMPLATE.format( - kb_context="\n\n".join(kb_snippets) - ) - except Exception as e: - logger.warning(f"KB search failed for autocomplete, proceeding without context: {e}") - - # Get the search space's configured LLM - llm = await get_agent_llm(session, search_space_id) - if not llm: - yield streaming_service.format_message_start() - error_msg = "No LLM configured for this search space" - yield streaming_service.format_error(error_msg) - yield streaming_service.format_done() - return - - system_prompt = AUTOCOMPLETE_SYSTEM_PROMPT - if kb_context: - system_prompt += kb_context - - messages = [ - SystemMessage(content=system_prompt), - HumanMessage(content=f"Complete this text:\n{text_before_cursor}"), - ] - - # Stream the response - yield streaming_service.format_message_start() - text_id = streaming_service.generate_text_id() - yield streaming_service.format_text_start(text_id) - - async for chunk in llm.astream(messages): - token = chunk.content if hasattr(chunk, "content") else str(chunk) - if token: - yield streaming_service.format_text_delta(text_id, token) - - yield streaming_service.format_text_end(text_id) - yield streaming_service.format_finish() - yield streaming_service.format_done() - - except Exception as e: - logger.error(f"Autocomplete streaming error: {e}") - yield streaming_service.format_error(str(e)) - yield streaming_service.format_done() - @router.post("/stream") async def autocomplete_stream( @@ -122,12 +18,11 @@ async def autocomplete_stream( user: User = Depends(current_active_user), session: AsyncSession = Depends(get_async_session), ): - """Stream an autocomplete suggestion based on the current text and KB context.""" if cursor_position < 0: cursor_position = len(text) return StreamingResponse( - _stream_autocomplete(text, cursor_position, search_space_id, session), + stream_autocomplete(text, 
cursor_position, search_space_id, session), media_type="text/event-stream", headers={ **VercelStreamingService.get_response_headers(), diff --git a/surfsense_backend/app/services/autocomplete_service.py b/surfsense_backend/app/services/autocomplete_service.py new file mode 100644 index 000000000..7c172275d --- /dev/null +++ b/surfsense_backend/app/services/autocomplete_service.py @@ -0,0 +1,110 @@ +import logging +from typing import AsyncGenerator + +from langchain_core.messages import HumanMessage, SystemMessage +from sqlalchemy.ext.asyncio import AsyncSession + +from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever +from app.services.llm_service import get_agent_llm +from app.services.new_streaming_service import VercelStreamingService + +logger = logging.getLogger(__name__) + +SYSTEM_PROMPT = """You are an inline text autocomplete engine. Your job is to complete the user's text naturally. + +Rules: +- Output ONLY the continuation text. Do NOT repeat what the user already typed. +- Keep completions concise: 1-3 sentences maximum. +- Match the user's tone, style, and language. +- If knowledge base context is provided, use it to make the completion factually accurate and personalized. +- Do NOT add quotes, explanations, or meta-commentary. +- Do NOT start with a space unless grammatically required. 
+- If you cannot produce a useful completion, output nothing.""" + +KB_CONTEXT_TEMPLATE = """ +Relevant knowledge base context (use this to personalize the completion): +--- +{kb_context} +--- +""" + + +async def _retrieve_kb_context( + session: AsyncSession, + text: str, + search_space_id: int, +) -> str: + try: + retriever = ChucksHybridSearchRetriever(session) + chunks = await retriever.vector_search( + query_text=text[-200:], + top_k=3, + search_space_id=search_space_id, + ) + if not chunks: + return "" + snippets = [] + for chunk in chunks: + content = getattr(chunk, "content", None) or getattr(chunk, "chunk_text", "") + if content: + snippets.append(content[:300]) + if not snippets: + return "" + return KB_CONTEXT_TEMPLATE.format(kb_context="\n\n".join(snippets)) + except Exception as e: + logger.warning(f"KB search failed for autocomplete, proceeding without context: {e}") + return "" + + +async def stream_autocomplete( + text: str, + cursor_position: int, + search_space_id: int, + session: AsyncSession, +) -> AsyncGenerator[str, None]: + """Build context, call the LLM, and yield SSE-formatted tokens.""" + streaming = VercelStreamingService() + text_before_cursor = text[:cursor_position] if cursor_position >= 0 else text + + if not text_before_cursor.strip(): + yield streaming.format_message_start() + yield streaming.format_finish() + yield streaming.format_done() + return + + kb_context = await _retrieve_kb_context(session, text_before_cursor, search_space_id) + + llm = await get_agent_llm(session, search_space_id) + if not llm: + yield streaming.format_message_start() + yield streaming.format_error("No LLM configured for this search space") + yield streaming.format_done() + return + + system_prompt = SYSTEM_PROMPT + if kb_context: + system_prompt += kb_context + + messages = [ + SystemMessage(content=system_prompt), + HumanMessage(content=f"Complete this text:\n{text_before_cursor}"), + ] + + try: + yield streaming.format_message_start() + text_id = 
streaming.generate_text_id() + yield streaming.format_text_start(text_id) + + async for chunk in llm.astream(messages): + token = chunk.content if hasattr(chunk, "content") else str(chunk) + if token: + yield streaming.format_text_delta(text_id, token) + + yield streaming.format_text_end(text_id) + yield streaming.format_finish() + yield streaming.format_done() + + except Exception as e: + logger.error(f"Autocomplete streaming error: {e}") + yield streaming.format_error(str(e)) + yield streaming.format_done() diff --git a/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts b/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts index f03930cf6..e8a2f3a91 100644 --- a/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts +++ b/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts @@ -8,14 +8,22 @@ const MAX_HEIGHT = 400; let suggestionWindow: BrowserWindow | null = null; let resizeTimer: ReturnType | null = null; +let cursorOrigin = { x: 0, y: 0 }; -function clampToScreen(x: number, y: number, w: number, h: number): { x: number; y: number } { - const display = screen.getDisplayNearestPoint({ x, y }); +const CURSOR_GAP = 20; + +function positionOnScreen(cursorX: number, cursorY: number, w: number, h: number): { x: number; y: number } { + const display = screen.getDisplayNearestPoint({ x: cursorX, y: cursorY }); const { x: dx, y: dy, width: dw, height: dh } = display.workArea; - return { - x: Math.max(dx, Math.min(x, dx + dw - w)), - y: Math.max(dy, Math.min(y, dy + dh - h)), - }; + + const x = Math.max(dx, Math.min(cursorX, dx + dw - w)); + + const spaceBelow = (dy + dh) - (cursorY + CURSOR_GAP); + const y = spaceBelow >= h + ? 
cursorY + CURSOR_GAP + : cursorY - h - CURSOR_GAP; + + return { x, y: Math.max(dy, y) }; } function stopResizePolling(): void { @@ -34,8 +42,8 @@ function startResizePolling(win: BrowserWindow): void { if (h > 0 && h !== lastH) { lastH = h; const clamped = Math.min(h, MAX_HEIGHT); - const bounds = win.getBounds(); - win.setBounds({ x: bounds.x, y: bounds.y, width: TOOLTIP_WIDTH, height: clamped }); + const pos = positionOnScreen(cursorOrigin.x, cursorOrigin.y, TOOLTIP_WIDTH, clamped); + win.setBounds({ x: pos.x, y: pos.y, width: TOOLTIP_WIDTH, height: clamped }); } } catch {} }, 150); @@ -55,8 +63,9 @@ export function destroySuggestion(): void { export function createSuggestionWindow(x: number, y: number): BrowserWindow { destroySuggestion(); + cursorOrigin = { x, y }; - const pos = clampToScreen(x, y + 20, TOOLTIP_WIDTH, TOOLTIP_HEIGHT); + const pos = positionOnScreen(x, y, TOOLTIP_WIDTH, TOOLTIP_HEIGHT); suggestionWindow = new BrowserWindow({ width: TOOLTIP_WIDTH, From f0a7c7134a7e81a8ee202e854afbfc98d9ad182a Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 00:09:17 +0530 Subject: [PATCH 055/202] feat: move endpoint above to retrieve watched folders based on search space ID --- .../app/routes/documents_routes.py | 60 ++++++++++--------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 81bbb1477..0acc1d30b 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -953,6 +953,37 @@ async def get_document_by_chunk_id( ) from e +@router.get("/documents/watched-folders", response_model=list["FolderRead"]) +async def get_watched_folders( + search_space_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Return root folders that are marked as watched 
(metadata->>'watched' = 'true').""" + from app.schemas import FolderRead # noqa: F811 + + await check_permission( + session, + user, + search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to read documents in this search space", + ) + + folders = ( + await session.execute( + select(Folder).where( + Folder.search_space_id == search_space_id, + Folder.parent_id.is_(None), + Folder.folder_metadata.isnot(None), + Folder.folder_metadata["watched"].astext == "true", + ) + ) + ).scalars().all() + + return folders + + @router.get("/documents/{document_id}", response_model=DocumentRead) async def read_document( document_id: int, @@ -1416,32 +1447,3 @@ async def folder_index_file( } -@router.get("/documents/watched-folders", response_model=list["FolderRead"]) -async def get_watched_folders( - search_space_id: int, - session: AsyncSession = Depends(get_async_session), - user: User = Depends(current_active_user), -): - """Return root folders that are marked as watched (metadata->>'watched' = 'true').""" - from app.schemas import FolderRead # noqa: F811 - - await check_permission( - session, - user, - search_space_id, - Permission.DOCUMENTS_READ.value, - "You don't have permission to read documents in this search space", - ) - - folders = ( - await session.execute( - select(Folder).where( - Folder.search_space_id == search_space_id, - Folder.parent_id.is_(None), - Folder.folder_metadata.isnot(None), - Folder.folder_metadata["watched"].astext == "true", - ) - ) - ).scalars().all() - - return folders From b46c5532b3fb02c3fd7277021d128e4f2f8a3180 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 00:28:24 +0530 Subject: [PATCH 056/202] feat: add unified file and folder browsing functionality with IPC channel integration --- surfsense_desktop/src/ipc/channels.ts | 2 + surfsense_desktop/src/ipc/handlers.ts | 8 + .../src/modules/folder-watcher.ts | 68 +++++ surfsense_desktop/src/preload.ts | 
4 + .../(manage)/components/DocumentsFilters.tsx | 23 +- .../layout/ui/sidebar/DocumentsSidebar.tsx | 51 +--- .../components/sources/DocumentUploadTab.tsx | 284 ++++++++++++++---- surfsense_web/types/window.d.ts | 15 + 8 files changed, 335 insertions(+), 120 deletions(-) diff --git a/surfsense_desktop/src/ipc/channels.ts b/surfsense_desktop/src/ipc/channels.ts index 66788d90e..19c26607d 100644 --- a/surfsense_desktop/src/ipc/channels.ts +++ b/surfsense_desktop/src/ipc/channels.ts @@ -17,4 +17,6 @@ export const IPC_CHANNELS = { FOLDER_SYNC_PAUSE: 'folder-sync:pause', FOLDER_SYNC_RESUME: 'folder-sync:resume', FOLDER_SYNC_RENDERER_READY: 'folder-sync:renderer-ready', + BROWSE_FILE_OR_FOLDER: 'browse:file-or-folder', + READ_LOCAL_FILES: 'browse:read-local-files', } as const; diff --git a/surfsense_desktop/src/ipc/handlers.ts b/surfsense_desktop/src/ipc/handlers.ts index 19051e871..246f0f6ac 100644 --- a/surfsense_desktop/src/ipc/handlers.ts +++ b/surfsense_desktop/src/ipc/handlers.ts @@ -9,6 +9,8 @@ import { pauseWatcher, resumeWatcher, markRendererReady, + browseFileOrFolder, + readLocalFiles, } from '../modules/folder-watcher'; export function registerIpcHandlers(): void { @@ -49,4 +51,10 @@ export function registerIpcHandlers(): void { ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_RENDERER_READY, () => { markRendererReady(); }); + + ipcMain.handle(IPC_CHANNELS.BROWSE_FILE_OR_FOLDER, () => browseFileOrFolder()); + + ipcMain.handle(IPC_CHANNELS.READ_LOCAL_FILES, (_event, paths: string[]) => + readLocalFiles(paths) + ); } diff --git a/surfsense_desktop/src/modules/folder-watcher.ts b/surfsense_desktop/src/modules/folder-watcher.ts index 81a835c22..1324858a0 100644 --- a/surfsense_desktop/src/modules/folder-watcher.ts +++ b/surfsense_desktop/src/modules/folder-watcher.ts @@ -391,3 +391,71 @@ export async function unregisterFolderWatcher(): Promise { } watchers.clear(); } + +export interface BrowseResult { + type: 'files' | 'folder'; + paths: string[]; +} + +export async 
function browseFileOrFolder(): Promise { + const result = await dialog.showOpenDialog({ + properties: ['openFile', 'openDirectory', 'multiSelections'], + title: 'Select files or a folder', + }); + if (result.canceled || result.filePaths.length === 0) return null; + + const stat = fs.statSync(result.filePaths[0]); + if (stat.isDirectory()) { + return { type: 'folder', paths: [result.filePaths[0]] }; + } + return { type: 'files', paths: result.filePaths }; +} + +const MIME_MAP: Record = { + '.pdf': 'application/pdf', + '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + '.html': 'text/html', '.htm': 'text/html', + '.csv': 'text/csv', + '.txt': 'text/plain', + '.md': 'text/markdown', '.markdown': 'text/markdown', + '.mp3': 'audio/mpeg', '.mpeg': 'audio/mpeg', '.mpga': 'audio/mpeg', + '.mp4': 'audio/mp4', '.m4a': 'audio/mp4', + '.wav': 'audio/wav', + '.webm': 'audio/webm', + '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', + '.png': 'image/png', + '.bmp': 'image/bmp', + '.webp': 'image/webp', + '.tiff': 'image/tiff', + '.doc': 'application/msword', + '.rtf': 'application/rtf', + '.xml': 'application/xml', + '.epub': 'application/epub+zip', + '.xls': 'application/vnd.ms-excel', + '.ppt': 'application/vnd.ms-powerpoint', + '.eml': 'message/rfc822', + '.odt': 'application/vnd.oasis.opendocument.text', + '.msg': 'application/vnd.ms-outlook', +}; + +export interface LocalFileData { + name: string; + data: ArrayBuffer; + mimeType: string; + size: number; +} + +export function readLocalFiles(filePaths: string[]): LocalFileData[] { + return filePaths.map((p) => { + const buf = fs.readFileSync(p); + const ext = path.extname(p).toLowerCase(); + return { + name: path.basename(p), + data: buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength), + mimeType: MIME_MAP[ext] || 
'application/octet-stream', + size: buf.byteLength, + }; + }); +} diff --git a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts index 7c190db10..08ca87f8f 100644 --- a/surfsense_desktop/src/preload.ts +++ b/surfsense_desktop/src/preload.ts @@ -45,4 +45,8 @@ contextBridge.exposeInMainWorld('electronAPI', { pauseWatcher: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_PAUSE), resumeWatcher: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_RESUME), signalRendererReady: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_RENDERER_READY), + + // Unified browse (files + folders) + browseFileOrFolder: () => ipcRenderer.invoke(IPC_CHANNELS.BROWSE_FILE_OR_FOLDER), + readLocalFiles: (paths: string[]) => ipcRenderer.invoke(IPC_CHANNELS.READ_LOCAL_FILES, paths), }); diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx index fcd3a39da..150c119de 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx @@ -1,6 +1,6 @@ "use client"; -import { Eye, FolderPlus, ListFilter, Search, Upload, X } from "lucide-react"; +import { FolderPlus, ListFilter, Search, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; import React, { useCallback, useMemo, useRef, useState } from "react"; import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; @@ -19,7 +19,6 @@ export function DocumentsFilters({ onToggleType, activeTypes, onCreateFolder, - onWatchFolder, }: { typeCounts: Partial>; onSearch: (v: string) => void; @@ -27,7 +26,6 @@ export function DocumentsFilters({ onToggleType: (type: DocumentTypeEnum, checked: boolean) => void; activeTypes: DocumentTypeEnum[]; onCreateFolder?: () => void; - onWatchFolder?: () 
=> void; }) { const t = useTranslations("documents"); const id = React.useId(); @@ -216,24 +214,7 @@ export function DocumentsFilters({ )} - {/* Watch Folder Button (desktop only) */} - {onWatchFolder && ( - - - - - Watch folder - - )} - - {/* Upload Button */} + {/* Upload Button */} + ) : ( + )} +
+ )} +
+ + + + {selectedFolder && ( + + +
+
+ +
+ + {selectedFolder.name} + + + {selectedFolder.path} +
- )} +
+
+
+ +
+ + +
+ + + +
+ )} {files.length > 0 && ( diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts index b399664d6..826a575c7 100644 --- a/surfsense_web/types/window.d.ts +++ b/surfsense_web/types/window.d.ts @@ -26,6 +26,18 @@ interface FolderSyncWatcherReadyEvent { folderPath: string; } +interface BrowseResult { + type: "files" | "folder"; + paths: string[]; +} + +interface LocalFileData { + name: string; + data: ArrayBuffer; + mimeType: string; + size: number; +} + interface ElectronAPI { versions: { electron: string; @@ -51,6 +63,9 @@ interface ElectronAPI { pauseWatcher: () => Promise; resumeWatcher: () => Promise; signalRendererReady: () => Promise; + // Unified browse + browseFileOrFolder: () => Promise; + readLocalFiles: (paths: string[]) => Promise; } declare global { From e0b35cfbabe43add555771e19166558376b30ff7 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 00:40:49 +0530 Subject: [PATCH 057/202] feat: implement pending file event handling using durable queue with acknowledgment support in folder synchronization --- surfsense_desktop/src/ipc/channels.ts | 2 + surfsense_desktop/src/ipc/handlers.ts | 10 ++ .../src/modules/folder-watcher.ts | 111 +++++++++++++++--- surfsense_desktop/src/preload.ts | 2 + surfsense_web/hooks/use-folder-sync.ts | 82 +++++++++---- surfsense_web/types/window.d.ts | 3 + 6 files changed, 175 insertions(+), 35 deletions(-) diff --git a/surfsense_desktop/src/ipc/channels.ts b/surfsense_desktop/src/ipc/channels.ts index 19c26607d..2761960f7 100644 --- a/surfsense_desktop/src/ipc/channels.ts +++ b/surfsense_desktop/src/ipc/channels.ts @@ -17,6 +17,8 @@ export const IPC_CHANNELS = { FOLDER_SYNC_PAUSE: 'folder-sync:pause', FOLDER_SYNC_RESUME: 'folder-sync:resume', FOLDER_SYNC_RENDERER_READY: 'folder-sync:renderer-ready', + FOLDER_SYNC_GET_PENDING_EVENTS: 'folder-sync:get-pending-events', + FOLDER_SYNC_ACK_EVENTS: 'folder-sync:ack-events', 
BROWSE_FILE_OR_FOLDER: 'browse:file-or-folder', READ_LOCAL_FILES: 'browse:read-local-files', } as const; diff --git a/surfsense_desktop/src/ipc/handlers.ts b/surfsense_desktop/src/ipc/handlers.ts index 246f0f6ac..7194aaaff 100644 --- a/surfsense_desktop/src/ipc/handlers.ts +++ b/surfsense_desktop/src/ipc/handlers.ts @@ -6,6 +6,8 @@ import { removeWatchedFolder, getWatchedFolders, getWatcherStatus, + getPendingFileEvents, + acknowledgeFileEvents, pauseWatcher, resumeWatcher, markRendererReady, @@ -52,6 +54,14 @@ export function registerIpcHandlers(): void { markRendererReady(); }); + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_GET_PENDING_EVENTS, () => + getPendingFileEvents() + ); + + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_ACK_EVENTS, (_event, eventIds: string[]) => + acknowledgeFileEvents(eventIds) + ); + ipcMain.handle(IPC_CHANNELS.BROWSE_FILE_OR_FOLDER, () => browseFileOrFolder()); ipcMain.handle(IPC_CHANNELS.READ_LOCAL_FILES, (_event, paths: string[]) => diff --git a/surfsense_desktop/src/modules/folder-watcher.ts b/surfsense_desktop/src/modules/folder-watcher.ts index 1324858a0..9cbdd9775 100644 --- a/surfsense_desktop/src/modules/folder-watcher.ts +++ b/surfsense_desktop/src/modules/folder-watcher.ts @@ -1,5 +1,6 @@ import { BrowserWindow, dialog } from 'electron'; import chokidar, { type FSWatcher } from 'chokidar'; +import { randomUUID } from 'crypto'; import * as path from 'path'; import * as fs from 'fs'; import { IPC_CHANNELS } from '../ipc/channels'; @@ -20,12 +21,27 @@ interface WatcherEntry { } type MtimeMap = Record; +type FolderSyncAction = 'add' | 'change' | 'unlink'; + +export interface FolderSyncFileChangedEvent { + id: string; + rootFolderId: number | null; + searchSpaceId: number; + folderPath: string; + folderName: string; + relativePath: string; + fullPath: string; + action: FolderSyncAction; + timestamp: number; +} const STORE_KEY = 'watchedFolders'; +const OUTBOX_STORE_KEY = 'events'; const MTIME_TOLERANCE_S = 1.0; let store: any = null; let 
mtimeStore: any = null; +let outboxStore: any = null; let watchers: Map = new Map(); /** @@ -35,22 +51,11 @@ let watchers: Map = new Map(); const mtimeMaps: Map = new Map(); let rendererReady = false; -const pendingEvents: any[] = []; +const outboxEvents: Map = new Map(); +let outboxLoaded = false; export function markRendererReady() { rendererReady = true; - for (const event of pendingEvents) { - sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, event); - } - pendingEvents.length = 0; -} - -function sendFileChangedEvent(data: any) { - if (rendererReady) { - sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, data); - } else { - pendingEvents.push(data); - } } async function getStore() { @@ -77,6 +82,57 @@ async function getMtimeStore() { return mtimeStore; } +async function getOutboxStore() { + if (!outboxStore) { + const { default: Store } = await import('electron-store'); + outboxStore = new Store({ + name: 'folder-sync-outbox', + defaults: { + [OUTBOX_STORE_KEY]: [] as FolderSyncFileChangedEvent[], + }, + }); + } + return outboxStore; +} + +function makeEventKey(event: Pick): string { + return `${event.folderPath}:${event.relativePath}`; +} + +function persistOutbox() { + getOutboxStore().then((s) => { + s.set(OUTBOX_STORE_KEY, Array.from(outboxEvents.values())); + }); +} + +async function loadOutbox() { + if (outboxLoaded) return; + const s = await getOutboxStore(); + const stored: FolderSyncFileChangedEvent[] = s.get(OUTBOX_STORE_KEY, []); + outboxEvents.clear(); + for (const event of stored) { + if (!event?.id || !event.folderPath || !event.relativePath) continue; + outboxEvents.set(makeEventKey(event), event); + } + outboxLoaded = true; +} + +function sendFileChangedEvent( + data: Omit +) { + const event: FolderSyncFileChangedEvent = { + id: randomUUID(), + ...data, + }; + + outboxEvents.set(makeEventKey(event), event); + persistOutbox(); + + if (rendererReady) { + sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, event); + } +} + function 
loadMtimeMap(folderPath: string): MtimeMap { return mtimeMaps.get(folderPath) ?? {}; } @@ -235,7 +291,7 @@ async function startWatcher(config: WatchedFolderConfig) { }); }); - const handleFileEvent = (filePath: string, action: string) => { + const handleFileEvent = (filePath: string, action: FolderSyncAction) => { if (!ready) return; const relativePath = path.relative(config.path, filePath); @@ -357,6 +413,32 @@ export async function getWatcherStatus(): Promise< })); } +export async function getPendingFileEvents(): Promise { + await loadOutbox(); + return Array.from(outboxEvents.values()).sort((a, b) => a.timestamp - b.timestamp); +} + +export async function acknowledgeFileEvents(eventIds: string[]): Promise<{ acknowledged: number }> { + if (!eventIds || eventIds.length === 0) return { acknowledged: 0 }; + await loadOutbox(); + + const ackSet = new Set(eventIds); + let acknowledged = 0; + + for (const [key, event] of outboxEvents.entries()) { + if (ackSet.has(event.id)) { + outboxEvents.delete(key); + acknowledged += 1; + } + } + + if (acknowledged > 0) { + persistOutbox(); + } + + return { acknowledged }; +} + export async function pauseWatcher(): Promise { for (const [, entry] of watchers) { if (entry.watcher) { @@ -375,6 +457,7 @@ export async function resumeWatcher(): Promise { } export async function registerFolderWatcher(): Promise { + await loadOutbox(); const s = await getStore(); const folders: WatchedFolderConfig[] = s.get(STORE_KEY, []); diff --git a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts index 08ca87f8f..6a2610dc8 100644 --- a/surfsense_desktop/src/preload.ts +++ b/surfsense_desktop/src/preload.ts @@ -45,6 +45,8 @@ contextBridge.exposeInMainWorld('electronAPI', { pauseWatcher: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_PAUSE), resumeWatcher: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_RESUME), signalRendererReady: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_RENDERER_READY), + getPendingFileEvents: () => 
ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_GET_PENDING_EVENTS), + acknowledgeFileEvents: (eventIds: string[]) => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_ACK_EVENTS, eventIds), // Unified browse (files + folders) browseFileOrFolder: () => ipcRenderer.invoke(IPC_CHANNELS.BROWSE_FILE_OR_FOLDER), diff --git a/surfsense_web/hooks/use-folder-sync.ts b/surfsense_web/hooks/use-folder-sync.ts index f051b7df6..59c061afb 100644 --- a/surfsense_web/hooks/use-folder-sync.ts +++ b/surfsense_web/hooks/use-folder-sync.ts @@ -4,6 +4,7 @@ import { useEffect, useRef } from "react"; import { documentsApiService } from "@/lib/apis/documents-api.service"; interface FileChangedEvent { + id: string; rootFolderId: number | null; searchSpaceId: number; folderPath: string; @@ -15,25 +16,35 @@ interface FileChangedEvent { } const DEBOUNCE_MS = 2000; +interface QueueItem { + event: FileChangedEvent; + ackIds: string[]; +} export function useFolderSync() { - const queueRef = useRef([]); + const queueRef = useRef([]); const processingRef = useRef(false); const debounceTimers = useRef>>(new Map()); + const pendingByKey = useRef>(new Map()); + const isMountedRef = useRef(false); async function processQueue() { if (processingRef.current) return; processingRef.current = true; while (queueRef.current.length > 0) { - const event = queueRef.current.shift()!; + const item = queueRef.current.shift()!; try { - await documentsApiService.folderIndexFile(event.searchSpaceId, { - folder_path: event.folderPath, - folder_name: event.folderName, - search_space_id: event.searchSpaceId, - target_file_path: event.fullPath, - root_folder_id: event.rootFolderId, + await documentsApiService.folderIndexFile(item.event.searchSpaceId, { + folder_path: item.event.folderPath, + folder_name: item.event.folderName, + search_space_id: item.event.searchSpaceId, + target_file_path: item.event.fullPath, + root_folder_id: item.event.rootFolderId, }); + const api = typeof window !== "undefined" ? 
window.electronAPI : null; + if (api?.acknowledgeFileEvents && item.ackIds.length > 0) { + await api.acknowledgeFileEvents(item.ackIds); + } } catch (err) { console.error("[FolderSync] Failed to trigger re-index:", err); } @@ -41,34 +52,63 @@ export function useFolderSync() { processingRef.current = false; } + function enqueueWithDebounce(event: FileChangedEvent) { + const key = `${event.folderPath}:${event.relativePath}`; + const existing = pendingByKey.current.get(key); + const ackSet = new Set(existing?.ackIds ?? []); + ackSet.add(event.id); + pendingByKey.current.set(key, { + event, + ackIds: Array.from(ackSet), + }); + + const existingTimeout = debounceTimers.current.get(key); + if (existingTimeout) clearTimeout(existingTimeout); + + const timeout = setTimeout(() => { + debounceTimers.current.delete(key); + const pending = pendingByKey.current.get(key); + if (!pending) return; + pendingByKey.current.delete(key); + queueRef.current.push(pending); + processQueue(); + }, DEBOUNCE_MS); + + debounceTimers.current.set(key, timeout); + } + useEffect(() => { + isMountedRef.current = true; const api = typeof window !== "undefined" ? 
window.electronAPI : null; - if (!api?.onFileChanged) return; + if (!api?.onFileChanged) { + return () => { + isMountedRef.current = false; + }; + } // Signal to main process that the renderer is ready to receive events api.signalRendererReady?.(); + // Drain durable outbox first so events survive renderer startup gaps and restarts + void api.getPendingFileEvents?.().then((pendingEvents) => { + if (!isMountedRef.current || !pendingEvents?.length) return; + for (const event of pendingEvents) { + enqueueWithDebounce(event); + } + }); + const cleanup = api.onFileChanged((event: FileChangedEvent) => { - const key = `${event.folderPath}:${event.fullPath}`; - - const existing = debounceTimers.current.get(key); - if (existing) clearTimeout(existing); - - const timeout = setTimeout(() => { - debounceTimers.current.delete(key); - queueRef.current.push(event); - processQueue(); - }, DEBOUNCE_MS); - - debounceTimers.current.set(key, timeout); + enqueueWithDebounce(event); }); return () => { + isMountedRef.current = false; cleanup(); for (const timeout of debounceTimers.current.values()) { clearTimeout(timeout); } debounceTimers.current.clear(); + pendingByKey.current.clear(); }; }, []); } diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts index 826a575c7..719373e02 100644 --- a/surfsense_web/types/window.d.ts +++ b/surfsense_web/types/window.d.ts @@ -11,6 +11,7 @@ interface WatchedFolderConfig { } interface FolderSyncFileChangedEvent { + id: string; rootFolderId: number | null; searchSpaceId: number; folderPath: string; @@ -63,6 +64,8 @@ interface ElectronAPI { pauseWatcher: () => Promise; resumeWatcher: () => Promise; signalRendererReady: () => Promise; + getPendingFileEvents: () => Promise; + acknowledgeFileEvents: (eventIds: string[]) => Promise<{ acknowledged: number }>; // Unified browse browseFileOrFolder: () => Promise; readLocalFiles: (paths: string[]) => Promise; From a99d999a3658c09ce133940cc0dd15a3353d6cd7 Mon Sep 17 00:00:00 2001 From: 
CREDO23 Date: Thu, 2 Apr 2026 21:29:05 +0200 Subject: [PATCH 058/202] fix: correct preload.js path after autocomplete module restructure --- surfsense_desktop/src/modules/autocomplete/suggestion-window.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts b/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts index e8a2f3a91..8f61b2901 100644 --- a/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts +++ b/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts @@ -80,7 +80,7 @@ export function createSuggestionWindow(x: number, y: number): BrowserWindow { hasShadow: true, type: 'panel', webPreferences: { - preload: path.join(__dirname, '..', 'preload.js'), + preload: path.join(__dirname, 'preload.js'), contextIsolation: true, nodeIntegration: false, sandbox: true, From 530db1053939cd69a6ce41107a6169babab1f707 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 02:56:24 +0530 Subject: [PATCH 059/202] refactor: remove unused Electron API check and update search space ID handling in document upload --- surfsense_desktop/src/ipc/channels.ts | 2 +- surfsense_desktop/src/ipc/handlers.ts | 4 +- .../src/modules/folder-watcher.ts | 18 +- surfsense_desktop/src/preload.ts | 4 +- .../layout/ui/sidebar/DocumentsSidebar.tsx | 2 - .../components/sources/DocumentUploadTab.tsx | 162 ++++++++++-------- surfsense_web/types/window.d.ts | 9 +- 7 files changed, 100 insertions(+), 101 deletions(-) diff --git a/surfsense_desktop/src/ipc/channels.ts b/surfsense_desktop/src/ipc/channels.ts index 2761960f7..2000964c7 100644 --- a/surfsense_desktop/src/ipc/channels.ts +++ b/surfsense_desktop/src/ipc/channels.ts @@ -19,6 +19,6 @@ export const IPC_CHANNELS = { FOLDER_SYNC_RENDERER_READY: 'folder-sync:renderer-ready', FOLDER_SYNC_GET_PENDING_EVENTS: 'folder-sync:get-pending-events', FOLDER_SYNC_ACK_EVENTS: 
'folder-sync:ack-events', - BROWSE_FILE_OR_FOLDER: 'browse:file-or-folder', + BROWSE_FILES: 'browse:files', READ_LOCAL_FILES: 'browse:read-local-files', } as const; diff --git a/surfsense_desktop/src/ipc/handlers.ts b/surfsense_desktop/src/ipc/handlers.ts index 7194aaaff..c4251b30b 100644 --- a/surfsense_desktop/src/ipc/handlers.ts +++ b/surfsense_desktop/src/ipc/handlers.ts @@ -11,7 +11,7 @@ import { pauseWatcher, resumeWatcher, markRendererReady, - browseFileOrFolder, + browseFiles, readLocalFiles, } from '../modules/folder-watcher'; @@ -62,7 +62,7 @@ export function registerIpcHandlers(): void { acknowledgeFileEvents(eventIds) ); - ipcMain.handle(IPC_CHANNELS.BROWSE_FILE_OR_FOLDER, () => browseFileOrFolder()); + ipcMain.handle(IPC_CHANNELS.BROWSE_FILES, () => browseFiles()); ipcMain.handle(IPC_CHANNELS.READ_LOCAL_FILES, (_event, paths: string[]) => readLocalFiles(paths) diff --git a/surfsense_desktop/src/modules/folder-watcher.ts b/surfsense_desktop/src/modules/folder-watcher.ts index 9cbdd9775..969dabe97 100644 --- a/surfsense_desktop/src/modules/folder-watcher.ts +++ b/surfsense_desktop/src/modules/folder-watcher.ts @@ -475,23 +475,13 @@ export async function unregisterFolderWatcher(): Promise { watchers.clear(); } -export interface BrowseResult { - type: 'files' | 'folder'; - paths: string[]; -} - -export async function browseFileOrFolder(): Promise { +export async function browseFiles(): Promise { const result = await dialog.showOpenDialog({ - properties: ['openFile', 'openDirectory', 'multiSelections'], - title: 'Select files or a folder', + properties: ['openFile', 'multiSelections'], + title: 'Select files', }); if (result.canceled || result.filePaths.length === 0) return null; - - const stat = fs.statSync(result.filePaths[0]); - if (stat.isDirectory()) { - return { type: 'folder', paths: [result.filePaths[0]] }; - } - return { type: 'files', paths: result.filePaths }; + return result.filePaths; } const MIME_MAP: Record = { diff --git 
a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts index 6a2610dc8..6fbfd354a 100644 --- a/surfsense_desktop/src/preload.ts +++ b/surfsense_desktop/src/preload.ts @@ -48,7 +48,7 @@ contextBridge.exposeInMainWorld('electronAPI', { getPendingFileEvents: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_GET_PENDING_EVENTS), acknowledgeFileEvents: (eventIds: string[]) => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_ACK_EVENTS, eventIds), - // Unified browse (files + folders) - browseFileOrFolder: () => ipcRenderer.invoke(IPC_CHANNELS.BROWSE_FILE_OR_FOLDER), + // Browse files via native dialog + browseFiles: () => ipcRenderer.invoke(IPC_CHANNELS.BROWSE_FILES), readLocalFiles: (paths: string[]) => ipcRenderer.invoke(IPC_CHANNELS.READ_LOCAL_FILES, paths), }); diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index ed3a78786..f8b774d26 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -277,8 +277,6 @@ export function DocumentsSidebar({ [createFolderParentId, searchSpaceId, setExpandedFolderMap] ); - const isElectron = typeof window !== "undefined" && !!window.electronAPI; - const handleRescanFolder = useCallback( async (folder: FolderDisplay) => { const api = window.electronAPI; diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 3fdf576b5..d5ac2770a 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -1,7 +1,7 @@ "use client"; import { useAtom } from "jotai"; -import { CheckCircle2, FileType, FolderOpen, Info, Upload, X } from "lucide-react"; +import { CheckCircle2, ChevronDown, File as FileIcon, FileType, FolderOpen, Info, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; import { 
useCallback, useMemo, useRef, useState } from "react"; @@ -19,6 +19,12 @@ import { Alert, AlertDescription } from "@/components/ui/alert"; import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuTrigger, +} from "@/components/ui/dropdown-menu"; import { Label } from "@/components/ui/label"; import { Progress } from "@/components/ui/progress"; import { Separator } from "@/components/ui/separator"; @@ -146,7 +152,7 @@ export function DocumentUploadTab({ const [selectedFolder, setSelectedFolder] = useState(null); const [watchFolder, setWatchFolder] = useState(true); const [folderSubmitting, setFolderSubmitting] = useState(false); - const isElectron = typeof window !== "undefined" && !!window.electronAPI?.browseFileOrFolder; + const isElectron = typeof window !== "undefined" && !!window.electronAPI?.browseFiles; const acceptedFileTypes = useMemo(() => { const etlService = process.env.NEXT_PUBLIC_ETL_SERVICE; @@ -193,7 +199,7 @@ export function DocumentUploadTab({ onDrop, accept: acceptedFileTypes, maxSize: 50 * 1024 * 1024, // 50MB per file - noClick: !isElectron, + noClick: isElectron, disabled: files.length >= MAX_FILES, }); @@ -201,52 +207,51 @@ export function DocumentUploadTab({ e.stopPropagation(); }, []); - const handleBrowse = useCallback(async (e: React.MouseEvent) => { - e.stopPropagation(); - e.preventDefault(); - + const handleBrowseFiles = useCallback(async () => { const api = window.electronAPI; - if (!api?.browseFileOrFolder) { - fileInputRef.current?.click(); - return; - } + if (!api?.browseFiles) return; - const result = await api.browseFileOrFolder(); - if (!result) return; + const paths = await api.browseFiles(); + if (!paths || paths.length === 0) return; - if (result.type === "folder") { - const folderPath = result.paths[0]; - 
const folderName = folderPath.split("/").pop() || folderPath.split("\\").pop() || folderPath; - setFiles([]); - setSelectedFolder({ path: folderPath, name: folderName }); - setWatchFolder(true); - } else { - setSelectedFolder(null); - const fileDataList = await api.readLocalFiles(result.paths); - const newFiles: FileWithId[] = fileDataList.map((fd) => ({ - id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`, - file: new File([fd.data], fd.name, { type: fd.mimeType }), - })); - setFiles((prev) => { - const merged = [...prev, ...newFiles]; - if (merged.length > MAX_FILES) { - toast.error(t("max_files_exceeded"), { - description: t("max_files_exceeded_desc", { max: MAX_FILES }), - }); - return prev; - } - const totalSize = merged.reduce((sum, e) => sum + e.file.size, 0); - if (totalSize > MAX_TOTAL_SIZE_BYTES) { - toast.error(t("max_size_exceeded"), { - description: t("max_size_exceeded_desc", { max: MAX_TOTAL_SIZE_MB }), - }); - return prev; - } - return merged; - }); - } + setSelectedFolder(null); + const fileDataList = await api.readLocalFiles(paths); + const newFiles: FileWithId[] = fileDataList.map((fd) => ({ + id: crypto.randomUUID?.() ?? 
`file-${Date.now()}-${Math.random().toString(36)}`, + file: new File([fd.data], fd.name, { type: fd.mimeType }), + })); + setFiles((prev) => { + const merged = [...prev, ...newFiles]; + if (merged.length > MAX_FILES) { + toast.error(t("max_files_exceeded"), { + description: t("max_files_exceeded_desc", { max: MAX_FILES }), + }); + return prev; + } + const totalSize = merged.reduce((sum, e) => sum + e.file.size, 0); + if (totalSize > MAX_TOTAL_SIZE_BYTES) { + toast.error(t("max_size_exceeded"), { + description: t("max_size_exceeded_desc", { max: MAX_TOTAL_SIZE_MB }), + }); + return prev; + } + return merged; + }); }, [t]); + const handleBrowseFolder = useCallback(async () => { + const api = window.electronAPI; + if (!api?.selectFolder) return; + + const folderPath = await api.selectFolder(); + if (!folderPath) return; + + const folderName = folderPath.split("/").pop() || folderPath.split("\\").pop() || folderPath; + setFiles([]); + setSelectedFolder({ path: folderPath, name: folderName }); + setWatchFolder(true); + }, []); + const formatFileSize = (bytes: number) => { if (bytes === 0) return "0 Bytes"; const k = 1024; @@ -280,10 +285,11 @@ export function DocumentUploadTab({ setFolderSubmitting(true); try { - const result = await documentsApiService.folderIndex(Number(searchSpaceId), { + const numericSpaceId = Number(searchSpaceId); + const result = await documentsApiService.folderIndex(numericSpaceId, { folder_path: selectedFolder.path, folder_name: selectedFolder.name, - search_space_id: searchSpaceId, + search_space_id: numericSpaceId, enable_summary: shouldSummarize, }); @@ -409,33 +415,43 @@ export function DocumentUploadTab({ )} )} - {!isFileCountLimitReached && ( -
- {isElectron ? ( - - ) : ( - - )} -
- )} + {!isFileCountLimitReached && ( +
+ {isElectron ? ( + + e.stopPropagation()}> + + + e.stopPropagation()}> + + + Files + + + + Folder + + + + ) : ( + + )} +
+ )}
diff --git a/surfsense_web/types/window.d.ts b/surfsense_web/types/window.d.ts index 719373e02..0842ed655 100644 --- a/surfsense_web/types/window.d.ts +++ b/surfsense_web/types/window.d.ts @@ -27,11 +27,6 @@ interface FolderSyncWatcherReadyEvent { folderPath: string; } -interface BrowseResult { - type: "files" | "folder"; - paths: string[]; -} - interface LocalFileData { name: string; data: ArrayBuffer; @@ -66,8 +61,8 @@ interface ElectronAPI { signalRendererReady: () => Promise; getPendingFileEvents: () => Promise; acknowledgeFileEvents: (eventIds: string[]) => Promise<{ acknowledged: number }>; - // Unified browse - browseFileOrFolder: () => Promise; + // Browse files/folders via native dialogs + browseFiles: () => Promise; readLocalFiles: (paths: string[]) => Promise; } From bd21c2842dec8dfeae80a00a040c4b1513c1cf3d Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 04:14:09 +0530 Subject: [PATCH 060/202] feat: enhance document upload and folder synchronization UI with improved processing state indicators and responsive design adjustments --- .../app/routes/documents_routes.py | 5 +- .../assistant-ui/document-upload-popup.tsx | 29 +- .../components/documents/DocumentNode.tsx | 14 +- .../components/documents/FolderNode.tsx | 48 +- .../components/documents/FolderTreeView.tsx | 30 + .../components/sources/DocumentUploadTab.tsx | 514 +++++++++--------- .../contracts/enums/connectorIcons.tsx | 2 + 7 files changed, 359 insertions(+), 283 deletions(-) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 0acc1d30b..edb01d4cc 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -29,6 +29,7 @@ from app.schemas import ( DocumentTitleSearchResponse, DocumentUpdate, DocumentWithChunksRead, + FolderRead, PaginatedResponse, ) from app.services.task_dispatcher import TaskDispatcher, 
get_task_dispatcher @@ -953,15 +954,13 @@ async def get_document_by_chunk_id( ) from e -@router.get("/documents/watched-folders", response_model=list["FolderRead"]) +@router.get("/documents/watched-folders", response_model=list[FolderRead]) async def get_watched_folders( search_space_id: int, session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): """Return root folders that are marked as watched (metadata->>'watched' = 'true').""" - from app.schemas import FolderRead # noqa: F811 - await check_permission( session, user, diff --git a/surfsense_web/components/assistant-ui/document-upload-popup.tsx b/surfsense_web/components/assistant-ui/document-upload-popup.tsx index 06b0d38e7..78600be47 100644 --- a/surfsense_web/components/assistant-ui/document-upload-popup.tsx +++ b/surfsense_web/components/assistant-ui/document-upload-popup.tsx @@ -125,29 +125,23 @@ const DocumentUploadPopupContent: FC<{ onPointerDownOutside={(e) => e.preventDefault()} onInteractOutside={(e) => e.preventDefault()} onEscapeKeyDown={(e) => e.preventDefault()} - className="select-none max-w-4xl w-[95vw] sm:w-full h-[calc(100dvh-2rem)] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-12 [&>button]:top-3 sm:[&>button]:top-10 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button]:z-[100] [&>button_svg]:size-4 sm:[&>button_svg]:size-5" + className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(460px,75dvh)] sm:h-[min(520px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-3 sm:[&>button]:top-5 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button]:z-[100] [&>button_svg]:size-4 sm:[&>button_svg]:size-5" > Upload Document - {/* Scrollable container for mobile */}
- {/* Header - scrolls with content on mobile */} -
- {/* Upload header */} -
-
-

- Upload Documents -

-

- Upload and sync your documents to your search space -

-
+
+
+

+ Upload Documents +

+

+ Upload and sync your documents to your search space +

- {/* Content */} -
+
{!isLoading && !hasDocumentSummaryLLM ? ( @@ -179,9 +173,6 @@ const DocumentUploadPopupContent: FC<{ )}
- - {/* Bottom fade shadow - hidden on very small screens */} -
); diff --git a/surfsense_web/components/documents/DocumentNode.tsx b/surfsense_web/components/documents/DocumentNode.tsx index 691a6eb0d..7a3b3e0ca 100644 --- a/surfsense_web/components/documents/DocumentNode.tsx +++ b/surfsense_web/components/documents/DocumentNode.tsx @@ -195,12 +195,14 @@ export const DocumentNode = React.memo(function DocumentNode({ {doc.title} - - {getDocumentTypeIcon( - doc.document_type as DocumentTypeEnum, - "h-3.5 w-3.5 text-muted-foreground" - )} - + {getDocumentTypeIcon(doc.document_type as DocumentTypeEnum, "h-3.5 w-3.5 text-muted-foreground") && ( + + {getDocumentTypeIcon( + doc.document_type as DocumentTypeEnum, + "h-3.5 w-3.5 text-muted-foreground" + )} + + )} diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx index 6780bd1e5..41c1d8f73 100644 --- a/surfsense_web/components/documents/FolderNode.tsx +++ b/surfsense_web/components/documents/FolderNode.tsx @@ -1,6 +1,7 @@ "use client"; import { + AlertCircle, ChevronDown, ChevronRight, Eye, @@ -30,6 +31,8 @@ import { DropdownMenuItem, DropdownMenuTrigger, } from "@/components/ui/dropdown-menu"; +import { Spinner } from "@/components/ui/spinner"; +import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import { cn } from "@/lib/utils"; import type { FolderSelectionState } from "./FolderTreeView"; @@ -55,6 +58,7 @@ interface FolderNodeProps { isRenaming: boolean; childCount: number; selectionState: FolderSelectionState; + processingState: "idle" | "processing" | "failed"; onToggleSelect: (folderId: number, selectAll: boolean) => void; onToggleExpand: (folderId: number) => void; onRename: (folder: FolderDisplay, newName: string) => void; @@ -100,6 +104,7 @@ export const FolderNode = React.memo(function FolderNode({ isRenaming, childCount, selectionState, + processingState, onToggleSelect, onToggleExpand, onRename, @@ -281,14 +286,41 @@ export const FolderNode = React.memo(function FolderNode({ )} - 
e.stopPropagation()} - className="h-3.5 w-3.5 shrink-0" - /> + {processingState !== "idle" && selectionState === "none" ? ( + <> + + + + {processingState === "processing" ? ( + + ) : ( + + )} + + + + {processingState === "processing" + ? "Syncing folder contents" + : "Some files failed to process"} + + + e.stopPropagation()} + className="h-3.5 w-3.5 shrink-0 hidden group-hover:flex" + /> + + ) : ( + e.stopPropagation()} + className="h-3.5 w-3.5 shrink-0" + /> + )} diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx index f34b9a0c2..01af73edc 100644 --- a/surfsense_web/components/documents/FolderTreeView.tsx +++ b/surfsense_web/components/documents/FolderTreeView.tsx @@ -166,6 +166,35 @@ export function FolderTreeView({ return states; }, [folders, docsByFolder, foldersByParent, mentionedDocIds]); + const folderProcessingStates = useMemo(() => { + const states: Record = {}; + + function compute(folderId: number): { hasProcessing: boolean; hasFailed: boolean } { + const directDocs = docsByFolder[folderId] ?? []; + let hasProcessing = directDocs.some( + (d) => d.status?.state === "pending" || d.status?.state === "processing" + ); + let hasFailed = directDocs.some((d) => d.status?.state === "failed"); + + for (const child of foldersByParent[folderId] ?? []) { + const sub = compute(child.id); + hasProcessing = hasProcessing || sub.hasProcessing; + hasFailed = hasFailed || sub.hasFailed; + } + + if (hasProcessing) states[folderId] = "processing"; + else if (hasFailed) states[folderId] = "failed"; + else states[folderId] = "idle"; + + return { hasProcessing, hasFailed }; + } + + for (const f of folders) { + if (states[f.id] === undefined) compute(f.id); + } + return states; + }, [folders, docsByFolder, foldersByParent]); + function renderLevel(parentId: number | null, depth: number): React.ReactNode[] { const key = parentId ?? "root"; const childFolders = (foldersByParent[key] ?? 
[]) @@ -199,6 +228,7 @@ export function FolderTreeView({ isRenaming={renamingFolderId === f.id} childCount={folderChildCounts[f.id] ?? 0} selectionState={folderSelectionStates[f.id] ?? "none"} + processingState={folderProcessingStates[f.id] ?? "idle"} onToggleSelect={onToggleFolderSelect} onToggleExpand={onToggleExpand} onRename={onRenameFolder} diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index d5ac2770a..7176afae5 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -1,24 +1,21 @@ "use client"; import { useAtom } from "jotai"; -import { CheckCircle2, ChevronDown, File as FileIcon, FileType, FolderOpen, Info, Upload, X } from "lucide-react"; +import { CheckCircle2, ChevronDown, File as FileIcon, FileType, FolderOpen, Plus, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; import { useCallback, useMemo, useRef, useState } from "react"; import { useDropzone } from "react-dropzone"; import { toast } from "sonner"; import { uploadDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms"; -import { SummaryConfig } from "@/components/assistant-ui/connector-popup/components/summary-config"; import { Accordion, AccordionContent, AccordionItem, AccordionTrigger, } from "@/components/ui/accordion"; -import { Alert, AlertDescription } from "@/components/ui/alert"; import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; -import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; import { DropdownMenu, DropdownMenuContent, @@ -27,7 +24,6 @@ import { } from "@/components/ui/dropdown-menu"; import { Label } from "@/components/ui/label"; import { Progress } from "@/components/ui/progress"; -import { Separator } from "@/components/ui/separator"; import { Spinner } from "@/components/ui/spinner"; import { 
Switch } from "@/components/ui/switch"; import { documentsApiService } from "@/lib/apis/documents-api.service"; @@ -36,7 +32,6 @@ import { trackDocumentUploadStarted, trackDocumentUploadSuccess, } from "@/lib/posthog/events"; -import { GridPattern } from "./GridPattern"; interface SelectedFolder { path: string; @@ -128,13 +123,12 @@ interface FileWithId { file: File; } -const cardClass = "border border-border bg-slate-400/5 dark:bg-white/5"; - -// Upload limits — files are sent in batches of 5 to avoid proxy timeouts const MAX_FILES = 50; const MAX_TOTAL_SIZE_MB = 200; const MAX_TOTAL_SIZE_BYTES = MAX_TOTAL_SIZE_MB * 1024 * 1024; +const toggleRowClass = "flex items-center justify-between rounded-lg bg-slate-400/5 dark:bg-white/5 p-3"; + export function DocumentUploadTab({ searchSpaceId, onSuccess, @@ -198,7 +192,7 @@ export function DocumentUploadTab({ const { getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop, accept: acceptedFileTypes, - maxSize: 50 * 1024 * 1024, // 50MB per file + maxSize: 50 * 1024 * 1024, noClick: isElectron, disabled: files.length >= MAX_FILES, }); @@ -270,6 +264,8 @@ export function DocumentUploadTab({ (MAX_TOTAL_SIZE_BYTES - totalFileSize) / (1024 * 1024) ).toFixed(1); + const hasContent = files.length > 0 || selectedFolder !== null; + const handleAccordionChange = useCallback( (value: string) => { setAccordionValue(value); @@ -307,7 +303,7 @@ export function DocumentUploadTab({ }); toast.success(`Watching folder: ${selectedFolder.name}`); } else { - toast.success(`Indexing folder: ${selectedFolder.name}`); + toast.success(`Syncing folder: ${selectedFolder.name}`); } setSelectedFolder(null); @@ -355,139 +351,180 @@ export function DocumentUploadTab({ ); }; - return ( -
- - - - {t("file_size_limit")}{" "} - {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} - - + const renderBrowseButton = (options?: { compact?: boolean; fullWidth?: boolean }) => { + const { compact, fullWidth } = options ?? {}; + if (isFileCountLimitReached) return null; - -
- -
- -
- - {isFileCountLimitReached ? ( -
- -
-

- {t("file_limit_reached")} -

-

- {t("file_limit_reached_desc", { max: MAX_FILES })} -

+ const sizeClass = compact ? "h-7" : "h-8"; + const widthClass = fullWidth ? "w-full" : ""; + + if (isElectron) { + return ( + + e.stopPropagation()}> + + + e.stopPropagation()}> + + + Files + + + + Folder + + + + ); + } + + return ( + + ); + }; + + return ( +
+ {/* Hidden file input for mobile browse */} + + + {/* MOBILE DROP ZONE */} +
+ {hasContent ? ( + !selectedFolder && !isFileCountLimitReached && ( + isElectron ? ( +
+ {renderBrowseButton({ compact: true, fullWidth: true })}
-
- ) : isDragActive ? ( -
- -

{t("drop_files")}

-
- ) : ( -
- -
-

{t("drag_drop")}

-

{t("or_browse")}

-
- {files.length > 0 && ( -

- {t("remaining_capacity", { files: remainingFiles, sizeMB: remainingSizeMB })} -

- )} -
- )} - {!isFileCountLimitReached && ( -
- {isElectron ? ( - - e.stopPropagation()}> - - - e.stopPropagation()}> - - - Files - - - - Folder - - - ) : ( + ) + ) + ) : ( +
{ + if (!isElectron) fileInputRef.current?.click(); + }} + > + +
+

+ {isElectron ? "Select files or folder" : "Tap to select files"} +

+

+ {t("file_size_limit")}{" "} + {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} +

+
+ {isElectron && ( +
e.stopPropagation()}> + {renderBrowseButton({ fullWidth: true })} +
)}
)} -
- - +
- {selectedFolder && ( - - -
-
- -
- - {selectedFolder.name} - - - {selectedFolder.path} - -
+ {/* DESKTOP DROP ZONE */} +
+ {hasContent ? ( +
+ + + {isDragActive + ? t("drop_files") + : isFileCountLimitReached + ? t("file_limit_reached") + : t("remaining_capacity", { files: remainingFiles, sizeMB: remainingSizeMB })} + + {renderBrowseButton({ compact: true })} +
+ ) : isFileCountLimitReached ? ( +
+ +

{t("file_limit_reached")}

+

+ {t("file_limit_reached_desc", { max: MAX_FILES })} +

+
+ ) : isDragActive ? ( +
+ +

{t("drop_files")}

+
+ ) : ( +
+ +

{t("drag_drop")}

+

+ {t("file_size_limit")}{" "} + {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} +

+
{renderBrowseButton()}
+
+ )} +
+ + {/* FOLDER SELECTED */} + {selectedFolder && ( +
+
+ +
+

{selectedFolder.name}

+

{selectedFolder.path}

- - -
-
)} + {/* FILES SELECTED */} + {files.length > 0 && ( +
+
+

+ {t("selected_files", { count: files.length })} · {formatFileSize(totalFileSize)} +

+ +
+ +
+ {files.map((entry) => ( +
+ + {entry.file.name} + + {formatFileSize(entry.file.size)} + + +
+ ))} +
+ + {isUploading && ( +
+
+ {t("uploading_files")} + {Math.round(uploadProgress)}% +
+ +
+ )} + +
+
+

Enable AI Summary

+

+ Improves search quality but adds latency +

+
+ +
+ + +
+ )} + + {/* SUPPORTED FORMATS */} - - -
-
-
- {t("supported_file_types")} -
-
- {t("file_types_desc")} -
-
-
+ + + + {t("supported_file_types")} + - -
+ +
{supportedExtensions.map((ext) => ( - + {ext} ))} diff --git a/surfsense_web/contracts/enums/connectorIcons.tsx b/surfsense_web/contracts/enums/connectorIcons.tsx index 2e609b060..ab71d58b5 100644 --- a/surfsense_web/contracts/enums/connectorIcons.tsx +++ b/surfsense_web/contracts/enums/connectorIcons.tsx @@ -126,6 +126,8 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas return ; case "DEEPEST": return ; + case "LOCAL_FOLDER_FILE": + return null; default: return ; } From 44e39792da6fc7a35edbe8ae1cb68807e4ac3b3f Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 04:14:28 +0530 Subject: [PATCH 061/202] feat: assign folder_id to documents before indexing to ensure correct folder visibility during processing --- .../local_folder_indexer.py | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index a3281eaea..041df71fc 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -719,6 +719,21 @@ async def index_local_folder( } documents = await pipeline.prepare_for_indexing(connector_docs) + # Assign folder_id immediately so docs appear in the correct + # folder while still pending/processing (visible via Zero sync). 
+ for document in documents: + cd = doc_map.get(document.unique_identifier_hash) + if cd is None: + continue + rel_path = (cd.metadata or {}).get("file_path", "") + parent_dir = str(Path(rel_path).parent) if rel_path else "" + if parent_dir == ".": + parent_dir = "" + document.folder_id = folder_mapping.get( + parent_dir, folder_mapping.get("") + ) + await session.commit() + llm = await get_user_long_context_llm(session, user_id, search_space_id) for document in documents: @@ -732,17 +747,9 @@ async def index_local_folder( if DocumentStatus.is_state(result.status, DocumentStatus.READY): indexed_count += 1 - # Assign folder_id and mtime post-pipeline - rel_path = (connector_doc.metadata or {}).get("file_path", "") - parent_dir = str(Path(rel_path).parent) if rel_path else "" - if parent_dir == ".": - parent_dir = "" - fid = folder_mapping.get(parent_dir, folder_mapping.get("")) - unique_id = connector_doc.unique_id mtime_info = file_meta_map.get(unique_id, {}) - result.folder_id = fid doc_meta = dict(result.document_metadata or {}) doc_meta["mtime"] = mtime_info.get("mtime") result.document_metadata = doc_meta @@ -894,16 +901,18 @@ async def _index_single_file( return 0, 1, None db_doc = documents[0] - await pipeline.index(db_doc, connector_doc, llm) - # Post-pipeline: assign folder_id and mtime - await session.refresh(db_doc) - folder_id = None + # Assign folder_id before indexing so the doc appears in the + # correct folder while still pending/processing. 
if root_folder_id: - folder_id = await _resolve_folder_for_file( + db_doc.folder_id = await _resolve_folder_for_file( session, rel_path, root_folder_id, search_space_id, user_id ) - db_doc.folder_id = folder_id + await session.commit() + + await pipeline.index(db_doc, connector_doc, llm) + + await session.refresh(db_doc) doc_meta = dict(db_doc.document_metadata or {}) doc_meta["mtime"] = mtime db_doc.document_metadata = doc_meta From fe7fcaae5dada13d12bf5883b5722c54dcd5f425 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 04:16:19 +0530 Subject: [PATCH 062/202] feat: update folder deletion process to queue document deletions first and handle folder cleanup in Celery task --- .../app/routes/folders_routes.py | 31 +++++++++---------- .../app/tasks/celery_tasks/document_tasks.py | 27 ++++++++++++---- .../layout/ui/sidebar/DocumentsSidebar.tsx | 7 ++++- 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/surfsense_backend/app/routes/folders_routes.py b/surfsense_backend/app/routes/folders_routes.py index 6e524d4a4..2dc9bceac 100644 --- a/surfsense_backend/app/routes/folders_routes.py +++ b/surfsense_backend/app/routes/folders_routes.py @@ -367,7 +367,7 @@ async def delete_folder( session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): - """Delete a folder and cascade-delete subfolders. 
Documents are async-deleted via Celery.""" + """Mark documents for deletion and dispatch Celery to delete docs first, then folders.""" try: folder = await session.get(Folder, folder_id) if not folder: @@ -399,30 +399,29 @@ async def delete_folder( ) await session.commit() - await session.execute(Folder.__table__.delete().where(Folder.id == folder_id)) - await session.commit() + try: + from app.tasks.celery_tasks.document_tasks import ( + delete_folder_documents_task, + ) - if document_ids: - try: - from app.tasks.celery_tasks.document_tasks import ( - delete_folder_documents_task, - ) - - delete_folder_documents_task.delay(document_ids) - except Exception as err: + delete_folder_documents_task.delay( + document_ids, folder_subtree_ids=list(subtree_ids) + ) + except Exception as err: + if document_ids: await session.execute( Document.__table__.update() .where(Document.id.in_(document_ids)) .values(status={"state": "ready"}) ) await session.commit() - raise HTTPException( - status_code=503, - detail="Folder deleted but document cleanup could not be queued. Documents have been restored.", - ) from err + raise HTTPException( + status_code=503, + detail="Could not queue folder deletion. 
Documents have been restored.", + ) from err return { - "message": "Folder deleted successfully", + "message": "Folder deletion started", "documents_queued_for_deletion": len(document_ids), } diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index 110f3deee..4701d9911 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -142,21 +142,30 @@ async def _delete_document_background(document_id: int) -> None: retry_backoff_max=300, max_retries=5, ) -def delete_folder_documents_task(self, document_ids: list[int]): - """Celery task to batch-delete documents orphaned by folder deletion.""" +def delete_folder_documents_task( + self, + document_ids: list[int], + folder_subtree_ids: list[int] | None = None, +): + """Celery task to delete documents first, then the folder rows.""" loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: - loop.run_until_complete(_delete_folder_documents(document_ids)) + loop.run_until_complete( + _delete_folder_documents(document_ids, folder_subtree_ids) + ) finally: loop.close() -async def _delete_folder_documents(document_ids: list[int]) -> None: - """Delete chunks in batches, then document rows for each orphaned document.""" +async def _delete_folder_documents( + document_ids: list[int], + folder_subtree_ids: list[int] | None = None, +) -> None: + """Delete chunks in batches, then document rows, then folder rows.""" from sqlalchemy import delete as sa_delete, select - from app.db import Chunk, Document + from app.db import Chunk, Document, Folder async with get_celery_session_maker()() as session: batch_size = 500 @@ -178,6 +187,12 @@ async def _delete_folder_documents(document_ids: list[int]) -> None: await session.delete(doc) await session.commit() + if folder_subtree_ids: + await session.execute( + sa_delete(Folder).where(Folder.id.in_(folder_subtree_ids)) + ) + 
await session.commit() + @celery_app.task( name="delete_search_space_background", diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index f8b774d26..8dce68eeb 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -188,7 +188,12 @@ export function DocumentsSidebar({ const treeDocuments: DocumentNodeDoc[] = useMemo(() => { const zeroDocs = (zeroAllDocs ?? []) - .filter((d) => d.title && d.title.trim() !== "") + .filter((d) => { + if (!d.title || d.title.trim() === "") return false; + const state = (d.status as { state?: string } | undefined)?.state; + if (state === "deleting") return false; + return true; + }) .map((d) => ({ id: d.id, title: d.title, From 62e698d8aae9e6f556203ab8b1e5949b1749a994 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Thu, 2 Apr 2026 19:39:10 -0700 Subject: [PATCH 063/202] refactor: streamline document upload limits and enhance handling of mentioned documents - Updated maximum file size limit to 500 MB per file. - Removed restrictions on the number of files per upload and total upload size. - Enhanced handling of user-mentioning documents in the knowledge base search middleware. - Improved document reading and processing logic to accommodate new features and optimizations. 
--- .../versions/116_create_zero_publication.py | 4 +- ..._optimize_zero_publication_column_lists.py | 102 + .../app/agents/new_chat/chat_deepagent.py | 2 + .../agents/new_chat/middleware/filesystem.py | 10 + .../new_chat/middleware/knowledge_search.py | 178 +- .../app/routes/documents_routes.py | 169 +- surfsense_backend/app/routes/editor_routes.py | 161 +- surfsense_backend/app/schemas/documents.py | 11 +- .../app/tasks/chat/stream_new_chat.py | 108 +- .../app/tasks/document_processors/__init__.py | 10 +- .../tasks/document_processors/_constants.py | 74 + .../document_processors/_direct_converters.py | 90 + .../app/tasks/document_processors/_etl.py | 209 ++ .../app/tasks/document_processors/_helpers.py | 218 ++ .../app/tasks/document_processors/_save.py | 285 ++ .../document_processors/file_processors.py | 2753 ++++++----------- .../document_processors/markdown_processor.py | 81 +- .../document_upload/test_upload_limits.py | 88 +- .../unit/middleware/test_knowledge_search.py | 6 +- .../components/DocumentsTableShell.tsx | 44 +- .../documents/(manage)/components/types.ts | 2 +- .../components/editor-panel/editor-panel.tsx | 72 +- .../layout/ui/tabs/DocumentTabContent.tsx | 78 +- surfsense_web/components/markdown-viewer.tsx | 12 +- .../new-chat/source-detail-panel.tsx | 206 +- .../components/sources/DocumentUploadTab.tsx | 180 +- .../contracts/types/document.types.ts | 32 + .../lib/apis/documents-api.service.ts | 42 +- surfsense_web/messages/en.json | 21 +- surfsense_web/messages/es.json | 21 +- surfsense_web/messages/hi.json | 21 +- surfsense_web/messages/pt.json | 21 +- surfsense_web/messages/zh.json | 21 +- 33 files changed, 2889 insertions(+), 2443 deletions(-) create mode 100644 surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py create mode 100644 surfsense_backend/app/tasks/document_processors/_constants.py create mode 100644 surfsense_backend/app/tasks/document_processors/_direct_converters.py create mode 100644 
surfsense_backend/app/tasks/document_processors/_etl.py create mode 100644 surfsense_backend/app/tasks/document_processors/_helpers.py create mode 100644 surfsense_backend/app/tasks/document_processors/_save.py diff --git a/surfsense_backend/alembic/versions/116_create_zero_publication.py b/surfsense_backend/alembic/versions/116_create_zero_publication.py index 8f0d7b5d3..ff74952a9 100644 --- a/surfsense_backend/alembic/versions/116_create_zero_publication.py +++ b/surfsense_backend/alembic/versions/116_create_zero_publication.py @@ -42,9 +42,7 @@ def upgrade() -> None: if not exists: table_list = ", ".join(TABLES) conn.execute( - sa.text( - f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}" - ) + sa.text(f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}") ) diff --git a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py new file mode 100644 index 000000000..3c2d34c76 --- /dev/null +++ b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py @@ -0,0 +1,102 @@ +"""optimize zero_publication with column lists + +Recreates the zero_publication using column lists for the documents +table so that large text columns (content, source_markdown, +blocknote_document, etc.) are excluded from WAL replication. +This prevents RangeError: Invalid string length in zero-cache's +change-streamer when documents have very large content. + +Also resets REPLICA IDENTITY to DEFAULT on tables that had it set +to FULL for the old Electric SQL setup (migration 66/75/76). +With DEFAULT (primary-key) identity, column-list publications +only need to include the PK — not every column. + +After running this migration you MUST: + 1. Stop zero-cache + 2. Delete / reset the zero-cache data volume + 3. 
Restart zero-cache (it will do a fresh initial sync) + +Revision ID: 117 +Revises: 116 +""" + +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +revision: str = "117" +down_revision: str | None = "116" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +PUBLICATION_NAME = "zero_publication" + +TABLES_WITH_FULL_IDENTITY = [ + "documents", + "notifications", + "search_source_connectors", + "new_chat_messages", + "chat_comments", + "chat_session_state", +] + +DOCUMENT_COLS = [ + "id", + "title", + "document_type", + "search_space_id", + "folder_id", + "created_by_id", + "status", + "created_at", + "updated_at", +] + +PUBLICATION_DDL_FULL = f"""\ +CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE + notifications, documents, folders, + search_source_connectors, new_chat_messages, + chat_comments, chat_session_state +""" + + +def upgrade() -> None: + conn = op.get_bind() + + for tbl in TABLES_WITH_FULL_IDENTITY: + conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY DEFAULT')) + + conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}")) + + has_zero_ver = conn.execute( + sa.text( + "SELECT 1 FROM information_schema.columns " + "WHERE table_name = 'documents' AND column_name = '_0_version'" + ) + ).fetchone() + + cols = DOCUMENT_COLS + (['"_0_version"'] if has_zero_ver else []) + col_list = ", ".join(cols) + + conn.execute( + sa.text( + f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE " + f"notifications, " + f"documents ({col_list}), " + f"folders, " + f"search_source_connectors, " + f"new_chat_messages, " + f"chat_comments, " + f"chat_session_state" + ) + ) + + +def downgrade() -> None: + conn = op.get_bind() + conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}")) + conn.execute(sa.text(PUBLICATION_DDL_FULL)) + for tbl in TABLES_WITH_FULL_IDENTITY: + conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY FULL')) diff --git 
a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py index ccc06f272..fc1e80d28 100644 --- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py +++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py @@ -159,6 +159,7 @@ async def create_surfsense_deep_agent( additional_tools: Sequence[BaseTool] | None = None, firecrawl_api_key: str | None = None, thread_visibility: ChatVisibility | None = None, + mentioned_document_ids: list[int] | None = None, ): """ Create a SurfSense deep agent with configurable tools and prompts. @@ -451,6 +452,7 @@ async def create_surfsense_deep_agent( search_space_id=search_space_id, available_connectors=available_connectors, available_document_types=available_document_types, + mentioned_document_ids=mentioned_document_ids, ), SurfSenseFilesystemMiddleware( search_space_id=search_space_id, diff --git a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py index 41b24f88b..d7697ef15 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py +++ b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py @@ -66,6 +66,16 @@ the ``, identify chunks marked `matched="true"`, then use those sections instead of reading the entire file sequentially. Use `` values as citation IDs in your answers. + +## User-Mentioned Documents + +When the `ls` output tags a file with `[MENTIONED BY USER — read deeply]`, +the user **explicitly selected** that document. These files are your highest- +priority sources: +1. **Always read them thoroughly** — scan the full ``, then read + all major sections, not just matched chunks. +2. **Prefer their content** over other search results when answering. +3. **Cite from them first** whenever applicable. 
""" # ============================================================================= diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py index 3728f229c..7b0dd2f71 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py +++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py @@ -28,7 +28,13 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from app.agents.new_chat.utils import parse_date_or_datetime, resolve_date_range -from app.db import NATIVE_TO_LEGACY_DOCTYPE, Document, Folder, shielded_async_session +from app.db import ( + NATIVE_TO_LEGACY_DOCTYPE, + Chunk, + Document, + Folder, + shielded_async_session, +) from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever from app.utils.document_converters import embed_texts from app.utils.perf import get_perf_logger @@ -430,21 +436,36 @@ async def _get_folder_paths( def _build_synthetic_ls( existing_files: dict[str, Any] | None, new_files: dict[str, Any], + *, + mentioned_paths: set[str] | None = None, ) -> tuple[AIMessage, ToolMessage]: """Build a synthetic ls("/documents") tool-call + result for the LLM context. - Paths are listed with *new* (rank-ordered) files first, then existing files - that were already in state from prior turns. + Mentioned files are listed first. A separate header tells the LLM which + files the user explicitly selected; the path list itself stays clean so + paths can be passed directly to ``read_file`` without stripping tags. 
""" + _mentioned = mentioned_paths or set() merged: dict[str, Any] = {**(existing_files or {}), **new_files} doc_paths = [ p for p, v in merged.items() if p.startswith("/documents/") and v is not None ] new_set = set(new_files) - new_paths = [p for p in doc_paths if p in new_set] + mentioned_list = [p for p in doc_paths if p in _mentioned] + new_non_mentioned = [p for p in doc_paths if p in new_set and p not in _mentioned] old_paths = [p for p in doc_paths if p not in new_set] - ordered = new_paths + old_paths + ordered = mentioned_list + new_non_mentioned + old_paths + + parts: list[str] = [] + if mentioned_list: + parts.append( + "USER-MENTIONED documents (read these thoroughly before answering):" + ) + for p in mentioned_list: + parts.append(f" {p}") + parts.append("") + parts.append(str(ordered) if ordered else "No documents found.") tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}" ai_msg = AIMessage( @@ -452,7 +473,7 @@ def _build_synthetic_ls( tool_calls=[{"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}], ) tool_msg = ToolMessage( - content=str(ordered) if ordered else "No documents found.", + content="\n".join(parts), tool_call_id=tool_call_id, ) return ai_msg, tool_msg @@ -524,12 +545,92 @@ async def search_knowledge_base( return results[:top_k] +async def fetch_mentioned_documents( + *, + document_ids: list[int], + search_space_id: int, +) -> list[dict[str, Any]]: + """Fetch explicitly mentioned documents with *all* their chunks. + + Returns the same dict structure as ``search_knowledge_base`` so results + can be merged directly into ``build_scoped_filesystem``. Unlike search + results, every chunk is included (no top-K limiting) and none are marked + as ``matched`` since the entire document is relevant by virtue of the + user's explicit mention. 
+ """ + if not document_ids: + return [] + + async with shielded_async_session() as session: + doc_result = await session.execute( + select(Document).where( + Document.id.in_(document_ids), + Document.search_space_id == search_space_id, + ) + ) + docs = {doc.id: doc for doc in doc_result.scalars().all()} + + if not docs: + return [] + + chunk_result = await session.execute( + select(Chunk.id, Chunk.content, Chunk.document_id) + .where(Chunk.document_id.in_(list(docs.keys()))) + .order_by(Chunk.document_id, Chunk.id) + ) + chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs} + for row in chunk_result.all(): + if row.document_id in chunks_by_doc: + chunks_by_doc[row.document_id].append( + {"chunk_id": row.id, "content": row.content} + ) + + results: list[dict[str, Any]] = [] + for doc_id in document_ids: + doc = docs.get(doc_id) + if doc is None: + continue + metadata = doc.document_metadata or {} + results.append( + { + "document_id": doc.id, + "content": "", + "score": 1.0, + "chunks": chunks_by_doc.get(doc.id, []), + "matched_chunk_ids": [], + "document": { + "id": doc.id, + "title": doc.title, + "document_type": ( + doc.document_type.value + if getattr(doc, "document_type", None) + else None + ), + "metadata": metadata, + }, + "source": ( + doc.document_type.value + if getattr(doc, "document_type", None) + else None + ), + "_user_mentioned": True, + } + ) + return results + + async def build_scoped_filesystem( *, documents: Sequence[dict[str, Any]], search_space_id: int, -) -> dict[str, dict[str, str]]: - """Build a StateBackend-compatible files dict from search results.""" +) -> tuple[dict[str, dict[str, str]], dict[int, str]]: + """Build a StateBackend-compatible files dict from search results. + + Returns ``(files, doc_id_to_path)`` so callers can reliably map a + document id back to its filesystem path without guessing by title. 
+ Paths are collision-proof: when two documents resolve to the same + path the doc-id is appended to disambiguate. + """ async with shielded_async_session() as session: folder_paths = await _get_folder_paths(session, search_space_id) doc_ids = [ @@ -551,6 +652,7 @@ async def build_scoped_filesystem( } files: dict[str, dict[str, str]] = {} + doc_id_to_path: dict[int, str] = {} for document in documents: doc_meta = document.get("document") or {} title = str(doc_meta.get("title") or "untitled") @@ -559,6 +661,9 @@ async def build_scoped_filesystem( base_folder = folder_paths.get(folder_id, "/documents") file_name = _safe_filename(title) path = f"{base_folder}/{file_name}" + if path in files: + stem = file_name.removesuffix(".xml") + path = f"{base_folder}/{stem} ({doc_id}).xml" matched_ids = set(document.get("matched_chunk_ids") or []) xml_content = _build_document_xml(document, matched_chunk_ids=matched_ids) files[path] = { @@ -567,7 +672,9 @@ async def build_scoped_filesystem( "created_at": "", "modified_at": "", } - return files + if isinstance(doc_id, int): + doc_id_to_path[doc_id] = path + return files, doc_id_to_path class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] @@ -583,12 +690,14 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] available_connectors: list[str] | None = None, available_document_types: list[str] | None = None, top_k: int = 10, + mentioned_document_ids: list[int] | None = None, ) -> None: self.llm = llm self.search_space_id = search_space_id self.available_connectors = available_connectors self.available_document_types = available_document_types self.top_k = top_k + self.mentioned_document_ids = mentioned_document_ids or [] async def _plan_search_inputs( self, @@ -680,6 +789,18 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] user_text=user_text, ) + # --- 1. 
Fetch mentioned documents (user-selected, all chunks) --- + mentioned_results: list[dict[str, Any]] = [] + if self.mentioned_document_ids: + mentioned_results = await fetch_mentioned_documents( + document_ids=self.mentioned_document_ids, + search_space_id=self.search_space_id, + ) + # Clear after first turn so they are not re-fetched on subsequent + # messages within the same agent instance. + self.mentioned_document_ids = [] + + # --- 2. Run KB hybrid search --- search_results = await search_knowledge_base( query=planned_query, search_space_id=self.search_space_id, @@ -689,19 +810,50 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] start_date=start_date, end_date=end_date, ) - new_files = await build_scoped_filesystem( - documents=search_results, + + # --- 3. Merge: mentioned first, then search (dedup by doc id) --- + seen_doc_ids: set[int] = set() + merged: list[dict[str, Any]] = [] + for doc in mentioned_results: + doc_id = (doc.get("document") or {}).get("id") + if doc_id is not None: + seen_doc_ids.add(doc_id) + merged.append(doc) + for doc in search_results: + doc_id = (doc.get("document") or {}).get("id") + if doc_id is not None and doc_id in seen_doc_ids: + continue + merged.append(doc) + + # --- 4. Build scoped filesystem --- + new_files, doc_id_to_path = await build_scoped_filesystem( + documents=merged, search_space_id=self.search_space_id, ) - ai_msg, tool_msg = _build_synthetic_ls(existing_files, new_files) + # Identify which paths belong to user-mentioned documents using + # the authoritative doc_id -> path mapping (no title guessing). 
+ mentioned_doc_ids = { + (d.get("document") or {}).get("id") for d in mentioned_results + } + mentioned_paths = { + doc_id_to_path[did] for did in mentioned_doc_ids if did in doc_id_to_path + } + + ai_msg, tool_msg = _build_synthetic_ls( + existing_files, + new_files, + mentioned_paths=mentioned_paths, + ) if t0 is not None: _perf_log.info( - "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r new_files=%d total=%d", + "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r " + "mentioned=%d new_files=%d total=%d", asyncio.get_event_loop().time() - t0, user_text[:80], planned_query[:120], + len(mentioned_results), len(new_files), len(new_files) + len(existing_files or {}), ) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 6e69218f1..f53c81bb6 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1,7 +1,7 @@ # Force asyncio to use standard event loop before unstructured imports import asyncio -from fastapi import APIRouter, Depends, Form, HTTPException, UploadFile +from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.orm import selectinload @@ -17,6 +17,7 @@ from app.db import ( get_async_session, ) from app.schemas import ( + ChunkRead, DocumentRead, DocumentsCreate, DocumentStatusBatchResponse, @@ -45,9 +46,7 @@ os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1" router = APIRouter() -MAX_FILES_PER_UPLOAD = 10 -MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB per file -MAX_TOTAL_SIZE_BYTES = 200 * 1024 * 1024 # 200 MB total +MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB per file @router.post("/documents") @@ -156,13 +155,6 @@ async def create_documents_file_upload( if not files: raise HTTPException(status_code=400, detail="No files provided") - if len(files) > MAX_FILES_PER_UPLOAD: - raise 
HTTPException( - status_code=413, - detail=f"Too many files. Maximum {MAX_FILES_PER_UPLOAD} files per upload.", - ) - - total_size = 0 for file in files: file_size = file.size or 0 if file_size > MAX_FILE_SIZE_BYTES: @@ -171,14 +163,6 @@ async def create_documents_file_upload( detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) " f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.", ) - total_size += file_size - - if total_size > MAX_TOTAL_SIZE_BYTES: - raise HTTPException( - status_code=413, - detail=f"Total upload size ({total_size / (1024 * 1024):.1f} MB) " - f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.", - ) # ===== Read all files concurrently to avoid blocking the event loop ===== async def _read_and_save(file: UploadFile) -> tuple[str, str, int]: @@ -206,16 +190,6 @@ async def create_documents_file_upload( saved_files = await asyncio.gather(*(_read_and_save(f) for f in files)) - actual_total_size = sum(size for _, _, size in saved_files) - if actual_total_size > MAX_TOTAL_SIZE_BYTES: - for temp_path, _, _ in saved_files: - os.unlink(temp_path) - raise HTTPException( - status_code=413, - detail=f"Total upload size ({actual_total_size / (1024 * 1024):.1f} MB) " - f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.", - ) - # ===== PHASE 1: Create pending documents for all files ===== created_documents: list[Document] = [] files_to_process: list[tuple[Document, str, str]] = [] @@ -451,13 +425,15 @@ async def read_documents( reason=doc.status.get("reason"), ) + raw_content = doc.content or "" api_documents.append( DocumentRead( id=doc.id, title=doc.title, document_type=doc.document_type, document_metadata=doc.document_metadata, - content=doc.content, + content="", + content_preview=raw_content[:300], content_hash=doc.content_hash, unique_identifier_hash=doc.unique_identifier_hash, created_at=doc.created_at, @@ -609,13 +585,15 @@ async def search_documents( reason=doc.status.get("reason"), ) + 
raw_content = doc.content or "" api_documents.append( DocumentRead( id=doc.id, title=doc.title, document_type=doc.document_type, document_metadata=doc.document_metadata, - content=doc.content, + content="", + content_preview=raw_content[:300], content_hash=doc.content_hash, unique_identifier_hash=doc.unique_identifier_hash, created_at=doc.created_at, @@ -884,16 +862,19 @@ async def get_document_type_counts( @router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead) async def get_document_by_chunk_id( chunk_id: int, + chunk_window: int = Query( + 5, ge=0, description="Number of chunks before/after the cited chunk to include" + ), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): """ - Retrieves a document based on a chunk ID, including all its chunks ordered by creation time. - Requires DOCUMENTS_READ permission for the search space. - The document's embedding and chunk embeddings are excluded from the response. + Retrieves a document based on a chunk ID, including a window of chunks around the cited one. + Uses SQL-level pagination to avoid loading all chunks into memory. 
""" try: - # First, get the chunk and verify it exists + from sqlalchemy import and_, func, or_ + chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id)) chunk = chunk_result.scalars().first() @@ -902,11 +883,8 @@ async def get_document_by_chunk_id( status_code=404, detail=f"Chunk with id {chunk_id} not found" ) - # Get the associated document document_result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter(Document.id == chunk.document_id) + select(Document).filter(Document.id == chunk.document_id) ) document = document_result.scalars().first() @@ -916,7 +894,6 @@ async def get_document_by_chunk_id( detail="Document not found", ) - # Check permission for the search space await check_permission( session, user, @@ -925,10 +902,38 @@ async def get_document_by_chunk_id( "You don't have permission to read documents in this search space", ) - # Sort chunks by creation time - sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at) + total_result = await session.execute( + select(func.count()) + .select_from(Chunk) + .filter(Chunk.document_id == document.id) + ) + total_chunks = total_result.scalar() or 0 + + cited_idx_result = await session.execute( + select(func.count()) + .select_from(Chunk) + .filter( + Chunk.document_id == document.id, + or_( + Chunk.created_at < chunk.created_at, + and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id), + ), + ) + ) + cited_idx = cited_idx_result.scalar() or 0 + + start = max(0, cited_idx - chunk_window) + end = min(total_chunks, cited_idx + chunk_window + 1) + + windowed_result = await session.execute( + select(Chunk) + .filter(Chunk.document_id == document.id) + .order_by(Chunk.created_at, Chunk.id) + .offset(start) + .limit(end - start) + ) + windowed_chunks = windowed_result.scalars().all() - # Return the document with its chunks return DocumentWithChunksRead( id=document.id, title=document.title, @@ -940,7 +945,9 @@ async def 
get_document_by_chunk_id( created_at=document.created_at, updated_at=document.updated_at, search_space_id=document.search_space_id, - chunks=sorted_chunks, + chunks=windowed_chunks, + total_chunks=total_chunks, + chunk_start_index=start, ) except HTTPException: raise @@ -950,6 +957,75 @@ async def get_document_by_chunk_id( ) from e +@router.get( + "/documents/{document_id}/chunks", + response_model=PaginatedResponse[ChunkRead], +) +async def get_document_chunks_paginated( + document_id: int, + page: int = Query(0, ge=0), + page_size: int = Query(20, ge=1, le=100), + start_offset: int | None = Query( + None, ge=0, description="Direct offset; overrides page * page_size" + ), + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + Paginated chunk loading for a document. + Supports both page-based and offset-based access. + """ + try: + from sqlalchemy import func + + doc_result = await session.execute( + select(Document).filter(Document.id == document_id) + ) + document = doc_result.scalars().first() + + if not document: + raise HTTPException(status_code=404, detail="Document not found") + + await check_permission( + session, + user, + document.search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to read documents in this search space", + ) + + total_result = await session.execute( + select(func.count()) + .select_from(Chunk) + .filter(Chunk.document_id == document_id) + ) + total = total_result.scalar() or 0 + + offset = start_offset if start_offset is not None else page * page_size + chunks_result = await session.execute( + select(Chunk) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.created_at, Chunk.id) + .offset(offset) + .limit(page_size) + ) + chunks = chunks_result.scalars().all() + + return PaginatedResponse( + items=chunks, + total=total, + page=offset // page_size if page_size else page, + page_size=page_size, + has_more=(offset + len(chunks)) < total, + ) + 
except HTTPException: + raise + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Failed to fetch chunks: {e!s}" + ) from e + + @router.get("/documents/{document_id}", response_model=DocumentRead) async def read_document( document_id: int, @@ -980,13 +1056,14 @@ async def read_document( "You don't have permission to read documents in this search space", ) - # Convert database object to API-friendly format + raw_content = document.content or "" return DocumentRead( id=document.id, title=document.title, document_type=document.document_type, document_metadata=document.document_metadata, - content=document.content, + content=raw_content, + content_preview=raw_content[:300], content_hash=document.content_hash, unique_identifier_hash=document.unique_identifier_hash, created_at=document.created_at, diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py index f54f18def..09a35c619 100644 --- a/surfsense_backend/app/routes/editor_routes.py +++ b/surfsense_backend/app/routes/editor_routes.py @@ -15,11 +15,10 @@ import pypandoc import typst from fastapi import APIRouter, Depends, HTTPException, Query from fastapi.responses import StreamingResponse -from sqlalchemy import select +from sqlalchemy import func, select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import selectinload -from app.db import Document, DocumentType, Permission, User, get_async_session +from app.db import Chunk, Document, DocumentType, Permission, User, get_async_session from app.routes.reports_routes import ( _FILE_EXTENSIONS, _MEDIA_TYPES, @@ -44,6 +43,9 @@ router = APIRouter() async def get_editor_content( search_space_id: int, document_id: int, + max_length: int | None = Query( + None, description="Truncate source_markdown to this many characters" + ), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): @@ -65,9 +67,7 @@ async def get_editor_content( ) result 
= await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter( + select(Document).filter( Document.id == document_id, Document.search_space_id == search_space_id, ) @@ -77,62 +77,63 @@ async def get_editor_content( if not document: raise HTTPException(status_code=404, detail="Document not found") - # Priority 1: Return source_markdown if it exists (check `is not None` to allow empty strings) - if document.source_markdown is not None: + count_result = await session.execute( + select(func.count()).select_from(Chunk).filter(Chunk.document_id == document_id) + ) + chunk_count = count_result.scalar() or 0 + + def _build_response(md: str) -> dict: + size_bytes = len(md.encode("utf-8")) + truncated = False + output_md = md + if max_length is not None and size_bytes > max_length: + output_md = md[:max_length] + truncated = True return { "document_id": document.id, "title": document.title, "document_type": document.document_type.value, - "source_markdown": document.source_markdown, + "source_markdown": output_md, + "content_size_bytes": size_bytes, + "chunk_count": chunk_count, + "truncated": truncated, "updated_at": document.updated_at.isoformat() if document.updated_at else None, } - # Priority 2: Lazy-migrate from blocknote_document (pure Python, no external deps) + if document.source_markdown is not None: + return _build_response(document.source_markdown) + if document.blocknote_document: from app.utils.blocknote_to_markdown import blocknote_to_markdown markdown = blocknote_to_markdown(document.blocknote_document) if markdown: - # Persist the migration so we don't repeat it document.source_markdown = markdown await session.commit() - return { - "document_id": document.id, - "title": document.title, - "document_type": document.document_type.value, - "source_markdown": markdown, - "updated_at": document.updated_at.isoformat() - if document.updated_at - else None, - } + return _build_response(markdown) - # Priority 3: For NOTE type with no 
content, return empty markdown if document.document_type == DocumentType.NOTE: empty_markdown = "" document.source_markdown = empty_markdown await session.commit() - return { - "document_id": document.id, - "title": document.title, - "document_type": document.document_type.value, - "source_markdown": empty_markdown, - "updated_at": document.updated_at.isoformat() - if document.updated_at - else None, - } + return _build_response(empty_markdown) - # Priority 4: Reconstruct from chunks - chunks = sorted(document.chunks, key=lambda c: c.id) + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() - if not chunks: + if not chunk_contents: raise HTTPException( status_code=400, detail="This document has no content and cannot be edited. Please re-upload to enable editing.", ) - markdown_content = "\n\n".join(chunk.content for chunk in chunks) + markdown_content = "\n\n".join(chunk_contents) if not markdown_content.strip(): raise HTTPException( @@ -140,17 +141,77 @@ async def get_editor_content( detail="This document has empty content and cannot be edited.", ) - # Persist the lazy migration document.source_markdown = markdown_content await session.commit() - return { - "document_id": document.id, - "title": document.title, - "document_type": document.document_type.value, - "source_markdown": markdown_content, - "updated_at": document.updated_at.isoformat() if document.updated_at else None, - } + return _build_response(markdown_content) + + +@router.get( + "/search-spaces/{search_space_id}/documents/{document_id}/download-markdown" +) +async def download_document_markdown( + search_space_id: int, + document_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + Download the full document content as a .md file. + Reconstructs markdown from source_markdown or chunks. 
+ """ + await check_permission( + session, + user, + search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to read documents in this search space", + ) + + result = await session.execute( + select(Document).filter( + Document.id == document_id, + Document.search_space_id == search_space_id, + ) + ) + document = result.scalars().first() + + if not document: + raise HTTPException(status_code=404, detail="Document not found") + + markdown: str | None = document.source_markdown + if markdown is None and document.blocknote_document: + from app.utils.blocknote_to_markdown import blocknote_to_markdown + + markdown = blocknote_to_markdown(document.blocknote_document) + if markdown is None: + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() + if chunk_contents: + markdown = "\n\n".join(chunk_contents) + + if not markdown or not markdown.strip(): + raise HTTPException( + status_code=400, detail="Document has no content to download" + ) + + safe_title = ( + "".join( + c if c.isalnum() or c in " -_" else "_" + for c in (document.title or "document") + ).strip()[:80] + or "document" + ) + + return StreamingResponse( + io.BytesIO(markdown.encode("utf-8")), + media_type="text/markdown; charset=utf-8", + headers={"Content-Disposition": f'attachment; filename="{safe_title}.md"'}, + ) @router.post("/search-spaces/{search_space_id}/documents/{document_id}/save") @@ -258,9 +319,7 @@ async def export_document( ) result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter( + select(Document).filter( Document.id == document_id, Document.search_space_id == search_space_id, ) @@ -269,16 +328,20 @@ async def export_document( if not document: raise HTTPException(status_code=404, detail="Document not found") - # Resolve markdown content (same priority as editor-content 
endpoint) markdown_content: str | None = document.source_markdown if markdown_content is None and document.blocknote_document: from app.utils.blocknote_to_markdown import blocknote_to_markdown markdown_content = blocknote_to_markdown(document.blocknote_document) if markdown_content is None: - chunks = sorted(document.chunks, key=lambda c: c.id) - if chunks: - markdown_content = "\n\n".join(chunk.content for chunk in chunks) + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() + if chunk_contents: + markdown_content = "\n\n".join(chunk_contents) if not markdown_content or not markdown_content.strip(): raise HTTPException(status_code=400, detail="Document has no content to export") diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py index c022a09d2..49d2836b2 100644 --- a/surfsense_backend/app/schemas/documents.py +++ b/surfsense_backend/app/schemas/documents.py @@ -53,25 +53,26 @@ class DocumentRead(BaseModel): title: str document_type: DocumentType document_metadata: dict - content: str # Changed to string to match frontend + content: str = "" + content_preview: str = "" content_hash: str unique_identifier_hash: str | None created_at: datetime updated_at: datetime | None search_space_id: int folder_id: int | None = None - created_by_id: UUID | None = None # User who created/uploaded this document + created_by_id: UUID | None = None created_by_name: str | None = None created_by_email: str | None = None - status: DocumentStatusSchema | None = ( - None # Processing status (ready, processing, failed) - ) + status: DocumentStatusSchema | None = None model_config = ConfigDict(from_attributes=True) class DocumentWithChunksRead(DocumentRead): chunks: list[ChunkRead] = [] + total_chunks: int = 0 + chunk_start_index: int = 0 model_config = ConfigDict(from_attributes=True) diff --git 
a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py index 7c1e3b7ea..5ff907459 100644 --- a/surfsense_backend/app/tasks/chat/stream_new_chat.py +++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py @@ -39,7 +39,6 @@ from app.agents.new_chat.llm_config import ( ) from app.db import ( ChatVisibility, - Document, NewChatMessage, NewChatThread, Report, @@ -63,74 +62,6 @@ _perf_log = get_perf_logger() _background_tasks: set[asyncio.Task] = set() -def format_mentioned_documents_as_context(documents: list[Document]) -> str: - """ - Format mentioned documents as context for the agent. - - Uses the same XML structure as knowledge_base.format_documents_for_context - to ensure citations work properly with chunk IDs. - """ - if not documents: - return "" - - context_parts = [""] - context_parts.append( - "The user has explicitly mentioned the following documents from their knowledge base. " - "These documents are directly relevant to the query and should be prioritized as primary sources. " - "Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])." 
- ) - context_parts.append("") - - for doc in documents: - # Build metadata JSON - metadata = doc.document_metadata or {} - metadata_json = json.dumps(metadata, ensure_ascii=False) - - # Get URL from metadata - url = ( - metadata.get("url") - or metadata.get("source") - or metadata.get("page_url") - or "" - ) - - context_parts.append("") - context_parts.append("") - context_parts.append(f" {doc.id}") - context_parts.append( - f" {doc.document_type.value}" - ) - context_parts.append(f" <![CDATA[{doc.title}]]>") - context_parts.append(f" ") - context_parts.append( - f" " - ) - context_parts.append("") - context_parts.append("") - context_parts.append("") - - # Use chunks if available (preferred for proper citations) - if hasattr(doc, "chunks") and doc.chunks: - for chunk in doc.chunks: - context_parts.append( - f" " - ) - else: - # Fallback to document content if chunks not loaded - # Use document ID as chunk ID prefix for consistency - context_parts.append( - f" " - ) - - context_parts.append("") - context_parts.append("") - context_parts.append("") - - context_parts.append("") - - return "\n".join(context_parts) - - def format_mentioned_surfsense_docs_as_context( documents: list[SurfsenseDocsDocument], ) -> str: @@ -1317,6 +1248,7 @@ async def stream_new_chat( firecrawl_api_key=firecrawl_api_key, thread_visibility=visibility, disabled_tools=disabled_tools, + mentioned_document_ids=mentioned_document_ids, ) _perf_log.info( "[stream_new_chat] Agent created in %.3fs", time.perf_counter() - _t0 @@ -1340,18 +1272,9 @@ async def stream_new_chat( thread.needs_history_bootstrap = False await session.commit() - # Fetch mentioned documents if any (with chunks for proper citations) - mentioned_documents: list[Document] = [] - if mentioned_document_ids: - result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter( - Document.id.in_(mentioned_document_ids), - Document.search_space_id == search_space_id, - ) - ) - mentioned_documents = 
list(result.scalars().all()) + # Mentioned KB documents are now handled by KnowledgeBaseSearchMiddleware + # which merges them into the scoped filesystem with full document + # structure. Only SurfSense docs and report context are inlined here. # Fetch mentioned SurfSense docs if any mentioned_surfsense_docs: list[SurfsenseDocsDocument] = [] @@ -1379,15 +1302,10 @@ async def stream_new_chat( ) recent_reports = list(recent_reports_result.scalars().all()) - # Format the user query with context (mentioned documents + SurfSense docs) + # Format the user query with context (SurfSense docs + reports only) final_query = user_query context_parts = [] - if mentioned_documents: - context_parts.append( - format_mentioned_documents_as_context(mentioned_documents) - ) - if mentioned_surfsense_docs: context_parts.append( format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs) @@ -1479,7 +1397,7 @@ async def stream_new_chat( yield streaming_service.format_start_step() # Initial thinking step - analyzing the request - if mentioned_documents or mentioned_surfsense_docs: + if mentioned_surfsense_docs: initial_title = "Analyzing referenced content" action_verb = "Analyzing" else: @@ -1490,18 +1408,6 @@ async def stream_new_chat( query_text = user_query[:80] + ("..." if len(user_query) > 80 else "") processing_parts.append(query_text) - if mentioned_documents: - doc_names = [] - for doc in mentioned_documents: - title = doc.title - if len(title) > 30: - title = title[:27] + "..." - doc_names.append(title) - if len(doc_names) == 1: - processing_parts.append(f"[{doc_names[0]}]") - else: - processing_parts.append(f"[{len(doc_names)} documents]") - if mentioned_surfsense_docs: doc_names = [] for doc in mentioned_surfsense_docs: @@ -1527,7 +1433,7 @@ async def stream_new_chat( # These ORM objects (with eagerly-loaded chunks) can be very large. # They're only needed to build context strings already copied into # final_query / langchain_messages — release them before streaming. 
- del mentioned_documents, mentioned_surfsense_docs, recent_reports + del mentioned_surfsense_docs, recent_reports del langchain_messages, final_query # Check if this is the first assistant response so we can generate diff --git a/surfsense_backend/app/tasks/document_processors/__init__.py b/surfsense_backend/app/tasks/document_processors/__init__.py index e70c41cb4..2b5690d02 100644 --- a/surfsense_backend/app/tasks/document_processors/__init__.py +++ b/surfsense_backend/app/tasks/document_processors/__init__.py @@ -12,16 +12,14 @@ Available processors: - YouTube processor: Process YouTube videos and extract transcripts """ -# URL crawler # Extension processor -from .extension_processor import add_extension_received_document - -# File processors -from .file_processors import ( +# File processors (backward-compatible re-exports from _save) +from ._save import ( add_received_file_document_using_docling, add_received_file_document_using_llamacloud, add_received_file_document_using_unstructured, ) +from .extension_processor import add_extension_received_document # Markdown processor from .markdown_processor import add_received_markdown_file_document @@ -32,9 +30,9 @@ from .youtube_processor import add_youtube_video_document __all__ = [ # Extension processing "add_extension_received_document", + # File processing with different ETL services "add_received_file_document_using_docling", "add_received_file_document_using_llamacloud", - # File processing with different ETL services "add_received_file_document_using_unstructured", # Markdown file processing "add_received_markdown_file_document", diff --git a/surfsense_backend/app/tasks/document_processors/_constants.py b/surfsense_backend/app/tasks/document_processors/_constants.py new file mode 100644 index 000000000..f74d7acce --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_constants.py @@ -0,0 +1,74 @@ +""" +Constants for file document processing. 
+ +Centralizes file type classification, LlamaCloud retry configuration, +and timeout calculation parameters. +""" + +import ssl +from enum import Enum + +import httpx + +# --------------------------------------------------------------------------- +# File type classification +# --------------------------------------------------------------------------- + +MARKDOWN_EXTENSIONS = (".md", ".markdown", ".txt") +AUDIO_EXTENSIONS = (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm") +DIRECT_CONVERT_EXTENSIONS = (".csv", ".tsv", ".html", ".htm") + + +class FileCategory(Enum): + MARKDOWN = "markdown" + AUDIO = "audio" + DIRECT_CONVERT = "direct_convert" + DOCUMENT = "document" + + +def classify_file(filename: str) -> FileCategory: + """Classify a file by its extension into a processing category.""" + lower = filename.lower() + if lower.endswith(MARKDOWN_EXTENSIONS): + return FileCategory.MARKDOWN + if lower.endswith(AUDIO_EXTENSIONS): + return FileCategory.AUDIO + if lower.endswith(DIRECT_CONVERT_EXTENSIONS): + return FileCategory.DIRECT_CONVERT + return FileCategory.DOCUMENT + + +# --------------------------------------------------------------------------- +# LlamaCloud retry configuration +# --------------------------------------------------------------------------- + +LLAMACLOUD_MAX_RETRIES = 5 +LLAMACLOUD_BASE_DELAY = 10 # seconds (exponential backoff base) +LLAMACLOUD_MAX_DELAY = 120 # max delay between retries (2 minutes) +LLAMACLOUD_RETRYABLE_EXCEPTIONS = ( + ssl.SSLError, + httpx.ConnectError, + httpx.ConnectTimeout, + httpx.ReadError, + httpx.ReadTimeout, + httpx.WriteError, + httpx.WriteTimeout, + httpx.RemoteProtocolError, + httpx.LocalProtocolError, + ConnectionError, + ConnectionResetError, + TimeoutError, + OSError, +) + +# --------------------------------------------------------------------------- +# Timeout calculation constants +# --------------------------------------------------------------------------- + +UPLOAD_BYTES_PER_SECOND_SLOW = ( + 100 * 
1024 +) # 100 KB/s (conservative for slow connections) +MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file +MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files +BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing +PER_PAGE_JOB_TIMEOUT = 60 # 1 minute per page for processing diff --git a/surfsense_backend/app/tasks/document_processors/_direct_converters.py b/surfsense_backend/app/tasks/document_processors/_direct_converters.py new file mode 100644 index 000000000..b1a69ef4f --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_direct_converters.py @@ -0,0 +1,90 @@ +""" +Lossless file-to-markdown converters for text-based formats. + +These converters handle file types that can be faithfully represented as +markdown without any external ETL/OCR service: + +- CSV / TSV → markdown table (stdlib ``csv``) +- HTML / HTM → markdown (``markdownify``) +""" + +from __future__ import annotations + +import csv +from collections.abc import Callable +from pathlib import Path + +from markdownify import markdownify + +# The stdlib csv module defaults to a 128 KB field-size limit which is too +# small for real-world exports (e.g. chat logs, CRM dumps). We raise it once +# at import time so every csv.reader call in this module can handle large fields. +csv.field_size_limit(2**31 - 1) + + +def _escape_pipe(cell: str) -> str: + """Escape literal pipe characters inside a markdown table cell.""" + return cell.replace("|", "\\|") + + +def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str: + """Convert a CSV (or TSV) file to a markdown table. + + The first row is treated as the header. An empty file returns an + empty string so the caller can decide how to handle it. 
+ """ + with open(file_path, encoding="utf-8", newline="") as fh: + reader = csv.reader(fh, delimiter=delimiter) + rows = list(reader) + + if not rows: + return "" + + header, *body = rows + col_count = len(header) + + lines: list[str] = [] + + header_cells = [_escape_pipe(c.strip()) for c in header] + lines.append("| " + " | ".join(header_cells) + " |") + lines.append("| " + " | ".join(["---"] * col_count) + " |") + + for row in body: + padded = row + [""] * (col_count - len(row)) + cells = [_escape_pipe(c.strip()) for c in padded[:col_count]] + lines.append("| " + " | ".join(cells) + " |") + + return "\n".join(lines) + "\n" + + +def tsv_to_markdown(file_path: str) -> str: + """Convert a TSV file to a markdown table.""" + return csv_to_markdown(file_path, delimiter="\t") + + +def html_to_markdown(file_path: str) -> str: + """Convert an HTML file to markdown via ``markdownify``.""" + html = Path(file_path).read_text(encoding="utf-8") + return markdownify(html).strip() + + +_CONVERTER_MAP: dict[str, Callable[..., str]] = { + ".csv": csv_to_markdown, + ".tsv": tsv_to_markdown, + ".html": html_to_markdown, + ".htm": html_to_markdown, +} + + +def convert_file_directly(file_path: str, filename: str) -> str: + """Dispatch to the appropriate lossless converter based on file extension. + + Raises ``ValueError`` if the extension is not supported. + """ + suffix = Path(filename).suffix.lower() + converter = _CONVERTER_MAP.get(suffix) + if converter is None: + raise ValueError( + f"No direct converter for extension '{suffix}' (file: {filename})" + ) + return converter(file_path) diff --git a/surfsense_backend/app/tasks/document_processors/_etl.py b/surfsense_backend/app/tasks/document_processors/_etl.py new file mode 100644 index 000000000..cc3a8b1ac --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_etl.py @@ -0,0 +1,209 @@ +""" +ETL parsing strategies for different document processing services. 
+ +Provides parse functions for Unstructured, LlamaCloud, and Docling, along with +LlamaCloud retry logic and dynamic timeout calculations. +""" + +import asyncio +import logging +import os +import random +import warnings +from logging import ERROR, getLogger + +import httpx + +from app.config import config as app_config +from app.db import Log +from app.services.task_logging_service import TaskLoggingService + +from ._constants import ( + LLAMACLOUD_BASE_DELAY, + LLAMACLOUD_MAX_DELAY, + LLAMACLOUD_MAX_RETRIES, + LLAMACLOUD_RETRYABLE_EXCEPTIONS, + PER_PAGE_JOB_TIMEOUT, +) +from ._helpers import calculate_job_timeout, calculate_upload_timeout + +# --------------------------------------------------------------------------- +# LlamaCloud parsing with retry +# --------------------------------------------------------------------------- + + +async def parse_with_llamacloud_retry( + file_path: str, + estimated_pages: int, + task_logger: TaskLoggingService | None = None, + log_entry: Log | None = None, +): + """ + Parse a file with LlamaCloud with retry logic for transient SSL/connection errors. + + Uses dynamic timeout calculations based on file size and page count to handle + very large files reliably. 
+ + Returns: + LlamaParse result object + + Raises: + Exception: If all retries fail + """ + from llama_cloud_services import LlamaParse + from llama_cloud_services.parse.utils import ResultType + + file_size_bytes = os.path.getsize(file_path) + file_size_mb = file_size_bytes / (1024 * 1024) + + upload_timeout = calculate_upload_timeout(file_size_bytes) + job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes) + + custom_timeout = httpx.Timeout( + connect=120.0, + read=upload_timeout, + write=upload_timeout, + pool=120.0, + ) + + logging.info( + f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, " + f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, " + f"job_timeout={job_timeout:.0f}s" + ) + + last_exception = None + attempt_errors: list[str] = [] + + for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1): + try: + async with httpx.AsyncClient(timeout=custom_timeout) as custom_client: + parser = LlamaParse( + api_key=app_config.LLAMA_CLOUD_API_KEY, + num_workers=1, + verbose=True, + language="en", + result_type=ResultType.MD, + max_timeout=int(max(2000, job_timeout + upload_timeout)), + job_timeout_in_seconds=job_timeout, + job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT, + custom_client=custom_client, + ) + result = await parser.aparse(file_path) + + if attempt > 1: + logging.info( + f"LlamaCloud upload succeeded on attempt {attempt} after " + f"{len(attempt_errors)} failures" + ) + return result + + except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e: + last_exception = e + error_type = type(e).__name__ + error_msg = str(e)[:200] + attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}") + + if attempt < LLAMACLOUD_MAX_RETRIES: + base_delay = min( + LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), + LLAMACLOUD_MAX_DELAY, + ) + jitter = base_delay * 0.25 * (2 * random.random() - 1) + delay = base_delay + jitter + + if task_logger and log_entry: + await task_logger.log_task_progress( + log_entry, + 
f"LlamaCloud upload failed " + f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), " + f"retrying in {delay:.0f}s", + { + "error_type": error_type, + "error_message": error_msg, + "attempt": attempt, + "retry_delay": delay, + "file_size_mb": round(file_size_mb, 1), + "upload_timeout": upload_timeout, + }, + ) + else: + logging.warning( + f"LlamaCloud upload failed " + f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): " + f"{error_type}. File: {file_size_mb:.1f}MB. " + f"Retrying in {delay:.0f}s..." + ) + + await asyncio.sleep(delay) + else: + logging.error( + f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} " + f"attempts. File size: {file_size_mb:.1f}MB, " + f"Pages: {estimated_pages}. " + f"Errors: {'; '.join(attempt_errors)}" + ) + + except Exception: + raise + + raise last_exception or RuntimeError( + f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. " + f"File size: {file_size_mb:.1f}MB" + ) + + +# --------------------------------------------------------------------------- +# Per-service parse functions +# --------------------------------------------------------------------------- + + +async def parse_with_unstructured(file_path: str): + """ + Parse a file using the Unstructured ETL service. + + Returns: + List of LangChain Document elements. + """ + from langchain_unstructured import UnstructuredLoader + + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + return await loader.aload() + + +async def parse_with_docling(file_path: str, filename: str) -> str: + """ + Parse a file using the Docling ETL service (via the Docling service wrapper). + + Returns: + Markdown content string. 
+ """ + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + + pdfminer_logger = getLogger("pdfminer") + original_level = pdfminer_logger.level + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer") + warnings.filterwarnings( + "ignore", message=".*Cannot set gray non-stroke color.*" + ) + warnings.filterwarnings("ignore", message=".*invalid float value.*") + pdfminer_logger.setLevel(ERROR) + + try: + result = await docling_service.process_document(file_path, filename) + finally: + pdfminer_logger.setLevel(original_level) + + return result["content"] diff --git a/surfsense_backend/app/tasks/document_processors/_helpers.py b/surfsense_backend/app/tasks/document_processors/_helpers.py new file mode 100644 index 000000000..7ac05932c --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_helpers.py @@ -0,0 +1,218 @@ +""" +Document helper functions for deduplication, migration, and connector updates. + +Provides reusable logic shared across file processors and ETL strategies. +""" + +import logging + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document, DocumentStatus, DocumentType +from app.utils.document_converters import generate_unique_identifier_hash + +from ._constants import ( + BASE_JOB_TIMEOUT, + MAX_UPLOAD_TIMEOUT, + MIN_UPLOAD_TIMEOUT, + PER_PAGE_JOB_TIMEOUT, + UPLOAD_BYTES_PER_SECOND_SLOW, +) +from .base import ( + check_document_by_unique_identifier, + check_duplicate_document, +) + +# --------------------------------------------------------------------------- +# Unique identifier helpers +# --------------------------------------------------------------------------- + + +def get_google_drive_unique_identifier( + connector: dict | None, + filename: str, + search_space_id: int, +) -> tuple[str, str | None]: + """ + Get unique identifier hash, using file_id for Google Drive (stable across renames). 
+ + Returns: + Tuple of (primary_hash, legacy_hash or None). + For Google Drive: (file_id-based hash, filename-based hash for migration). + For other sources: (filename-based hash, None). + """ + if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: + metadata = connector.get("metadata", {}) + file_id = metadata.get("google_drive_file_id") + + if file_id: + primary_hash = generate_unique_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id + ) + legacy_hash = generate_unique_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id + ) + return primary_hash, legacy_hash + + primary_hash = generate_unique_identifier_hash( + DocumentType.FILE, filename, search_space_id + ) + return primary_hash, None + + +# --------------------------------------------------------------------------- +# Document deduplication and migration +# --------------------------------------------------------------------------- + + +async def handle_existing_document_update( + session: AsyncSession, + existing_document: Document, + content_hash: str, + connector: dict | None, + filename: str, + primary_hash: str, +) -> tuple[bool, Document | None]: + """ + Handle update logic for an existing document. 
+ + Returns: + Tuple of (should_skip_processing, document_to_return): + - (True, document): Content unchanged, return existing document + - (False, None): Content changed, needs re-processing + """ + if existing_document.unique_identifier_hash != primary_hash: + existing_document.unique_identifier_hash = primary_hash + logging.info(f"Migrated document to file_id-based identifier: {filename}") + + if existing_document.content_hash == content_hash: + if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: + connector_metadata = connector.get("metadata", {}) + new_name = connector_metadata.get("google_drive_file_name") + doc_metadata = existing_document.document_metadata or {} + old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get( + "google_drive_file_name" + ) + + if new_name and old_name and old_name != new_name: + from sqlalchemy.orm.attributes import flag_modified + + existing_document.title = new_name + if not existing_document.document_metadata: + existing_document.document_metadata = {} + existing_document.document_metadata["FILE_NAME"] = new_name + existing_document.document_metadata["google_drive_file_name"] = new_name + flag_modified(existing_document, "document_metadata") + await session.commit() + logging.info( + f"File renamed in Google Drive: '{old_name}' → '{new_name}' " + f"(no re-processing needed)" + ) + + logging.info(f"Document for file {filename} unchanged. Skipping.") + return True, existing_document + + # Content has changed — guard against content_hash collision before + # expensive ETL processing. + collision_doc = await check_duplicate_document(session, content_hash) + if collision_doc and collision_doc.id != existing_document.id: + logging.warning( + "Content-hash collision for %s: identical content exists in " + "document #%s (%s). 
Skipping re-processing.", + filename, + collision_doc.id, + collision_doc.document_type, + ) + if DocumentStatus.is_state( + existing_document.status, DocumentStatus.PENDING + ) or DocumentStatus.is_state( + existing_document.status, DocumentStatus.PROCESSING + ): + await session.delete(existing_document) + await session.commit() + return True, None + + return True, existing_document + + logging.info(f"Content changed for file {filename}. Updating document.") + return False, None + + +async def find_existing_document_with_migration( + session: AsyncSession, + primary_hash: str, + legacy_hash: str | None, + content_hash: str | None = None, +) -> Document | None: + """ + Find existing document, checking primary hash, legacy hash, and content_hash. + + Supports migration from filename-based to file_id-based hashing for + Google Drive files, with content_hash fallback for cross-source dedup. + """ + existing_document = await check_document_by_unique_identifier(session, primary_hash) + + if not existing_document and legacy_hash: + existing_document = await check_document_by_unique_identifier( + session, legacy_hash + ) + if existing_document: + logging.info( + "Found legacy document (filename-based hash), " + "will migrate to file_id-based hash" + ) + + if not existing_document and content_hash: + existing_document = await check_duplicate_document(session, content_hash) + if existing_document: + logging.info( + f"Found duplicate content from different source (content_hash match). 
" + f"Original document ID: {existing_document.id}, " + f"type: {existing_document.document_type}" + ) + + return existing_document + + +# --------------------------------------------------------------------------- +# Connector helpers +# --------------------------------------------------------------------------- + + +async def update_document_from_connector( + document: Document | None, + connector: dict | None, + session: AsyncSession, +) -> None: + """Update document type, metadata, and connector_id from connector info.""" + if not document or not connector: + return + if "type" in connector: + document.document_type = connector["type"] + if "metadata" in connector: + if not document.document_metadata: + document.document_metadata = connector["metadata"] + else: + merged = {**document.document_metadata, **connector["metadata"]} + document.document_metadata = merged + if "connector_id" in connector: + document.connector_id = connector["connector_id"] + await session.commit() + + +# --------------------------------------------------------------------------- +# Timeout calculations +# --------------------------------------------------------------------------- + + +def calculate_upload_timeout(file_size_bytes: int) -> float: + """Calculate upload timeout based on file size (conservative for slow connections).""" + estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5 + return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT)) + + +def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float: + """Calculate job processing timeout based on page count and file size.""" + page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT) + size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60 + return max(page_based_timeout, size_based_timeout) diff --git a/surfsense_backend/app/tasks/document_processors/_save.py b/surfsense_backend/app/tasks/document_processors/_save.py new 
file mode 100644 index 000000000..5088ad004 --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_save.py @@ -0,0 +1,285 @@ +""" +Unified document save/update logic for file processors. + +Replaces the three nearly-identical ``add_received_file_document_using_*`` +functions with a single ``save_file_document`` function plus thin wrappers +for backward compatibility. +""" + +import logging + +from langchain_core.documents import Document as LangChainDocument +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document, DocumentStatus, DocumentType +from app.services.llm_service import get_user_long_context_llm +from app.utils.document_converters import ( + create_document_chunks, + embed_text, + generate_content_hash, + generate_document_summary, +) + +from ._helpers import ( + find_existing_document_with_migration, + get_google_drive_unique_identifier, + handle_existing_document_update, +) +from .base import get_current_timestamp, safe_set_chunks + +# --------------------------------------------------------------------------- +# Summary generation +# --------------------------------------------------------------------------- + + +async def _generate_summary( + markdown_content: str, + file_name: str, + etl_service: str, + user_llm, + enable_summary: bool, +) -> tuple[str, list[float]]: + """ + Generate a document summary and embedding. + + Docling uses its own large-document summary strategy; other ETL services + use the standard ``generate_document_summary`` helper. 
+ """ + if not enable_summary: + summary = f"File: {file_name}\n\n{markdown_content[:4000]}" + return summary, embed_text(summary) + + if etl_service == "DOCLING": + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + summary_text = await docling_service.process_large_document_summary( + content=markdown_content, llm=user_llm, document_title=file_name + ) + + meta = { + "file_name": file_name, + "etl_service": etl_service, + "document_type": "File Document", + } + parts = ["# DOCUMENT METADATA"] + for key, value in meta.items(): + if value: + formatted_key = key.replace("_", " ").title() + parts.append(f"**{formatted_key}:** {value}") + + enhanced = "\n".join(parts) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text + return enhanced, embed_text(enhanced) + + # Standard summary (Unstructured / LlamaCloud / others) + meta = { + "file_name": file_name, + "etl_service": etl_service, + "document_type": "File Document", + } + return await generate_document_summary(markdown_content, user_llm, meta) + + +# --------------------------------------------------------------------------- +# Unified save function +# --------------------------------------------------------------------------- + + +async def save_file_document( + session: AsyncSession, + file_name: str, + markdown_content: str, + search_space_id: int, + user_id: str, + etl_service: str, + connector: dict | None = None, + enable_summary: bool = True, +) -> Document | None: + """ + Process and store a file document with deduplication and migration support. + + Handles both creating new documents and updating existing ones. This is + the single implementation behind the per-ETL-service wrapper functions. 
+ + Args: + session: Database session + file_name: Name of the processed file + markdown_content: Markdown content to store + search_space_id: ID of the search space + user_id: ID of the user + etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING) + connector: Optional connector info for Google Drive files + enable_summary: Whether to generate an AI summary + + Returns: + Document object if successful, None if duplicate detected + """ + try: + primary_hash, legacy_hash = get_google_drive_unique_identifier( + connector, file_name, search_space_id + ) + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_document = await find_existing_document_with_migration( + session, primary_hash, legacy_hash, content_hash + ) + + if existing_document: + should_skip, doc = await handle_existing_document_update( + session, + existing_document, + content_hash, + connector, + file_name, + primary_hash, + ) + if should_skip: + return doc + + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + if not user_llm: + raise RuntimeError( + f"No long context LLM configured for user {user_id} " + f"in search space {search_space_id}" + ) + + summary_content, summary_embedding = await _generate_summary( + markdown_content, file_name, etl_service, user_llm, enable_summary + ) + chunks = await create_document_chunks(markdown_content) + doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service} + + if existing_document: + existing_document.title = file_name + existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = doc_metadata + await safe_set_chunks(session, existing_document, chunks) + existing_document.source_markdown = markdown_content + existing_document.content_needs_reindexing = False + existing_document.updated_at = get_current_timestamp() + existing_document.status = 
DocumentStatus.ready() + + await session.commit() + await session.refresh(existing_document) + return existing_document + + doc_type = DocumentType.FILE + if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: + doc_type = DocumentType.GOOGLE_DRIVE_FILE + + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=doc_type, + document_metadata=doc_metadata, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + unique_identifier_hash=primary_hash, + source_markdown=markdown_content, + content_needs_reindexing=False, + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector.get("connector_id") if connector else None, + status=DocumentStatus.ready(), + ) + session.add(document) + await session.commit() + await session.refresh(document) + return document + + except SQLAlchemyError as db_error: + await session.rollback() + if "ix_documents_content_hash" in str(db_error): + logging.warning( + "content_hash collision during commit for %s (%s). 
Skipping.", + file_name, + etl_service, + ) + return None + raise db_error + except Exception as e: + await session.rollback() + raise RuntimeError( + f"Failed to process file document using {etl_service}: {e!s}" + ) from e + + +# --------------------------------------------------------------------------- +# Backward-compatible wrapper functions +# --------------------------------------------------------------------------- + + +async def add_received_file_document_using_unstructured( + session: AsyncSession, + file_name: str, + unstructured_processed_elements: list[LangChainDocument], + search_space_id: int, + user_id: str, + connector: dict | None = None, + enable_summary: bool = True, +) -> Document | None: + """Process and store a file document using the Unstructured service.""" + from app.utils.document_converters import convert_document_to_markdown + + markdown_content = await convert_document_to_markdown( + unstructured_processed_elements + ) + return await save_file_document( + session, + file_name, + markdown_content, + search_space_id, + user_id, + "UNSTRUCTURED", + connector, + enable_summary, + ) + + +async def add_received_file_document_using_llamacloud( + session: AsyncSession, + file_name: str, + llamacloud_markdown_document: str, + search_space_id: int, + user_id: str, + connector: dict | None = None, + enable_summary: bool = True, +) -> Document | None: + """Process and store document content parsed by LlamaCloud.""" + return await save_file_document( + session, + file_name, + llamacloud_markdown_document, + search_space_id, + user_id, + "LLAMACLOUD", + connector, + enable_summary, + ) + + +async def add_received_file_document_using_docling( + session: AsyncSession, + file_name: str, + docling_markdown_document: str, + search_space_id: int, + user_id: str, + connector: dict | None = None, + enable_summary: bool = True, +) -> Document | None: + """Process and store document content parsed by Docling.""" + return await save_file_document( + session, + 
file_name, + docling_markdown_document, + search_space_id, + user_id, + "DOCLING", + connector, + enable_summary, + ) diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index 6c0ae1870..0c1cad52d 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -1,905 +1,685 @@ """ -File document processors for different ETL services (Unstructured, LlamaCloud, Docling). +File document processors orchestrating content extraction and indexing. + +This module is the public entry point for file processing. It delegates to +specialised sub-modules that each own a single concern: + +- ``_constants`` — file type classification and configuration constants +- ``_helpers`` — document deduplication, migration, connector helpers +- ``_direct_converters`` — lossless file-to-markdown for csv/tsv/html +- ``_etl`` — ETL parsing strategies (Unstructured, LlamaCloud, Docling) +- ``_save`` — unified document creation / update logic """ -import asyncio +from __future__ import annotations + import contextlib import logging -import ssl -import warnings +import os +from dataclasses import dataclass, field from logging import ERROR, getLogger -import httpx from fastapi import HTTPException -from langchain_core.documents import Document as LangChainDocument -from litellm import atranscription -from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.config import config as app_config -from app.db import Document, DocumentStatus, DocumentType, Log, Notification -from app.indexing_pipeline.adapters.file_upload_adapter import UploadDocumentAdapter -from app.services.llm_service import get_user_long_context_llm +from app.db import Document, Log, Notification from app.services.notification_service import NotificationService from app.services.task_logging_service 
import TaskLoggingService -from app.utils.document_converters import ( - convert_document_to_markdown, - create_document_chunks, - embed_text, - generate_content_hash, - generate_document_summary, - generate_unique_identifier_hash, -) -from .base import ( - check_document_by_unique_identifier, - check_duplicate_document, - get_current_timestamp, - safe_set_chunks, +from ._constants import FileCategory, classify_file +from ._direct_converters import convert_file_directly +from ._etl import ( + parse_with_docling, + parse_with_llamacloud_retry, + parse_with_unstructured, +) +from ._helpers import update_document_from_connector +from ._save import ( + add_received_file_document_using_docling, + add_received_file_document_using_llamacloud, + add_received_file_document_using_unstructured, + save_file_document, ) from .markdown_processor import add_received_markdown_file_document -# Constants for LlamaCloud retry configuration -LLAMACLOUD_MAX_RETRIES = 5 # Increased from 3 for large file resilience -LLAMACLOUD_BASE_DELAY = 10 # Base delay in seconds for exponential backoff -LLAMACLOUD_MAX_DELAY = 120 # Maximum delay between retries (2 minutes) -LLAMACLOUD_RETRYABLE_EXCEPTIONS = ( - ssl.SSLError, - httpx.ConnectError, - httpx.ConnectTimeout, - httpx.ReadTimeout, - httpx.WriteTimeout, - httpx.RemoteProtocolError, - httpx.LocalProtocolError, - ConnectionError, - ConnectionResetError, - TimeoutError, - OSError, # Catches various network-level errors -) - -# Timeout calculation constants -UPLOAD_BYTES_PER_SECOND_SLOW = ( - 100 * 1024 -) # 100 KB/s (conservative for slow connections) -MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file -MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files -BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing -PER_PAGE_JOB_TIMEOUT = 60 # 1 minute per page for processing +# Re-export public API so existing ``from file_processors import …`` keeps working. 
+__all__ = [ + "add_received_file_document_using_docling", + "add_received_file_document_using_llamacloud", + "add_received_file_document_using_unstructured", + "parse_with_llamacloud_retry", + "process_file_in_background", + "process_file_in_background_with_document", + "save_file_document", +] -def get_google_drive_unique_identifier( - connector: dict | None, - filename: str, - search_space_id: int, -) -> tuple[str, str | None]: - """ - Get unique identifier hash for a file, with special handling for Google Drive. - - For Google Drive files, uses file_id as the unique identifier (doesn't change on rename). - For other files, uses filename. - - Args: - connector: Optional connector info dict with type and metadata - filename: The filename (used for non-Google Drive files or as fallback) - search_space_id: The search space ID - - Returns: - Tuple of (primary_hash, legacy_hash or None) - - For Google Drive: (file_id_based_hash, filename_based_hash for migration) - - For other sources: (filename_based_hash, None) - """ - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - metadata = connector.get("metadata", {}) - file_id = metadata.get("google_drive_file_id") - - if file_id: - # New method: use file_id as unique identifier (doesn't change on rename) - primary_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id - ) - # Legacy method: for backward compatibility with existing documents - # that were indexed with filename-based hash - legacy_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id - ) - return primary_hash, legacy_hash - - # For non-Google Drive files, use filename as before - primary_hash = generate_unique_identifier_hash( - DocumentType.FILE, filename, search_space_id - ) - return primary_hash, None +# --------------------------------------------------------------------------- +# Processing context (bundles parameters shared across handler 
functions) +# --------------------------------------------------------------------------- -async def handle_existing_document_update( - session: AsyncSession, - existing_document: Document, - content_hash: str, - connector: dict | None, - filename: str, - primary_hash: str, -) -> tuple[bool, Document | None]: - """ - Handle update logic for an existing document. +@dataclass +class _ProcessingContext: + session: AsyncSession + file_path: str + filename: str + search_space_id: int + user_id: str + task_logger: TaskLoggingService + log_entry: Log + connector: dict | None = None + notification: Notification | None = None + enable_summary: bool = field(init=False) - Args: - session: Database session - existing_document: The existing document found in database - content_hash: Hash of the new content - connector: Optional connector info - filename: Current filename - primary_hash: The primary hash (file_id based for Google Drive) - - Returns: - Tuple of (should_skip_processing, document_to_return) - - (True, document): Content unchanged, just return existing document - - (False, None): Content changed, need to re-process - """ - # Check if this document needs hash migration (found via legacy hash) - if existing_document.unique_identifier_hash != primary_hash: - existing_document.unique_identifier_hash = primary_hash - logging.info(f"Migrated document to file_id-based identifier: {filename}") - - # Check if content has changed - if existing_document.content_hash == content_hash: - # Content unchanged - check if we need to update metadata (e.g., filename changed) - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - connector_metadata = connector.get("metadata", {}) - new_name = connector_metadata.get("google_drive_file_name") - # Check both possible keys for old name (FILE_NAME is used in stored documents) - doc_metadata = existing_document.document_metadata or {} - old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get( - 
"google_drive_file_name" - ) - - if new_name and old_name and old_name != new_name: - # File was renamed - update title and metadata, skip expensive processing - from sqlalchemy.orm.attributes import flag_modified - - existing_document.title = new_name - if not existing_document.document_metadata: - existing_document.document_metadata = {} - existing_document.document_metadata["FILE_NAME"] = new_name - existing_document.document_metadata["google_drive_file_name"] = new_name - flag_modified(existing_document, "document_metadata") - await session.commit() - logging.info( - f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)" - ) - - logging.info(f"Document for file {filename} unchanged. Skipping.") - return True, existing_document - else: - # Content has changed — guard against content_hash collision before - # expensive ETL processing. A collision means the exact same content - # already lives in a *different* document (e.g. a manual upload of the - # same file). Proceeding would trigger a unique-constraint violation - # on ix_documents_content_hash. - collision_doc = await check_duplicate_document(session, content_hash) - if collision_doc and collision_doc.id != existing_document.id: - logging.warning( - "Content-hash collision for %s: identical content exists in " - "document #%s (%s). Skipping re-processing.", - filename, - collision_doc.id, - collision_doc.document_type, - ) - if DocumentStatus.is_state( - existing_document.status, DocumentStatus.PENDING - ) or DocumentStatus.is_state( - existing_document.status, DocumentStatus.PROCESSING - ): - # Pending/processing doc has no real content yet — remove it - # so the UI doesn't show a contentless entry. - await session.delete(existing_document) - await session.commit() - return True, None - - # Document already has valid content — keep it as-is. - return True, existing_document - - logging.info(f"Content changed for file {filename}. 
Updating document.") - return False, None - - -async def find_existing_document_with_migration( - session: AsyncSession, - primary_hash: str, - legacy_hash: str | None, - content_hash: str | None = None, -) -> Document | None: - """ - Find existing document, checking both new hash and legacy hash for migration, - with fallback to content_hash for cross-source deduplication. - - Args: - session: Database session - primary_hash: The primary hash (file_id based for Google Drive) - legacy_hash: The legacy hash (filename based) for migration, or None - content_hash: The content hash for fallback deduplication, or None - - Returns: - Existing document if found, None otherwise - """ - # First check with primary hash (new method) - existing_document = await check_document_by_unique_identifier(session, primary_hash) - - # If not found and we have a legacy hash, check with that (migration path) - if not existing_document and legacy_hash: - existing_document = await check_document_by_unique_identifier( - session, legacy_hash - ) - if existing_document: - logging.info( - "Found legacy document (filename-based hash), will migrate to file_id-based hash" - ) - - # Fallback: check by content_hash to catch duplicates from different sources - # This prevents unique constraint violations when the same content exists - # under a different unique_identifier (e.g., manual upload vs Google Drive) - if not existing_document and content_hash: - existing_document = await check_duplicate_document(session, content_hash) - if existing_document: - logging.info( - f"Found duplicate content from different source (content_hash match). " - f"Original document ID: {existing_document.id}, type: {existing_document.document_type}" - ) - - return existing_document - - -def calculate_upload_timeout(file_size_bytes: int) -> float: - """ - Calculate appropriate upload timeout based on file size. - - Assumes a conservative slow connection speed to handle worst-case scenarios. 
- - Args: - file_size_bytes: Size of the file in bytes - - Returns: - Timeout in seconds - """ - # Calculate time needed at slow connection speed - # Add 50% buffer for network variability and SSL overhead - estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5 - - # Clamp to reasonable bounds - return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT)) - - -def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float: - """ - Calculate job processing timeout based on page count and file size. - - Args: - estimated_pages: Estimated number of pages - file_size_bytes: Size of the file in bytes - - Returns: - Timeout in seconds - """ - # Base timeout + time per page - page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT) - - # Also consider file size (large images take longer to process) - # ~1 minute per 10MB of file size - size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60 - - # Use the larger of the two estimates - return max(page_based_timeout, size_based_timeout) - - -async def parse_with_llamacloud_retry( - file_path: str, - estimated_pages: int, - task_logger: TaskLoggingService | None = None, - log_entry: Log | None = None, -): - """ - Parse a file with LlamaCloud with retry logic for transient SSL/connection errors. - - Uses dynamic timeout calculations based on file size and page count to handle - very large files reliably. 
- - Args: - file_path: Path to the file to parse - estimated_pages: Estimated number of pages for timeout calculation - task_logger: Optional task logger for progress updates - log_entry: Optional log entry for progress updates - - Returns: - LlamaParse result object - - Raises: - Exception: If all retries fail - """ - import os - import random - - from llama_cloud_services import LlamaParse - from llama_cloud_services.parse.utils import ResultType - - # Get file size for timeout calculations - file_size_bytes = os.path.getsize(file_path) - file_size_mb = file_size_bytes / (1024 * 1024) - - # Calculate dynamic timeouts based on file size and page count - upload_timeout = calculate_upload_timeout(file_size_bytes) - job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes) - - # HTTP client timeouts - scaled based on file size - # Write timeout is critical for large file uploads - custom_timeout = httpx.Timeout( - connect=120.0, # 2 minutes to establish connection (handles slow DNS, etc.) 
- read=upload_timeout, # Dynamic based on file size - write=upload_timeout, # Dynamic based on file size (upload time) - pool=120.0, # 2 minutes to acquire connection from pool - ) - - logging.info( - f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, " - f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, " - f"job_timeout={job_timeout:.0f}s" - ) - - last_exception = None - attempt_errors = [] - - for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1): - try: - # Create a fresh httpx client for each attempt - async with httpx.AsyncClient(timeout=custom_timeout) as custom_client: - # Create LlamaParse parser instance with optimized settings - parser = LlamaParse( - api_key=app_config.LLAMA_CLOUD_API_KEY, - num_workers=1, # Use single worker for file processing - verbose=True, - language="en", - result_type=ResultType.MD, - # Timeout settings for large files - max_timeout=int(max(2000, job_timeout + upload_timeout)), - job_timeout_in_seconds=job_timeout, - job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT, - # Use our custom client with larger timeouts - custom_client=custom_client, - ) - - # Parse the file asynchronously - result = await parser.aparse(file_path) - - # Success - log if we had previous failures - if attempt > 1: - logging.info( - f"LlamaCloud upload succeeded on attempt {attempt} after " - f"{len(attempt_errors)} failures" - ) - - return result - - except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e: - last_exception = e - error_type = type(e).__name__ - error_msg = str(e)[:200] - attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}") - - if attempt < LLAMACLOUD_MAX_RETRIES: - # Calculate exponential backoff with jitter - # Base delay doubles each attempt, capped at max delay - base_delay = min( - LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), LLAMACLOUD_MAX_DELAY - ) - # Add random jitter (±25%) to prevent thundering herd - jitter = base_delay * 0.25 * (2 * random.random() - 1) - delay = base_delay + 
jitter - - if task_logger and log_entry: - await task_logger.log_task_progress( - log_entry, - f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), retrying in {delay:.0f}s", - { - "error_type": error_type, - "error_message": error_msg, - "attempt": attempt, - "retry_delay": delay, - "file_size_mb": round(file_size_mb, 1), - "upload_timeout": upload_timeout, - }, - ) - else: - logging.warning( - f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): " - f"{error_type}. File: {file_size_mb:.1f}MB. Retrying in {delay:.0f}s..." - ) - - await asyncio.sleep(delay) - else: - logging.error( - f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} attempts. " - f"File size: {file_size_mb:.1f}MB, Pages: {estimated_pages}. " - f"Errors: {'; '.join(attempt_errors)}" - ) - - except Exception: - # Non-retryable exception, raise immediately - raise - - # All retries exhausted - raise last_exception or RuntimeError( - f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. " - f"File size: {file_size_mb:.1f}MB" - ) - - -async def add_received_file_document_using_unstructured( - session: AsyncSession, - file_name: str, - unstructured_processed_elements: list[LangChainDocument], - search_space_id: int, - user_id: str, - connector: dict | None = None, - enable_summary: bool = True, -) -> Document | None: - """ - Process and store a file document using Unstructured service. 
- - Args: - session: Database session - file_name: Name of the processed file - unstructured_processed_elements: Processed elements from Unstructured - search_space_id: ID of the search space - user_id: ID of the user - connector: Optional connector info for Google Drive files - - Returns: - Document object if successful, None if failed - """ - try: - file_in_markdown = await convert_document_to_markdown( - unstructured_processed_elements + def __post_init__(self) -> None: + self.enable_summary = ( + self.connector.get("enable_summary", True) if self.connector else True ) - # Generate unique identifier hash (uses file_id for Google Drive, filename for others) - primary_hash, legacy_hash = get_google_drive_unique_identifier( - connector, file_name, search_space_id - ) - # Generate content hash - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document exists (with migration support for Google Drive and content_hash fallback) - existing_document = await find_existing_document_with_migration( - session, primary_hash, legacy_hash, content_hash - ) - - if existing_document: - # Handle existing document (rename detection, content change check) - should_skip, doc = await handle_existing_document_update( - session, - existing_document, - content_hash, - connector, - file_name, - primary_hash, - ) - if should_skip: - return doc - # Content changed - continue to update - - # Get user's long context LLM (needed for both create and update) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - if not user_llm: - raise RuntimeError( - f"No long context LLM configured for user {user_id} in search space {search_space_id}" - ) - - # Generate summary with metadata - document_metadata = { - "file_name": file_name, - "etl_service": "UNSTRUCTURED", - "document_type": "File Document", - } - if enable_summary: - summary_content, summary_embedding = await generate_document_summary( - file_in_markdown, user_llm, 
document_metadata - ) - else: - summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}" - summary_embedding = embed_text(summary_content) - - # Process chunks - chunks = await create_document_chunks(file_in_markdown) - - # Update or create document - if existing_document: - # Update existing document - existing_document.title = file_name - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "FILE_NAME": file_name, - "ETL_SERVICE": "UNSTRUCTURED", - } - await safe_set_chunks(session, existing_document, chunks) - existing_document.source_markdown = file_in_markdown - existing_document.content_needs_reindexing = False - existing_document.updated_at = get_current_timestamp() - existing_document.status = DocumentStatus.ready() - - await session.commit() - await session.refresh(existing_document) - document = existing_document - else: - # Create new document - doc_type = DocumentType.FILE - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - doc_type = DocumentType.GOOGLE_DRIVE_FILE - - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=doc_type, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "UNSTRUCTURED", - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - unique_identifier_hash=primary_hash, - source_markdown=file_in_markdown, - content_needs_reindexing=False, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector.get("connector_id") if connector else None, - status=DocumentStatus.ready(), - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - if "ix_documents_content_hash" in str(db_error): - logging.warning( - "content_hash 
collision during commit for %s (Unstructured). Skipping.", - file_name, - ) - return None - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError(f"Failed to process file document: {e!s}") from e +# --------------------------------------------------------------------------- +# Notification helper +# --------------------------------------------------------------------------- -async def add_received_file_document_using_llamacloud( - session: AsyncSession, - file_name: str, - llamacloud_markdown_document: str, - search_space_id: int, - user_id: str, - connector: dict | None = None, - enable_summary: bool = True, -) -> Document | None: - """ - Process and store document content parsed by LlamaCloud. - - Args: - session: Database session - file_name: Name of the processed file - llamacloud_markdown_document: Markdown content from LlamaCloud parsing - search_space_id: ID of the search space - user_id: ID of the user - connector: Optional connector info for Google Drive files - - Returns: - Document object if successful, None if failed - """ - try: - # Combine all markdown documents into one - file_in_markdown = llamacloud_markdown_document - - # Generate unique identifier hash (uses file_id for Google Drive, filename for others) - primary_hash, legacy_hash = get_google_drive_unique_identifier( - connector, file_name, search_space_id - ) - - # Generate content hash - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document exists (with migration support for Google Drive and content_hash fallback) - existing_document = await find_existing_document_with_migration( - session, primary_hash, legacy_hash, content_hash - ) - - if existing_document: - # Handle existing document (rename detection, content change check) - should_skip, doc = await handle_existing_document_update( - session, - existing_document, - content_hash, - connector, - file_name, - primary_hash, - ) - if should_skip: - return doc - # 
Content changed - continue to update - - # Get user's long context LLM (needed for both create and update) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - if not user_llm: - raise RuntimeError( - f"No long context LLM configured for user {user_id} in search space {search_space_id}" - ) - - # Generate summary with metadata - document_metadata = { - "file_name": file_name, - "etl_service": "LLAMACLOUD", - "document_type": "File Document", - } - if enable_summary: - summary_content, summary_embedding = await generate_document_summary( - file_in_markdown, user_llm, document_metadata - ) - else: - summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}" - summary_embedding = embed_text(summary_content) - - # Process chunks - chunks = await create_document_chunks(file_in_markdown) - - # Update or create document - if existing_document: - existing_document.title = file_name - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "FILE_NAME": file_name, - "ETL_SERVICE": "LLAMACLOUD", - } - await safe_set_chunks(session, existing_document, chunks) - existing_document.source_markdown = file_in_markdown - existing_document.content_needs_reindexing = False - existing_document.updated_at = get_current_timestamp() - existing_document.status = DocumentStatus.ready() - - await session.commit() - await session.refresh(existing_document) - document = existing_document - else: - doc_type = DocumentType.FILE - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - doc_type = DocumentType.GOOGLE_DRIVE_FILE - - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=doc_type, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "LLAMACLOUD", - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - 
content_hash=content_hash, - unique_identifier_hash=primary_hash, - source_markdown=file_in_markdown, - content_needs_reindexing=False, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector.get("connector_id") if connector else None, - status=DocumentStatus.ready(), - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - if "ix_documents_content_hash" in str(db_error): - logging.warning( - "content_hash collision during commit for %s (LlamaCloud). Skipping.", - file_name, - ) - return None - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError( - f"Failed to process file document using LlamaCloud: {e!s}" - ) from e - - -async def add_received_file_document_using_docling( - session: AsyncSession, - file_name: str, - docling_markdown_document: str, - search_space_id: int, - user_id: str, - connector: dict | None = None, - enable_summary: bool = True, -) -> Document | None: - """ - Process and store document content parsed by Docling. 
- - Args: - session: Database session - file_name: Name of the processed file - docling_markdown_document: Markdown content from Docling parsing - search_space_id: ID of the search space - user_id: ID of the user - connector: Optional connector info for Google Drive files - - Returns: - Document object if successful, None if failed - """ - try: - file_in_markdown = docling_markdown_document - - # Generate unique identifier hash (uses file_id for Google Drive, filename for others) - primary_hash, legacy_hash = get_google_drive_unique_identifier( - connector, file_name, search_space_id - ) - - # Generate content hash - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document exists (with migration support for Google Drive and content_hash fallback) - existing_document = await find_existing_document_with_migration( - session, primary_hash, legacy_hash, content_hash - ) - - if existing_document: - # Handle existing document (rename detection, content change check) - should_skip, doc = await handle_existing_document_update( - session, - existing_document, - content_hash, - connector, - file_name, - primary_hash, - ) - if should_skip: - return doc - # Content changed - continue to update - - # Get user's long context LLM (needed for both create and update) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - if not user_llm: - raise RuntimeError( - f"No long context LLM configured for user {user_id} in search_space {search_space_id}" - ) - - if enable_summary: - from app.services.docling_service import create_docling_service - - docling_service = create_docling_service() - - summary_content = await docling_service.process_large_document_summary( - content=file_in_markdown, llm=user_llm, document_title=file_name - ) - - document_metadata = { - "file_name": file_name, - "etl_service": "DOCLING", - "document_type": "File Document", - } - metadata_parts = ["# DOCUMENT METADATA"] - for key, value in 
document_metadata.items(): - if value: - formatted_key = key.replace("_", " ").title() - metadata_parts.append(f"**{formatted_key}:** {value}") - - metadata_section = "\n".join(metadata_parts) - enhanced_summary_content = ( - f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}" - ) - else: - enhanced_summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}" - - summary_embedding = embed_text(enhanced_summary_content) - - # Process chunks - chunks = await create_document_chunks(file_in_markdown) - - # Update or create document - if existing_document: - # Update existing document - existing_document.title = file_name - existing_document.content = enhanced_summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "FILE_NAME": file_name, - "ETL_SERVICE": "DOCLING", - } - await safe_set_chunks(session, existing_document, chunks) - existing_document.source_markdown = file_in_markdown - existing_document.content_needs_reindexing = False - existing_document.updated_at = get_current_timestamp() - existing_document.status = DocumentStatus.ready() # Mark as ready - - await session.commit() - await session.refresh(existing_document) - document = existing_document - else: - # Create new document - # Determine document type based on connector - doc_type = DocumentType.FILE - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - doc_type = DocumentType.GOOGLE_DRIVE_FILE - - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=doc_type, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "DOCLING", - }, - content=enhanced_summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - unique_identifier_hash=primary_hash, - source_markdown=file_in_markdown, - content_needs_reindexing=False, - updated_at=get_current_timestamp(), - created_by_id=user_id, - 
connector_id=connector.get("connector_id") if connector else None, - status=DocumentStatus.ready(), # Mark as ready - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - if "ix_documents_content_hash" in str(db_error): - logging.warning( - "content_hash collision during commit for %s (Docling). Skipping.", - file_name, - ) - return None - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError( - f"Failed to process file document using Docling: {e!s}" - ) from e - - -async def _update_document_from_connector( - document: Document | None, connector: dict | None, session: AsyncSession +async def _notify( + ctx: _ProcessingContext, + stage: str, + stage_message: str | None = None, + **kwargs, ) -> None: - """Helper to update document type, metadata, and connector_id from connector info.""" - if document and connector: - if "type" in connector: - document.document_type = connector["type"] - if "metadata" in connector: - # Merge with existing document_metadata (the actual column name) - if not document.document_metadata: - document.document_metadata = connector["metadata"] - else: - # Expand existing metadata with connector metadata - merged = {**document.document_metadata, **connector["metadata"]} - document.document_metadata = merged - # Set connector_id if provided for de-indexing support - if "connector_id" in connector: - document.connector_id = connector["connector_id"] - await session.commit() + """Send a processing-progress notification if one is attached.""" + if not ctx.notification: + return + await NotificationService.document_processing.notify_processing_progress( + ctx.session, + ctx.notification, + stage=stage, + stage_message=stage_message, + **kwargs, + ) + + +# --------------------------------------------------------------------------- +# Page-limit helpers +# 
--------------------------------------------------------------------------- + + +def _estimate_pages_safe(page_limit_service, file_path: str) -> int: + """Estimate page count with a file-size fallback.""" + try: + return page_limit_service.estimate_pages_before_processing(file_path) + except Exception: + file_size = os.path.getsize(file_path) + return max(1, file_size // (80 * 1024)) + + +async def _log_page_divergence( + task_logger: TaskLoggingService, + log_entry: Log, + filename: str, + estimated: int, + actual: int, + final: int, +) -> None: + """Log a warning when the actual page count far exceeds the pre-estimate.""" + if actual > estimated * 1.5: + await task_logger.log_task_progress( + log_entry, + f"Actual page count higher than estimate: {filename}", + { + "estimated_before": estimated, + "actual_pages": actual, + "using_count": final, + }, + ) + + +# =================================================================== +# Handlers for process_file_in_background (legacy / connector path) +# =================================================================== + + +async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None: + """Read a markdown / text file and create or update a document.""" + await _notify(ctx, "parsing", "Reading file") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Processing markdown/text file: {ctx.filename}", + {"file_type": "markdown", "processing_stage": "reading_file"}, + ) + + with open(ctx.file_path, encoding="utf-8") as f: + markdown_content = f.read() + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + await _notify(ctx, "chunking") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Creating document from markdown content: {ctx.filename}", + { + "processing_stage": "creating_document", + "content_length": len(markdown_content), + }, + ) + + result = await add_received_markdown_file_document( + ctx.session, + ctx.filename, + markdown_content, + 
ctx.search_space_id, + ctx.user_id, + ctx.connector, + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + + if result: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully processed markdown file: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "markdown", + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Markdown file already exists (duplicate): {ctx.filename}", + {"duplicate_detected": True, "file_type": "markdown"}, + ) + return result + + +async def _process_direct_convert_upload(ctx: _ProcessingContext) -> Document | None: + """Convert a text-based file (csv/tsv/html) to markdown without ETL.""" + await _notify(ctx, "parsing", "Converting file") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Direct-converting file to markdown: {ctx.filename}", + {"file_type": "direct_convert", "processing_stage": "converting"}, + ) + + markdown_content = convert_file_directly(ctx.file_path, ctx.filename) + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + await _notify(ctx, "chunking") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Creating document from converted content: {ctx.filename}", + { + "processing_stage": "creating_document", + "content_length": len(markdown_content), + }, + ) + + result = await add_received_markdown_file_document( + ctx.session, + ctx.filename, + markdown_content, + ctx.search_space_id, + ctx.user_id, + ctx.connector, + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + + if result: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully direct-converted file: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "direct_convert", + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Direct-converted 
file already exists (duplicate): {ctx.filename}", + {"duplicate_detected": True, "file_type": "direct_convert"}, + ) + return result + + +async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None: + """Transcribe an audio file and create or update a document.""" + await _notify(ctx, "parsing", "Transcribing audio") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Processing audio file for transcription: {ctx.filename}", + {"file_type": "audio", "processing_stage": "starting_transcription"}, + ) + + stt_service_type = ( + "local" + if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") + else "external" + ) + + if stt_service_type == "local": + from app.services.stt_service import stt_service + + try: + stt_result = stt_service.transcribe_file(ctx.file_path) + transcribed_text = stt_result.get("text", "") + if not transcribed_text: + raise ValueError("Transcription returned empty text") + transcribed_text = ( + f"# Transcription of {ctx.filename}\n\n{transcribed_text}" + ) + except Exception as e: + raise HTTPException( + status_code=422, + detail=f"Failed to transcribe audio file {ctx.filename}: {e!s}", + ) from e + + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Local STT transcription completed: {ctx.filename}", + { + "processing_stage": "local_transcription_complete", + "language": stt_result.get("language"), + "confidence": stt_result.get("language_probability"), + "duration": stt_result.get("duration"), + }, + ) + else: + from litellm import atranscription + + with open(ctx.file_path, "rb") as audio_file: + transcription_kwargs: dict = { + "model": app_config.STT_SERVICE, + "file": audio_file, + "api_key": app_config.STT_SERVICE_API_KEY, + } + if app_config.STT_SERVICE_API_BASE: + transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE + + transcription_response = await atranscription(**transcription_kwargs) + transcribed_text = transcription_response.get("text", "") + if not 
transcribed_text: + raise ValueError("Transcription returned empty text") + + transcribed_text = f"# Transcription of {ctx.filename}\n\n{transcribed_text}" + + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Transcription completed, creating document: {ctx.filename}", + { + "processing_stage": "transcription_complete", + "transcript_length": len(transcribed_text), + }, + ) + + await _notify(ctx, "chunking") + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + result = await add_received_markdown_file_document( + ctx.session, + ctx.filename, + transcribed_text, + ctx.search_space_id, + ctx.user_id, + ctx.connector, + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + + if result: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully transcribed and processed audio file: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "audio", + "transcript_length": len(transcribed_text), + "stt_service": stt_service_type, + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Audio file transcript already exists (duplicate): {ctx.filename}", + {"duplicate_detected": True, "file_type": "audio"}, + ) + return result + + +# --------------------------------------------------------------------------- +# Document file processing (ETL service dispatch) +# --------------------------------------------------------------------------- + + +async def _etl_unstructured( + ctx: _ProcessingContext, + page_limit_service, + estimated_pages: int, +) -> Document | None: + """Parse and save via the Unstructured ETL service.""" + await _notify(ctx, "parsing", "Extracting content") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Processing file with Unstructured ETL: {ctx.filename}", + { + "file_type": "document", + "etl_service": "UNSTRUCTURED", + "processing_stage": "loading", + }, + ) + + docs = await 
parse_with_unstructured(ctx.file_path) + + await _notify(ctx, "chunking", chunks_count=len(docs)) + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Unstructured ETL completed, creating document: {ctx.filename}", + {"processing_stage": "etl_complete", "elements_count": len(docs)}, + ) + + actual_pages = page_limit_service.estimate_pages_from_elements(docs) + final_pages = max(estimated_pages, actual_pages) + await _log_page_divergence( + ctx.task_logger, + ctx.log_entry, + ctx.filename, + estimated_pages, + actual_pages, + final_pages, + ) + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + result = await add_received_file_document_using_unstructured( + ctx.session, + ctx.filename, + docs, + ctx.search_space_id, + ctx.user_id, + ctx.connector, + enable_summary=ctx.enable_summary, + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + + if result: + await page_limit_service.update_page_usage( + ctx.user_id, final_pages, allow_exceed=True + ) + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully processed file with Unstructured: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "document", + "etl_service": "UNSTRUCTURED", + "pages_processed": final_pages, + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Document already exists (duplicate): {ctx.filename}", + { + "duplicate_detected": True, + "file_type": "document", + "etl_service": "UNSTRUCTURED", + }, + ) + return result + + +async def _etl_llamacloud( + ctx: _ProcessingContext, + page_limit_service, + estimated_pages: int, +) -> Document | None: + """Parse and save via the LlamaCloud ETL service.""" + await _notify(ctx, "parsing", "Extracting content") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Processing file with LlamaCloud ETL: {ctx.filename}", + { + "file_type": "document", + "etl_service": "LLAMACLOUD", + 
"processing_stage": "parsing", + "estimated_pages": estimated_pages, + }, + ) + + raw_result = await parse_with_llamacloud_retry( + file_path=ctx.file_path, + estimated_pages=estimated_pages, + task_logger=ctx.task_logger, + log_entry=ctx.log_entry, + ) + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + markdown_documents = await raw_result.aget_markdown_documents(split_by_page=False) + + await _notify(ctx, "chunking", chunks_count=len(markdown_documents)) + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"LlamaCloud parsing completed, creating documents: {ctx.filename}", + { + "processing_stage": "parsing_complete", + "documents_count": len(markdown_documents), + }, + ) + + if not markdown_documents: + await ctx.task_logger.log_task_failure( + ctx.log_entry, + f"LlamaCloud parsing returned no documents: {ctx.filename}", + "ETL service returned empty document list", + {"error_type": "EmptyDocumentList", "etl_service": "LLAMACLOUD"}, + ) + raise ValueError(f"LlamaCloud parsing returned no documents for {ctx.filename}") + + actual_pages = page_limit_service.estimate_pages_from_markdown(markdown_documents) + final_pages = max(estimated_pages, actual_pages) + await _log_page_divergence( + ctx.task_logger, + ctx.log_entry, + ctx.filename, + estimated_pages, + actual_pages, + final_pages, + ) + + any_created = False + last_doc: Document | None = None + + for doc in markdown_documents: + doc_result = await add_received_file_document_using_llamacloud( + ctx.session, + ctx.filename, + llamacloud_markdown_document=doc.text, + search_space_id=ctx.search_space_id, + user_id=ctx.user_id, + connector=ctx.connector, + enable_summary=ctx.enable_summary, + ) + if doc_result: + any_created = True + last_doc = doc_result + + if any_created: + await page_limit_service.update_page_usage( + ctx.user_id, final_pages, allow_exceed=True + ) + if ctx.connector: + await update_document_from_connector(last_doc, ctx.connector, ctx.session) + await 
ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully processed file with LlamaCloud: {ctx.filename}", + { + "document_id": last_doc.id, + "content_hash": last_doc.content_hash, + "file_type": "document", + "etl_service": "LLAMACLOUD", + "pages_processed": final_pages, + "documents_count": len(markdown_documents), + }, + ) + return last_doc + + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Document already exists (duplicate): {ctx.filename}", + { + "duplicate_detected": True, + "file_type": "document", + "etl_service": "LLAMACLOUD", + "documents_count": len(markdown_documents), + }, + ) + return None + + +async def _etl_docling( + ctx: _ProcessingContext, + page_limit_service, + estimated_pages: int, +) -> Document | None: + """Parse and save via the Docling ETL service.""" + await _notify(ctx, "parsing", "Extracting content") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Processing file with Docling ETL: {ctx.filename}", + { + "file_type": "document", + "etl_service": "DOCLING", + "processing_stage": "parsing", + }, + ) + + content = await parse_with_docling(ctx.file_path, ctx.filename) + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Docling parsing completed, creating document: {ctx.filename}", + {"processing_stage": "parsing_complete", "content_length": len(content)}, + ) + + actual_pages = page_limit_service.estimate_pages_from_content_length(len(content)) + final_pages = max(estimated_pages, actual_pages) + await _log_page_divergence( + ctx.task_logger, + ctx.log_entry, + ctx.filename, + estimated_pages, + actual_pages, + final_pages, + ) + + await _notify(ctx, "chunking") + + result = await add_received_file_document_using_docling( + ctx.session, + ctx.filename, + docling_markdown_document=content, + search_space_id=ctx.search_space_id, + user_id=ctx.user_id, + connector=ctx.connector, + enable_summary=ctx.enable_summary, + ) + 
+ if result: + await page_limit_service.update_page_usage( + ctx.user_id, final_pages, allow_exceed=True + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully processed file with Docling: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "document", + "etl_service": "DOCLING", + "pages_processed": final_pages, + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Document already exists (duplicate): {ctx.filename}", + { + "duplicate_detected": True, + "file_type": "document", + "etl_service": "DOCLING", + }, + ) + return result + + +async def _process_document_upload(ctx: _ProcessingContext) -> Document | None: + """Route a document file to the configured ETL service.""" + from app.services.page_limit_service import PageLimitExceededError, PageLimitService + + page_limit_service = PageLimitService(ctx.session) + estimated_pages = _estimate_pages_safe(page_limit_service, ctx.file_path) + + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Estimated {estimated_pages} pages for file: {ctx.filename}", + {"estimated_pages": estimated_pages, "file_type": "document"}, + ) + + try: + await page_limit_service.check_page_limit(ctx.user_id, estimated_pages) + except PageLimitExceededError as e: + await ctx.task_logger.log_task_failure( + ctx.log_entry, + f"Page limit exceeded before processing: {ctx.filename}", + str(e), + { + "error_type": "PageLimitExceeded", + "pages_used": e.pages_used, + "pages_limit": e.pages_limit, + "estimated_pages": estimated_pages, + }, + ) + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + raise HTTPException(status_code=403, detail=str(e)) from e + + etl_dispatch = { + "UNSTRUCTURED": _etl_unstructured, + "LLAMACLOUD": _etl_llamacloud, + "DOCLING": _etl_docling, + } + handler = etl_dispatch.get(app_config.ETL_SERVICE) + 
if handler is None: + raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") + + return await handler(ctx, page_limit_service, estimated_pages) + + +# =================================================================== +# Public orchestrators +# =================================================================== async def process_file_in_background( @@ -910,726 +690,35 @@ async def process_file_in_background( session: AsyncSession, task_logger: TaskLoggingService, log_entry: Log, - connector: dict - | None = None, # Optional: {"type": "GOOGLE_DRIVE_FILE", "metadata": {...}} - notification: Notification - | None = None, # Optional notification for progress updates + connector: dict | None = None, + notification: Notification | None = None, ) -> Document | None: + ctx = _ProcessingContext( + session=session, + file_path=file_path, + filename=filename, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + connector=connector, + notification=notification, + ) + try: - # Check if the file is a markdown or text file - if filename.lower().endswith((".md", ".markdown", ".txt")): - # Update notification: parsing stage - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Reading file", - ) - ) + category = classify_file(filename) - await task_logger.log_task_progress( - log_entry, - f"Processing markdown/text file: {filename}", - {"file_type": "markdown", "processing_stage": "reading_file"}, - ) + if category == FileCategory.MARKDOWN: + return await _process_markdown_upload(ctx) + if category == FileCategory.DIRECT_CONVERT: + return await _process_direct_convert_upload(ctx) + if category == FileCategory.AUDIO: + return await _process_audio_upload(ctx) + return await _process_document_upload(ctx) - # For markdown files, read the content directly - with open(file_path, encoding="utf-8") as f: - 
markdown_content = f.read() - - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - # Update notification: chunking stage - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Creating document from markdown content: {filename}", - { - "processing_stage": "creating_document", - "content_length": len(markdown_content), - }, - ) - - # Process markdown directly through specialized function - result = await add_received_markdown_file_document( - session, filename, markdown_content, search_space_id, user_id, connector - ) - - if connector: - await _update_document_from_connector(result, connector, session) - - if result: - await task_logger.log_task_success( - log_entry, - f"Successfully processed markdown file: {filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "markdown", - }, - ) - return result - else: - await task_logger.log_task_success( - log_entry, - f"Markdown file already exists (duplicate): {filename}", - {"duplicate_detected": True, "file_type": "markdown"}, - ) - return None - - # Check if the file is an audio file - elif filename.lower().endswith( - (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm") - ): - # Update notification: parsing stage (transcription) - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Transcribing audio", - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing audio file for transcription: {filename}", - {"file_type": "audio", "processing_stage": "starting_transcription"}, - ) - - # Determine STT service type - stt_service_type = ( - "local" - if app_config.STT_SERVICE - and 
app_config.STT_SERVICE.startswith("local/") - else "external" - ) - - # Check if using local STT service - if stt_service_type == "local": - # Use local Faster-Whisper for transcription - from app.services.stt_service import stt_service - - try: - result = stt_service.transcribe_file(file_path) - transcribed_text = result.get("text", "") - - if not transcribed_text: - raise ValueError("Transcription returned empty text") - - # Add metadata about the transcription - transcribed_text = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - except Exception as e: - raise HTTPException( - status_code=422, - detail=f"Failed to transcribe audio file {filename}: {e!s}", - ) from e - - await task_logger.log_task_progress( - log_entry, - f"Local STT transcription completed: {filename}", - { - "processing_stage": "local_transcription_complete", - "language": result.get("language"), - "confidence": result.get("language_probability"), - "duration": result.get("duration"), - }, - ) - else: - # Use LiteLLM for audio transcription - with open(file_path, "rb") as audio_file: - transcription_kwargs = { - "model": app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - transcription_kwargs["api_base"] = ( - app_config.STT_SERVICE_API_BASE - ) - - transcription_response = await atranscription( - **transcription_kwargs - ) - - # Extract the transcribed text - transcribed_text = transcription_response.get("text", "") - - if not transcribed_text: - raise ValueError("Transcription returned empty text") - - # Add metadata about the transcription - transcribed_text = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - - await task_logger.log_task_progress( - log_entry, - f"Transcription completed, creating document: {filename}", - { - "processing_stage": "transcription_complete", - "transcript_length": len(transcribed_text), - }, - ) - - # Update notification: chunking stage - if 
notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" - ) - ) - - # Clean up the temp file - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - # Process transcription as markdown document - result = await add_received_markdown_file_document( - session, filename, transcribed_text, search_space_id, user_id, connector - ) - - if connector: - await _update_document_from_connector(result, connector, session) - - if result: - await task_logger.log_task_success( - log_entry, - f"Successfully transcribed and processed audio file: {filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "audio", - "transcript_length": len(transcribed_text), - "stt_service": stt_service_type, - }, - ) - return result - else: - await task_logger.log_task_success( - log_entry, - f"Audio file transcript already exists (duplicate): {filename}", - {"duplicate_detected": True, "file_type": "audio"}, - ) - return None - - else: - # Import page limit service - from app.services.page_limit_service import ( - PageLimitExceededError, - PageLimitService, - ) - - # Initialize page limit service - page_limit_service = PageLimitService(session) - - # CRITICAL: Estimate page count BEFORE making expensive ETL API calls - # This prevents users from incurring costs on files that would exceed their limit - try: - estimated_pages_before = ( - page_limit_service.estimate_pages_before_processing(file_path) - ) - except Exception: - # If estimation fails, use a conservative estimate based on file size - import os - - file_size = os.path.getsize(file_path) - estimated_pages_before = max( - 1, file_size // (80 * 1024) - ) # ~80KB per page - - await task_logger.log_task_progress( - log_entry, - f"Estimated {estimated_pages_before} pages for file: {filename}", - { - "estimated_pages": estimated_pages_before, - "file_type": "document", - }, - ) - 
- # Check page limit BEFORE calling ETL service to avoid unnecessary costs - try: - await page_limit_service.check_page_limit( - user_id, estimated_pages_before - ) - except PageLimitExceededError as e: - await task_logger.log_task_failure( - log_entry, - f"Page limit exceeded before processing: {filename}", - str(e), - { - "error_type": "PageLimitExceeded", - "pages_used": e.pages_used, - "pages_limit": e.pages_limit, - "estimated_pages": estimated_pages_before, - }, - ) - # Clean up the temp file - import os - - with contextlib.suppress(Exception): - os.unlink(file_path) - - raise HTTPException( - status_code=403, - detail=str(e), - ) from e - - if app_config.ETL_SERVICE == "UNSTRUCTURED": - # Update notification: parsing stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing file with Unstructured ETL: {filename}", - { - "file_type": "document", - "etl_service": "UNSTRUCTURED", - "processing_stage": "loading", - }, - ) - - from langchain_unstructured import UnstructuredLoader - - # Process the file - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) - - docs = await loader.aload() - - # Update notification: chunking stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking", chunks_count=len(docs) - ) - - await task_logger.log_task_progress( - log_entry, - f"Unstructured ETL completed, creating document: {filename}", - {"processing_stage": "etl_complete", "elements_count": len(docs)}, - ) - - # Verify actual page count from parsed documents - actual_pages = page_limit_service.estimate_pages_from_elements(docs) - - # Use the higher of the two 
estimates for safety (in case pre-estimate was too low) - final_page_count = max(estimated_pages_before, actual_pages) - - # If actual is significantly higher than estimate, log a warning - if actual_pages > estimated_pages_before * 1.5: - await task_logger.log_task_progress( - log_entry, - f"Actual page count higher than estimate: {filename}", - { - "estimated_before": estimated_pages_before, - "actual_pages": actual_pages, - "using_count": final_page_count, - }, - ) - - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - enable_summary = ( - connector.get("enable_summary", True) if connector else True - ) - result = await add_received_file_document_using_unstructured( - session, - filename, - docs, - search_space_id, - user_id, - connector, - enable_summary=enable_summary, - ) - - if connector: - await _update_document_from_connector(result, connector, session) - - if result: - # Update page usage after successful processing - # allow_exceed=True because document was already created after passing initial check - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - await task_logger.log_task_success( - log_entry, - f"Successfully processed file with Unstructured: {filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "document", - "etl_service": "UNSTRUCTURED", - "pages_processed": final_page_count, - }, - ) - return result - else: - await task_logger.log_task_success( - log_entry, - f"Document already exists (duplicate): {filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "UNSTRUCTURED", - }, - ) - return None - - elif app_config.ETL_SERVICE == "LLAMACLOUD": - # Update notification: parsing stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - 
stage_message="Extracting content", - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing file with LlamaCloud ETL: {filename}", - { - "file_type": "document", - "etl_service": "LLAMACLOUD", - "processing_stage": "parsing", - "estimated_pages": estimated_pages_before, - }, - ) - - # Parse file with retry logic for SSL/connection errors (common with large files) - result = await parse_with_llamacloud_retry( - file_path=file_path, - estimated_pages=estimated_pages_before, - task_logger=task_logger, - log_entry=log_entry, - ) - - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - # Get markdown documents from the result - markdown_documents = await result.aget_markdown_documents( - split_by_page=False - ) - - # Update notification: chunking stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="chunking", - chunks_count=len(markdown_documents), - ) - - await task_logger.log_task_progress( - log_entry, - f"LlamaCloud parsing completed, creating documents: {filename}", - { - "processing_stage": "parsing_complete", - "documents_count": len(markdown_documents), - }, - ) - - # Check if LlamaCloud returned any documents - if not markdown_documents or len(markdown_documents) == 0: - await task_logger.log_task_failure( - log_entry, - f"LlamaCloud parsing returned no documents: {filename}", - "ETL service returned empty document list", - { - "error_type": "EmptyDocumentList", - "etl_service": "LLAMACLOUD", - }, - ) - raise ValueError( - f"LlamaCloud parsing returned no documents for {filename}" - ) - - # Verify actual page count from parsed markdown documents - actual_pages = page_limit_service.estimate_pages_from_markdown( - markdown_documents - ) - - # Use the higher of the two estimates for safety (in case pre-estimate was too low) - final_page_count = max(estimated_pages_before, 
actual_pages) - - # If actual is significantly higher than estimate, log a warning - if actual_pages > estimated_pages_before * 1.5: - await task_logger.log_task_progress( - log_entry, - f"Actual page count higher than estimate: {filename}", - { - "estimated_before": estimated_pages_before, - "actual_pages": actual_pages, - "using_count": final_page_count, - }, - ) - - # Track if any document was successfully created (not a duplicate) - any_doc_created = False - last_created_doc = None - - for doc in markdown_documents: - # Extract text content from the markdown documents - markdown_content = doc.text - - enable_summary = ( - connector.get("enable_summary", True) if connector else True - ) - doc_result = await add_received_file_document_using_llamacloud( - session, - filename, - llamacloud_markdown_document=markdown_content, - search_space_id=search_space_id, - user_id=user_id, - connector=connector, - enable_summary=enable_summary, - ) - - # Track if this document was successfully created - if doc_result: - any_doc_created = True - last_created_doc = doc_result - - # Update page usage once after processing all documents - # Only update if at least one document was created (not all duplicates) - if any_doc_created: - # Update page usage after successful processing - # allow_exceed=True because document was already created after passing initial check - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - if connector: - await _update_document_from_connector( - last_created_doc, connector, session - ) - - await task_logger.log_task_success( - log_entry, - f"Successfully processed file with LlamaCloud: {filename}", - { - "document_id": last_created_doc.id, - "content_hash": last_created_doc.content_hash, - "file_type": "document", - "etl_service": "LLAMACLOUD", - "pages_processed": final_page_count, - "documents_count": len(markdown_documents), - }, - ) - return last_created_doc - else: - # All documents were duplicates 
(markdown_documents was not empty, but all returned None) - await task_logger.log_task_success( - log_entry, - f"Document already exists (duplicate): {filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "LLAMACLOUD", - "documents_count": len(markdown_documents), - }, - ) - return None - - elif app_config.ETL_SERVICE == "DOCLING": - # Update notification: parsing stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing file with Docling ETL: {filename}", - { - "file_type": "document", - "etl_service": "DOCLING", - "processing_stage": "parsing", - }, - ) - - # Use Docling service for document processing - from app.services.docling_service import create_docling_service - - # Create Docling service - docling_service = create_docling_service() - - # Suppress pdfminer warnings that can cause processing to hang - # These warnings are harmless but can spam logs and potentially halt processing - # Suppress both Python warnings and logging warnings from pdfminer - pdfminer_logger = getLogger("pdfminer") - original_level = pdfminer_logger.level - - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", category=UserWarning, module="pdfminer" - ) - warnings.filterwarnings( - "ignore", - message=".*Cannot set gray non-stroke color.*", - ) - warnings.filterwarnings("ignore", message=".*invalid float value.*") - - # Temporarily suppress pdfminer logging warnings - pdfminer_logger.setLevel(ERROR) - - try: - # Process the document - result = await docling_service.process_document( - file_path, filename - ) - finally: - # Restore original logging level - pdfminer_logger.setLevel(original_level) - - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - 
pass - - await task_logger.log_task_progress( - log_entry, - f"Docling parsing completed, creating document: {filename}", - { - "processing_stage": "parsing_complete", - "content_length": len(result["content"]), - }, - ) - - # Verify actual page count from content length - actual_pages = page_limit_service.estimate_pages_from_content_length( - len(result["content"]) - ) - - # Use the higher of the two estimates for safety (in case pre-estimate was too low) - final_page_count = max(estimated_pages_before, actual_pages) - - # If actual is significantly higher than estimate, log a warning - if actual_pages > estimated_pages_before * 1.5: - await task_logger.log_task_progress( - log_entry, - f"Actual page count higher than estimate: {filename}", - { - "estimated_before": estimated_pages_before, - "actual_pages": actual_pages, - "using_count": final_page_count, - }, - ) - - # Update notification: chunking stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" - ) - - enable_summary = ( - connector.get("enable_summary", True) if connector else True - ) - doc_result = await add_received_file_document_using_docling( - session, - filename, - docling_markdown_document=result["content"], - search_space_id=search_space_id, - user_id=user_id, - connector=connector, - enable_summary=enable_summary, - ) - - if doc_result: - # Update page usage after successful processing - # allow_exceed=True because document was already created after passing initial check - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - if connector: - await _update_document_from_connector( - doc_result, connector, session - ) - - await task_logger.log_task_success( - log_entry, - f"Successfully processed file with Docling: {filename}", - { - "document_id": doc_result.id, - "content_hash": doc_result.content_hash, - "file_type": "document", - "etl_service": "DOCLING", - 
"pages_processed": final_page_count, - }, - ) - return doc_result - else: - await task_logger.log_task_success( - log_entry, - f"Document already exists (duplicate): {filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "DOCLING", - }, - ) - return None except Exception as e: await session.rollback() - # For page limit errors, use the detailed message from the exception from app.services.page_limit_service import PageLimitExceededError if isinstance(e, PageLimitExceededError): @@ -1645,10 +734,225 @@ async def process_file_in_background( str(e), {"error_type": type(e).__name__, "filename": filename}, ) - import logging - logging.error(f"Error processing file in background: {error_message}") - raise # Re-raise so the wrapper can also handle it + raise + + +# =================================================================== +# 2-phase handler (process_file_in_background_with_document) +# =================================================================== + + +async def _extract_file_content( + file_path: str, + filename: str, + session: AsyncSession, + user_id: str, + task_logger: TaskLoggingService, + log_entry: Log, + notification: Notification | None, +) -> tuple[str, str]: + """ + Extract markdown content from a file regardless of type. + + Returns: + Tuple of (markdown_content, etl_service_name). 
+ """ + category = classify_file(filename) + + if category == FileCategory.MARKDOWN: + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Reading file", + ) + await task_logger.log_task_progress( + log_entry, + f"Processing markdown/text file: {filename}", + {"file_type": "markdown", "processing_stage": "reading_file"}, + ) + with open(file_path, encoding="utf-8") as f: + content = f.read() + with contextlib.suppress(Exception): + os.unlink(file_path) + return content, "MARKDOWN" + + if category == FileCategory.DIRECT_CONVERT: + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Converting file", + ) + await task_logger.log_task_progress( + log_entry, + f"Direct-converting file to markdown: {filename}", + {"file_type": "direct_convert", "processing_stage": "converting"}, + ) + content = convert_file_directly(file_path, filename) + with contextlib.suppress(Exception): + os.unlink(file_path) + return content, "DIRECT_CONVERT" + + if category == FileCategory.AUDIO: + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Transcribing audio", + ) + await task_logger.log_task_progress( + log_entry, + f"Processing audio file for transcription: {filename}", + {"file_type": "audio", "processing_stage": "starting_transcription"}, + ) + transcribed_text = await _transcribe_audio(file_path, filename) + with contextlib.suppress(Exception): + os.unlink(file_path) + return transcribed_text, "AUDIO_TRANSCRIPTION" + + # Document file — use ETL service + return await _extract_document_content( + file_path, + filename, + session, + user_id, + task_logger, + log_entry, + notification, + ) + + +async def _transcribe_audio(file_path: str, filename: str) -> str: + """Transcribe an 
audio file and return formatted markdown text.""" + stt_service_type = ( + "local" + if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") + else "external" + ) + + if stt_service_type == "local": + from app.services.stt_service import stt_service + + result = stt_service.transcribe_file(file_path) + text = result.get("text", "") + if not text: + raise ValueError("Transcription returned empty text") + else: + from litellm import atranscription + + with open(file_path, "rb") as audio_file: + kwargs: dict = { + "model": app_config.STT_SERVICE, + "file": audio_file, + "api_key": app_config.STT_SERVICE_API_KEY, + } + if app_config.STT_SERVICE_API_BASE: + kwargs["api_base"] = app_config.STT_SERVICE_API_BASE + response = await atranscription(**kwargs) + text = response.get("text", "") + if not text: + raise ValueError("Transcription returned empty text") + + return f"# Transcription of {filename}\n\n{text}" + + +async def _extract_document_content( + file_path: str, + filename: str, + session: AsyncSession, + user_id: str, + task_logger: TaskLoggingService, + log_entry: Log, + notification: Notification | None, +) -> tuple[str, str]: + """ + Parse a document file via the configured ETL service. + + Returns: + Tuple of (markdown_content, etl_service_name). 
+ """ + from app.services.page_limit_service import PageLimitService + + page_limit_service = PageLimitService(session) + + try: + estimated_pages = page_limit_service.estimate_pages_before_processing(file_path) + except Exception: + file_size = os.path.getsize(file_path) + estimated_pages = max(1, file_size // (80 * 1024)) + + await page_limit_service.check_page_limit(user_id, estimated_pages) + + etl_service = app_config.ETL_SERVICE + markdown_content: str | None = None + + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Extracting content", + ) + + if etl_service == "UNSTRUCTURED": + from app.utils.document_converters import convert_document_to_markdown + + docs = await parse_with_unstructured(file_path) + markdown_content = await convert_document_to_markdown(docs) + actual_pages = page_limit_service.estimate_pages_from_elements(docs) + final_pages = max(estimated_pages, actual_pages) + await page_limit_service.update_page_usage( + user_id, final_pages, allow_exceed=True + ) + + elif etl_service == "LLAMACLOUD": + raw_result = await parse_with_llamacloud_retry( + file_path=file_path, + estimated_pages=estimated_pages, + task_logger=task_logger, + log_entry=log_entry, + ) + markdown_documents = await raw_result.aget_markdown_documents( + split_by_page=False + ) + if not markdown_documents: + raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}") + markdown_content = markdown_documents[0].text + await page_limit_service.update_page_usage( + user_id, estimated_pages, allow_exceed=True + ) + + elif etl_service == "DOCLING": + getLogger("docling.pipeline.base_pipeline").setLevel(ERROR) + getLogger("docling.document_converter").setLevel(ERROR) + getLogger("docling_core.transforms.chunker.hierarchical_chunker").setLevel( + ERROR + ) + + from docling.document_converter import DocumentConverter + + converter = DocumentConverter() + result = 
converter.convert(file_path) + markdown_content = result.document.export_to_markdown() + await page_limit_service.update_page_usage( + user_id, estimated_pages, allow_exceed=True + ) + + else: + raise RuntimeError(f"Unknown ETL_SERVICE: {etl_service}") + + with contextlib.suppress(Exception): + os.unlink(file_path) + + if not markdown_content: + raise RuntimeError(f"Failed to extract content from file: {filename}") + + return markdown_content, etl_service async def process_file_in_background_with_document( @@ -1667,272 +971,50 @@ async def process_file_in_background_with_document( """ Process file and update existing pending document (2-phase pattern). - This function is Phase 2 of the real-time document status updates: - - Phase 1 (API): Created document with pending status - - Phase 2 (this): Process file and update document to ready/failed - - The document already exists with pending status. This function: - 1. Parses the file content (markdown, audio, or ETL services) - 2. Updates the document with content, embeddings, and chunks - 3. Sets status to 'ready' on success - - Args: - document: Existing document with pending status - file_path: Path to the uploaded file - filename: Original filename - search_space_id: ID of the search space - user_id: ID of the user - session: Database session - task_logger: Task logging service - log_entry: Log entry for this task - connector: Optional connector info for Google Drive files - notification: Optional notification for progress updates - - Returns: - Updated Document object if successful, None if duplicate content detected + Phase 1 (API layer): Created document with pending status. + Phase 2 (this function): Process file and update document to ready/failed. 
""" - import os - - from app.config import config as app_config + from app.indexing_pipeline.adapters.file_upload_adapter import ( + UploadDocumentAdapter, + ) from app.services.llm_service import get_user_long_context_llm + from app.utils.document_converters import generate_content_hash + + from .base import check_duplicate_document doc_id = document.id try: - markdown_content = None - etl_service = None - - # ===== STEP 1: Parse file content based on type ===== - - # Check if the file is a markdown or text file - if filename.lower().endswith((".md", ".markdown", ".txt")): - # Update notification: parsing stage - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Reading file", - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing markdown/text file: {filename}", - {"file_type": "markdown", "processing_stage": "reading_file"}, - ) - - # Read markdown content directly - with open(file_path, encoding="utf-8") as f: - markdown_content = f.read() - etl_service = "MARKDOWN" - - # Clean up temp file - with contextlib.suppress(Exception): - os.unlink(file_path) - - # Check if the file is an audio file - elif filename.lower().endswith( - (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm") - ): - # Update notification: parsing stage (transcription) - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Transcribing audio", - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing audio file for transcription: {filename}", - {"file_type": "audio", "processing_stage": "starting_transcription"}, - ) - - # Transcribe audio - stt_service_type = ( - "local" - if app_config.STT_SERVICE - and app_config.STT_SERVICE.startswith("local/") - else "external" - ) - - if stt_service_type == "local": - from 
app.services.stt_service import stt_service - - result = stt_service.transcribe_file(file_path) - transcribed_text = result.get("text", "") - if not transcribed_text: - raise ValueError("Transcription returned empty text") - markdown_content = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - else: - with open(file_path, "rb") as audio_file: - transcription_kwargs = { - "model": app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - transcription_kwargs["api_base"] = ( - app_config.STT_SERVICE_API_BASE - ) - transcription_response = await atranscription( - **transcription_kwargs - ) - transcribed_text = transcription_response.get("text", "") - if not transcribed_text: - raise ValueError("Transcription returned empty text") - markdown_content = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - - etl_service = "AUDIO_TRANSCRIPTION" - # Clean up temp file - with contextlib.suppress(Exception): - os.unlink(file_path) - - else: - # Document files - use ETL service - from app.services.page_limit_service import ( - PageLimitExceededError, - PageLimitService, - ) - - page_limit_service = PageLimitService(session) - - # Estimate page count - try: - estimated_pages = page_limit_service.estimate_pages_before_processing( - file_path - ) - except Exception: - file_size = os.path.getsize(file_path) - estimated_pages = max(1, file_size // (80 * 1024)) - - # Check page limit - await page_limit_service.check_page_limit(user_id, estimated_pages) - - if app_config.ETL_SERVICE == "UNSTRUCTURED": - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - from langchain_unstructured import UnstructuredLoader - - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - 
include_metadata=False, - strategy="auto", - ) - docs = await loader.aload() - markdown_content = await convert_document_to_markdown(docs) - actual_pages = page_limit_service.estimate_pages_from_elements(docs) - final_page_count = max(estimated_pages, actual_pages) - etl_service = "UNSTRUCTURED" - - # Update page usage - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - elif app_config.ETL_SERVICE == "LLAMACLOUD": - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - result = await parse_with_llamacloud_retry( - file_path=file_path, - estimated_pages=estimated_pages, - task_logger=task_logger, - log_entry=log_entry, - ) - markdown_documents = await result.aget_markdown_documents( - split_by_page=False - ) - if not markdown_documents: - raise RuntimeError( - f"LlamaCloud parsing returned no documents: {filename}" - ) - markdown_content = markdown_documents[0].text - etl_service = "LLAMACLOUD" - - # Update page usage - await page_limit_service.update_page_usage( - user_id, estimated_pages, allow_exceed=True - ) - - elif app_config.ETL_SERVICE == "DOCLING": - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - # Suppress logging during Docling import - getLogger("docling.pipeline.base_pipeline").setLevel(ERROR) - getLogger("docling.document_converter").setLevel(ERROR) - getLogger( - "docling_core.transforms.chunker.hierarchical_chunker" - ).setLevel(ERROR) - - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - result = converter.convert(file_path) - markdown_content = result.document.export_to_markdown() - etl_service = "DOCLING" - - # Update page usage - await page_limit_service.update_page_usage( - user_id, 
estimated_pages, allow_exceed=True - ) - - else: - raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") - - # Clean up temp file - with contextlib.suppress(Exception): - os.unlink(file_path) + # Step 1: extract content + markdown_content, etl_service = await _extract_file_content( + file_path, + filename, + session, + user_id, + task_logger, + log_entry, + notification, + ) if not markdown_content: raise RuntimeError(f"Failed to extract content from file: {filename}") - # ===== STEP 2: Check for duplicate content ===== + # Step 2: duplicate check content_hash = generate_content_hash(markdown_content, search_space_id) - existing_by_content = await check_duplicate_document(session, content_hash) if existing_by_content and existing_by_content.id != doc_id: - # Duplicate content found - mark this document as failed logging.info( f"Duplicate content detected for {filename}, " f"matches document {existing_by_content.id}" ) return None - # ===== STEP 3+4: Index via pipeline ===== + # Step 3: index via pipeline if notification: await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" + session, + notification, + stage="chunking", ) user_llm = await get_user_long_context_llm(session, user_id, search_space_id) @@ -1957,7 +1039,6 @@ async def process_file_in_background_with_document( "file_type": etl_service, }, ) - return document except Exception as e: diff --git a/surfsense_backend/app/tasks/document_processors/markdown_processor.py b/surfsense_backend/app/tasks/document_processors/markdown_processor.py index 2fb711bf8..0ff340c0e 100644 --- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py +++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py @@ -14,88 +14,19 @@ from app.utils.document_converters import ( create_document_chunks, generate_content_hash, generate_document_summary, - generate_unique_identifier_hash, ) +from ._helpers import ( + 
find_existing_document_with_migration, + get_google_drive_unique_identifier, +) from .base import ( - check_document_by_unique_identifier, check_duplicate_document, get_current_timestamp, safe_set_chunks, ) -def _get_google_drive_unique_identifier( - connector: dict | None, - filename: str, - search_space_id: int, -) -> tuple[str, str | None]: - """ - Get unique identifier hash for a file, with special handling for Google Drive. - - For Google Drive files, uses file_id as the unique identifier (doesn't change on rename). - For other files, uses filename. - - Args: - connector: Optional connector info dict with type and metadata - filename: The filename (used for non-Google Drive files or as fallback) - search_space_id: The search space ID - - Returns: - Tuple of (primary_hash, legacy_hash or None) - """ - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - metadata = connector.get("metadata", {}) - file_id = metadata.get("google_drive_file_id") - - if file_id: - primary_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id - ) - legacy_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id - ) - return primary_hash, legacy_hash - - primary_hash = generate_unique_identifier_hash( - DocumentType.FILE, filename, search_space_id - ) - return primary_hash, None - - -async def _find_existing_document_with_migration( - session: AsyncSession, - primary_hash: str, - legacy_hash: str | None, - content_hash: str | None = None, -) -> Document | None: - """ - Find existing document, checking both new hash and legacy hash for migration, - with fallback to content_hash for cross-source deduplication. 
- """ - existing_document = await check_document_by_unique_identifier(session, primary_hash) - - if not existing_document and legacy_hash: - existing_document = await check_document_by_unique_identifier( - session, legacy_hash - ) - if existing_document: - logging.info( - "Found legacy document (filename-based hash), will migrate to file_id-based hash" - ) - - # Fallback: check by content_hash to catch duplicates from different sources - if not existing_document and content_hash: - existing_document = await check_duplicate_document(session, content_hash) - if existing_document: - logging.info( - f"Found duplicate content from different source (content_hash match). " - f"Original document ID: {existing_document.id}, type: {existing_document.document_type}" - ) - - return existing_document - - async def _handle_existing_document_update( session: AsyncSession, existing_document: Document, @@ -224,7 +155,7 @@ async def add_received_markdown_file_document( try: # Generate unique identifier hash (uses file_id for Google Drive, filename for others) - primary_hash, legacy_hash = _get_google_drive_unique_identifier( + primary_hash, legacy_hash = get_google_drive_unique_identifier( connector, file_name, search_space_id ) @@ -232,7 +163,7 @@ async def add_received_markdown_file_document( content_hash = generate_content_hash(file_in_markdown, search_space_id) # Check if document exists (with migration support for Google Drive and content_hash fallback) - existing_document = await _find_existing_document_with_migration( + existing_document = await find_existing_document_with_migration( session, primary_hash, legacy_hash, content_hash ) diff --git a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py index a8dab43f0..a56398baa 100644 --- a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py +++ 
b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py @@ -2,12 +2,11 @@ Integration tests for backend file upload limit enforcement. These tests verify that the API rejects uploads that exceed: - - Max files per upload (10) - - Max per-file size (50 MB) - - Max total upload size (200 MB) + - Max per-file size (500 MB) -The limits mirror the frontend's DocumentUploadTab.tsx constants and are -enforced server-side to protect against direct API calls. +No file count or total size limits are enforced — the frontend batches +uploads in groups of 5 and there is no cap on how many files a user can +upload in a single session. Prerequisites: - PostgreSQL + pgvector @@ -24,60 +23,12 @@ pytestmark = pytest.mark.integration # --------------------------------------------------------------------------- -# Test A: File count limit -# --------------------------------------------------------------------------- - - -class TestFileCountLimit: - """Uploading more than 10 files in a single request should be rejected.""" - - async def test_11_files_returns_413( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - ): - files = [ - ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain")) - for i in range(11) - ] - resp = await client.post( - "/api/v1/documents/fileupload", - headers=headers, - files=files, - data={"search_space_id": str(search_space_id)}, - ) - assert resp.status_code == 413 - assert "too many files" in resp.json()["detail"].lower() - - async def test_10_files_accepted( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - cleanup_doc_ids: list[int], - ): - files = [ - ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain")) - for i in range(10) - ] - resp = await client.post( - "/api/v1/documents/fileupload", - headers=headers, - files=files, - data={"search_space_id": str(search_space_id)}, - ) - assert resp.status_code == 200 - 
cleanup_doc_ids.extend(resp.json().get("document_ids", [])) - - -# --------------------------------------------------------------------------- -# Test B: Per-file size limit +# Test: Per-file size limit (500 MB) # --------------------------------------------------------------------------- class TestPerFileSizeLimit: - """A single file exceeding 50 MB should be rejected.""" + """A single file exceeding 500 MB should be rejected.""" async def test_oversized_file_returns_413( self, @@ -85,7 +36,7 @@ class TestPerFileSizeLimit: headers: dict[str, str], search_space_id: int, ): - oversized = io.BytesIO(b"\x00" * (50 * 1024 * 1024 + 1)) + oversized = io.BytesIO(b"\x00" * (500 * 1024 * 1024 + 1)) resp = await client.post( "/api/v1/documents/fileupload", headers=headers, @@ -102,11 +53,11 @@ class TestPerFileSizeLimit: search_space_id: int, cleanup_doc_ids: list[int], ): - at_limit = io.BytesIO(b"\x00" * (50 * 1024 * 1024)) + at_limit = io.BytesIO(b"\x00" * (500 * 1024 * 1024)) resp = await client.post( "/api/v1/documents/fileupload", headers=headers, - files=[("files", ("exact50mb.txt", at_limit, "text/plain"))], + files=[("files", ("exact500mb.txt", at_limit, "text/plain"))], data={"search_space_id": str(search_space_id)}, ) assert resp.status_code == 200 @@ -114,26 +65,23 @@ class TestPerFileSizeLimit: # --------------------------------------------------------------------------- -# Test C: Total upload size limit +# Test: Multiple files accepted without count limit # --------------------------------------------------------------------------- -class TestTotalSizeLimit: - """Multiple files whose combined size exceeds 200 MB should be rejected.""" +class TestNoFileCountLimit: + """Many files in a single request should be accepted.""" - async def test_total_size_over_200mb_returns_413( + async def test_many_files_accepted( self, client: httpx.AsyncClient, headers: dict[str, str], search_space_id: int, + cleanup_doc_ids: list[int], ): - chunk_size = 45 * 1024 * 1024 # 45 MB 
each files = [ - ( - "files", - (f"chunk_{i}.txt", io.BytesIO(b"\x00" * chunk_size), "text/plain"), - ) - for i in range(5) # 5 x 45 MB = 225 MB > 200 MB + ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain")) + for i in range(20) ] resp = await client.post( "/api/v1/documents/fileupload", @@ -141,5 +89,5 @@ class TestTotalSizeLimit: files=files, data={"search_space_id": str(search_space_id)}, ) - assert resp.status_code == 413 - assert "total upload size" in resp.json()["detail"].lower() + assert resp.status_code == 200 + cleanup_doc_ids.extend(resp.json().get("document_ids", [])) diff --git a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py index 163dd0d1d..a8cf5c93b 100644 --- a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py +++ b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py @@ -248,7 +248,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner: return [] async def fake_build_scoped_filesystem(**kwargs): - return {} + return {}, {} monkeypatch.setattr( "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base", @@ -298,7 +298,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner: return [] async def fake_build_scoped_filesystem(**kwargs): - return {} + return {}, {} monkeypatch.setattr( "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base", @@ -334,7 +334,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner: return [] async def fake_build_scoped_filesystem(**kwargs): - return {} + return {}, {} monkeypatch.setattr( "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base", diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index 4e0c36267..1c246ed71 100644 --- 
a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -329,14 +329,15 @@ export function DocumentsTableShell({ const handleViewDocument = useCallback(async (doc: Document) => { setViewingDoc(doc); - if (doc.content) { - setViewingContent(doc.content); + const preview = doc.content_preview || doc.content; + if (preview) { + setViewingContent(preview); return; } setViewingLoading(true); try { const fullDoc = await documentsApiService.getDocument({ id: doc.id }); - setViewingContent(fullDoc.content); + setViewingContent(fullDoc.content_preview || fullDoc.content); } catch (err) { console.error("[DocumentsTableShell] Failed to fetch document content:", err); setViewingContent("Failed to load document content."); @@ -946,13 +947,36 @@ export function DocumentsTableShell({ WebkitMaskImage: `linear-gradient(to bottom, ${previewScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${previewScrollPos === "bottom" ? "black" : "transparent"})`, }} > - {viewingLoading ? ( -
- -
- ) : ( - - )} + {viewingLoading ? ( +
+ +
+ ) : ( + <> + + {viewingDoc && ( +
+ +
+ )} + + )}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts index d87f7374b..88914bd4f 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts @@ -9,9 +9,9 @@ export type Document = { id: number; title: string; document_type: DocumentType; - // Optional: Only needed when viewing document details (lazy loaded) document_metadata?: any; content?: string; + content_preview?: string; created_at: string; search_space_id: number; created_by_id?: string | null; diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx index 3ea36f800..4b7079aef 100644 --- a/surfsense_web/components/editor-panel/editor-panel.tsx +++ b/surfsense_web/components/editor-panel/editor-panel.tsx @@ -1,12 +1,13 @@ "use client"; import { useAtomValue, useSetAtom } from "jotai"; -import { AlertCircle, XIcon } from "lucide-react"; +import { AlertCircle, Download, FileText, Loader2, XIcon } from "lucide-react"; import dynamic from "next/dynamic"; import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom"; import { MarkdownViewer } from "@/components/markdown-viewer"; +import { Alert, AlertDescription } from "@/components/ui/alert"; import { Button } from "@/components/ui/button"; import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer"; import { Skeleton } from "@/components/ui/skeleton"; @@ -18,11 +19,16 @@ const PlateEditor = dynamic( { ssr: false, loading: () => } ); +const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB + interface EditorContent { document_id: number; title: string; document_type?: string; 
source_markdown: string; + content_size_bytes?: number; + chunk_count?: number; + truncated?: boolean; } const EDITABLE_DOCUMENT_TYPES = new Set(["FILE", "NOTE"]); @@ -62,6 +68,7 @@ export function EditorPanelContent({ const [isLoading, setIsLoading] = useState(true); const [error, setError] = useState(null); const [saving, setSaving] = useState(false); + const [downloading, setDownloading] = useState(false); const [editedMarkdown, setEditedMarkdown] = useState(null); const markdownRef = useRef(""); @@ -69,6 +76,8 @@ export function EditorPanelContent({ const changeCountRef = useRef(0); const [displayTitle, setDisplayTitle] = useState(title || "Untitled"); + const isLargeDocument = (editorDoc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD; + useEffect(() => { let cancelled = false; setIsLoading(true); @@ -86,10 +95,12 @@ export function EditorPanelContent({ } try { - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`, - { method: "GET" } + const url = new URL( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content` ); + url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD)); + + const response = await authenticatedFetch(url.toString(), { method: "GET" }); if (cancelled) return; @@ -175,7 +186,7 @@ export function EditorPanelContent({ }, [documentId, searchSpaceId]); const isEditableType = editorDoc - ? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "") + ? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "") && !isLargeDocument : false; return ( @@ -206,6 +217,57 @@ export function EditorPanelContent({

{error || "An unknown error occurred"}

+ ) : isLargeDocument ? ( +
+ + + + + This document is too large for the editor ({Math.round((editorDoc.content_size_bytes ?? 0) / 1024 / 1024)}MB, {editorDoc.chunk_count ?? 0} chunks). Showing a preview below. + + + + + +
) : isEditableType ? ( (null); const [isEditing, setIsEditing] = useState(false); const [saving, setSaving] = useState(false); + const [downloading, setDownloading] = useState(false); const [editedMarkdown, setEditedMarkdown] = useState(null); const markdownRef = useRef(""); const initialLoadDone = useRef(false); const changeCountRef = useRef(0); + const isLargeDocument = (doc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD; + useEffect(() => { let cancelled = false; setIsLoading(true); @@ -72,10 +81,12 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen } try { - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`, - { method: "GET" } + const url = new URL( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content` ); + url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD)); + + const response = await authenticatedFetch(url.toString(), { method: "GET" }); if (cancelled) return; @@ -173,9 +184,9 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen ); } - const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? ""); + const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? "") && !isLargeDocument; - if (isEditing) { + if (isEditing && !isLargeDocument) { return (
@@ -236,7 +247,60 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
- + {isLargeDocument ? ( + <> + + + + + This document is too large for the editor ({Math.round((doc.content_size_bytes ?? 0) / 1024 / 1024)}MB, {doc.chunk_count ?? 0} chunks). Showing a preview below. + + + + + + + ) : ( + + )}
diff --git a/surfsense_web/components/markdown-viewer.tsx b/surfsense_web/components/markdown-viewer.tsx index e22df8998..abd999301 100644 --- a/surfsense_web/components/markdown-viewer.tsx +++ b/surfsense_web/components/markdown-viewer.tsx @@ -15,6 +15,7 @@ const math = createMathPlugin({ interface MarkdownViewerProps { content: string; className?: string; + maxLength?: number; } /** @@ -79,8 +80,10 @@ function convertLatexDelimiters(content: string): string { return content; } -export function MarkdownViewer({ content, className }: MarkdownViewerProps) { - const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(content)); +export function MarkdownViewer({ content, className, maxLength }: MarkdownViewerProps) { + const isTruncated = maxLength != null && content.length > maxLength; + const displayContent = isTruncated ? content.slice(0, maxLength) : content; + const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(displayContent)); const components: StreamdownProps["components"] = { p: ({ children, ...props }) => (

@@ -171,6 +174,11 @@ export function MarkdownViewer({ content, className }: MarkdownViewerProps) { > {processedContent} + {isTruncated && ( +

+ Content truncated ({Math.round(content.length / 1024)}KB total). Showing first {Math.round(maxLength / 1024)}KB. +

+ )}
); } diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx index b02b2e217..c17616c53 100644 --- a/surfsense_web/components/new-chat/source-detail-panel.tsx +++ b/surfsense_web/components/new-chat/source-detail-panel.tsx @@ -1,7 +1,7 @@ "use client"; import { useQuery } from "@tanstack/react-query"; -import { BookOpen, ChevronDown, ExternalLink, FileText, Hash, Sparkles, X } from "lucide-react"; +import { BookOpen, ChevronDown, ChevronUp, ExternalLink, FileText, Hash, Loader2, Sparkles, X } from "lucide-react"; import { AnimatePresence, motion, useReducedMotion } from "motion/react"; import { useTranslations } from "next-intl"; import type React from "react"; @@ -10,7 +10,6 @@ import { createPortal } from "react-dom"; import { MarkdownViewer } from "@/components/markdown-viewer"; import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; -import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"; import { ScrollArea } from "@/components/ui/scroll-area"; import { Spinner } from "@/components/ui/spinner"; import type { @@ -48,7 +47,8 @@ const formatDocumentType = (type: string) => { // which break auto-scroll functionality interface ChunkCardProps { chunk: { id: number; content: string }; - index: number; + localIndex: number; + chunkNumber: number; totalChunks: number; isCited: boolean; isActive: boolean; @@ -56,11 +56,11 @@ interface ChunkCardProps { } const ChunkCard = memo( - forwardRef(({ chunk, index, totalChunks, isCited }, ref) => { + forwardRef(({ chunk, localIndex, chunkNumber, totalChunks, isCited }, ref) => { return (
- {/* Cited indicator glow effect */} {isCited &&
} - {/* Header */}
- {index + 1} + {chunkNumber}
- of {totalChunks} chunks + Chunk {chunkNumber} of {totalChunks}
{isCited && ( @@ -94,9 +92,8 @@ const ChunkCard = memo( )}
- {/* Content */}
- +
); @@ -118,7 +115,6 @@ export function SourceDetailPanel({ const t = useTranslations("dashboard"); const scrollAreaRef = useRef(null); const hasScrolledRef = useRef(false); // Use ref to avoid stale closures - const [summaryOpen, setSummaryOpen] = useState(false); const [activeChunkIndex, setActiveChunkIndex] = useState(null); const [mounted, setMounted] = useState(false); const [_hasScrolledToCited, setHasScrolledToCited] = useState(false); @@ -140,20 +136,88 @@ export function SourceDetailPanel({ if (isDocsChunk) { return documentsApiService.getSurfsenseDocByChunk(chunkId); } - return documentsApiService.getDocumentByChunk({ chunk_id: chunkId }); + return documentsApiService.getDocumentByChunk({ chunk_id: chunkId, chunk_window: 5 }); }, enabled: !!chunkId && open, staleTime: 5 * 60 * 1000, }); + const totalChunks = (documentData && "total_chunks" in documentData) + ? (documentData.total_chunks ?? documentData.chunks.length) + : (documentData?.chunks?.length ?? 0); + const [beforeChunks, setBeforeChunks] = useState>([]); + const [afterChunks, setAfterChunks] = useState>([]); + const [loadingBefore, setLoadingBefore] = useState(false); + const [loadingAfter, setLoadingAfter] = useState(false); + + useEffect(() => { + setBeforeChunks([]); + setAfterChunks([]); + }, [chunkId, open]); + + const chunkStartIndex = (documentData && "chunk_start_index" in documentData) + ? (documentData.chunk_start_index ?? 0) : 0; + const initialChunks = documentData?.chunks ?? 
[]; + const allChunks = [...beforeChunks, ...initialChunks, ...afterChunks]; + const absoluteStart = chunkStartIndex - beforeChunks.length; + const absoluteEnd = chunkStartIndex + initialChunks.length + afterChunks.length; + const canLoadBefore = absoluteStart > 0; + const canLoadAfter = absoluteEnd < totalChunks; + + const EXPAND_SIZE = 10; + + const loadBefore = useCallback(async () => { + if (!documentData || !("search_space_id" in documentData) || !canLoadBefore) return; + setLoadingBefore(true); + try { + const count = Math.min(EXPAND_SIZE, absoluteStart); + const result = await documentsApiService.getDocumentChunks({ + document_id: documentData.id, + page: 0, + page_size: count, + start_offset: absoluteStart - count, + }); + const existingIds = new Set(allChunks.map(c => c.id)); + const newChunks = result.items + .filter(c => !existingIds.has(c.id)) + .map(c => ({ id: c.id, content: c.content, created_at: c.created_at })); + setBeforeChunks(prev => [...newChunks, ...prev]); + } catch (err) { + console.error("Failed to load earlier chunks:", err); + } finally { + setLoadingBefore(false); + } + }, [documentData, absoluteStart, canLoadBefore, allChunks]); + + const loadAfter = useCallback(async () => { + if (!documentData || !("search_space_id" in documentData) || !canLoadAfter) return; + setLoadingAfter(true); + try { + const result = await documentsApiService.getDocumentChunks({ + document_id: documentData.id, + page: 0, + page_size: EXPAND_SIZE, + start_offset: absoluteEnd, + }); + const existingIds = new Set(allChunks.map(c => c.id)); + const newChunks = result.items + .filter(c => !existingIds.has(c.id)) + .map(c => ({ id: c.id, content: c.content, created_at: c.created_at })); + setAfterChunks(prev => [...prev, ...newChunks]); + } catch (err) { + console.error("Failed to load later chunks:", err); + } finally { + setLoadingAfter(false); + } + }, [documentData, absoluteEnd, canLoadAfter, allChunks]); + const isDirectRenderSource = sourceType === 
"TAVILY_API" || sourceType === "LINKUP_API" || sourceType === "SEARXNG_API" || sourceType === "BAIDU_SEARCH_API"; - // Find cited chunk index - const citedChunkIndex = documentData?.chunks?.findIndex((chunk) => chunk.id === chunkId) ?? -1; + const citedChunkIndex = allChunks.findIndex((chunk) => chunk.id === chunkId); // Simple scroll function that scrolls to a chunk by index const scrollToChunkByIndex = useCallback( @@ -336,12 +400,12 @@ export function SourceDetailPanel({ {documentData && "document_type" in documentData ? formatDocumentType(documentData.document_type) : sourceType && formatDocumentType(sourceType)} - {documentData?.chunks && ( - - • {documentData.chunks.length} chunk - {documentData.chunks.length !== 1 ? "s" : ""} - - )} + {totalChunks > 0 && ( + + • {totalChunks} chunk{totalChunks !== 1 ? "s" : ""} + {allChunks.length < totalChunks && ` (showing ${allChunks.length})`} + + )}

@@ -450,7 +514,7 @@ export function SourceDetailPanel({ {!isDirectRenderSource && documentData && (
{/* Chunk Navigation Sidebar */} - {documentData.chunks.length > 1 && ( + {allChunks.length > 1 && (
- {documentData.chunks.map((chunk, idx) => { + {allChunks.map((chunk, idx) => { + const absNum = absoluteStart + idx + 1; const isCited = chunk.id === chunkId; const isActive = activeChunkIndex === idx; return ( @@ -478,9 +543,9 @@ export function SourceDetailPanel({ ? "bg-muted text-foreground" : "bg-muted/50 text-muted-foreground hover:bg-muted hover:text-foreground" )} - title={isCited ? `Chunk ${idx + 1} (Cited)` : `Chunk ${idx + 1}`} + title={isCited ? `Chunk ${absNum} (Cited)` : `Chunk ${absNum}`} > - {idx + 1} + {absNum} {isCited && ( @@ -524,44 +589,11 @@ export function SourceDetailPanel({ )} - {/* Summary Collapsible */} - {documentData.content && ( - - - - - - Document Summary - - - - - - - - - - - - - )} - {/* Chunks Header */} -
+

- Content Chunks + Chunks {absoluteStart + 1}–{absoluteEnd} of {totalChunks}

{citedChunkIndex !== -1 && ( +
+ )} + {/* Chunks */}
- {documentData.chunks.map((chunk, idx) => { + {allChunks.map((chunk, idx) => { const isCited = chunk.id === chunkId; + const chunkNumber = absoluteStart + idx + 1; return ( 30} + disableLayoutAnimation={allChunks.length > 30} /> ); })}
+ + {/* Load Later */} + {canLoadAfter && ( +
+ +
+ )}
diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 6817b19db..faa042d8e 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -1,10 +1,10 @@ "use client"; import { useAtom } from "jotai"; -import { CheckCircle2, FileType, Info, Upload, X } from "lucide-react"; +import { CheckCircle2, FileType, FolderOpen, Info, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; -import { useCallback, useMemo, useRef, useState } from "react"; +import { type ChangeEvent, useCallback, useMemo, useRef, useState } from "react"; import { useDropzone } from "react-dropzone"; import { toast } from "sonner"; import { uploadDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms"; @@ -51,6 +51,7 @@ const commonTypes = { "application/vnd.openxmlformats-officedocument.presentationml.presentation": [".pptx"], "text/html": [".html", ".htm"], "text/csv": [".csv"], + "text/tab-separated-values": [".tsv"], "image/jpeg": [".jpg", ".jpeg"], "image/png": [".png"], "image/bmp": [".bmp"], @@ -76,7 +77,6 @@ const FILE_TYPE_CONFIG: Record> = { "application/rtf": [".rtf"], "application/xml": [".xml"], "application/epub+zip": [".epub"], - "text/tab-separated-values": [".tsv"], "text/html": [".html", ".htm", ".web"], "image/gif": [".gif"], "image/svg+xml": [".svg"], @@ -102,7 +102,6 @@ const FILE_TYPE_CONFIG: Record> = { "application/vnd.ms-powerpoint": [".ppt"], "text/x-rst": [".rst"], "application/rtf": [".rtf"], - "text/tab-separated-values": [".tsv"], "application/vnd.ms-excel": [".xls"], "application/xml": [".xml"], ...audioFileTypes, @@ -116,10 +115,8 @@ interface FileWithId { const cardClass = "border border-border bg-slate-400/5 dark:bg-white/5"; -// Upload limits — files are sent in batches of 5 to avoid proxy timeouts -const MAX_FILES = 50; -const MAX_TOTAL_SIZE_MB = 200; -const MAX_TOTAL_SIZE_BYTES = 
MAX_TOTAL_SIZE_MB * 1024 * 1024; +const MAX_FILE_SIZE_MB = 500; +const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024; export function DocumentUploadTab({ searchSpaceId, @@ -134,6 +131,7 @@ export function DocumentUploadTab({ const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom); const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation; const fileInputRef = useRef(null); + const folderInputRef = useRef(null); const acceptedFileTypes = useMemo(() => { const etlService = process.env.NEXT_PUBLIC_ETL_SERVICE; @@ -145,49 +143,76 @@ export function DocumentUploadTab({ [acceptedFileTypes] ); - const onDrop = useCallback( - (acceptedFiles: File[]) => { + const supportedExtensionsSet = useMemo( + () => new Set(supportedExtensions.map((ext) => ext.toLowerCase())), + [supportedExtensions] + ); + + const addFiles = useCallback( + (incoming: File[]) => { + const oversized = incoming.filter((f) => f.size > MAX_FILE_SIZE_BYTES); + if (oversized.length > 0) { + toast.error(t("file_too_large"), { + description: t("file_too_large_desc", { + name: oversized[0].name, + maxMB: MAX_FILE_SIZE_MB, + }), + }); + } + const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES); + if (valid.length === 0) return; + setFiles((prev) => { - const newEntries = acceptedFiles.map((f) => ({ + const newEntries = valid.map((f) => ({ id: crypto.randomUUID?.() ?? 
`file-${Date.now()}-${Math.random().toString(36)}`, file: f, })); - const newFiles = [...prev, ...newEntries]; - - if (newFiles.length > MAX_FILES) { - toast.error(t("max_files_exceeded"), { - description: t("max_files_exceeded_desc", { max: MAX_FILES }), - }); - return prev; - } - - const newTotalSize = newFiles.reduce((sum, entry) => sum + entry.file.size, 0); - if (newTotalSize > MAX_TOTAL_SIZE_BYTES) { - toast.error(t("max_size_exceeded"), { - description: t("max_size_exceeded_desc", { max: MAX_TOTAL_SIZE_MB }), - }); - return prev; - } - - return newFiles; + return [...prev, ...newEntries]; }); }, [t] ); + const onDrop = useCallback( + (acceptedFiles: File[]) => { + addFiles(acceptedFiles); + }, + [addFiles] + ); + const { getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop, accept: acceptedFileTypes, - maxSize: 50 * 1024 * 1024, // 50MB per file + maxSize: MAX_FILE_SIZE_BYTES, noClick: false, - disabled: files.length >= MAX_FILES, }); - // Handle file input click to prevent event bubbling that might reopen dialog const handleFileInputClick = useCallback((e: React.MouseEvent) => { e.stopPropagation(); }, []); + const handleFolderChange = useCallback( + (e: ChangeEvent) => { + const fileList = e.target.files; + if (!fileList || fileList.length === 0) return; + + const folderFiles = Array.from(fileList).filter((f) => { + const ext = f.name.includes(".") ? 
`.${f.name.split(".").pop()?.toLowerCase()}` : ""; + return ext !== "" && supportedExtensionsSet.has(ext); + }); + + if (folderFiles.length === 0) { + toast.error(t("no_supported_files_in_folder")); + e.target.value = ""; + return; + } + + addFiles(folderFiles); + e.target.value = ""; + }, + [addFiles, supportedExtensionsSet, t] + ); + const formatFileSize = (bytes: number) => { if (bytes === 0) return "0 Bytes"; const k = 1024; @@ -198,15 +223,6 @@ export function DocumentUploadTab({ const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0); - // Check if limits are reached - const isFileCountLimitReached = files.length >= MAX_FILES; - const isSizeLimitReached = totalFileSize >= MAX_TOTAL_SIZE_BYTES; - const remainingFiles = MAX_FILES - files.length; - const remainingSizeMB = Math.max( - 0, - (MAX_TOTAL_SIZE_BYTES - totalFileSize) / (1024 * 1024) - ).toFixed(1); - // Track accordion state changes const handleAccordionChange = useCallback( (value: string) => { @@ -257,11 +273,21 @@ export function DocumentUploadTab({ - {t("file_size_limit")}{" "} - {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} + {t("file_size_limit", { maxMB: MAX_FILE_SIZE_MB })}{" "} + {t("upload_limits")} + {/* Hidden folder input */} + )} + /> +
@@ -269,11 +295,7 @@ export function DocumentUploadTab({
- {isFileCountLimitReached ? ( -
- -
-

- {t("file_limit_reached")} -

-

- {t("file_limit_reached_desc", { max: MAX_FILES })} -

-
-
- ) : isDragActive ? ( + {isDragActive ? (

{t("drop_files")}

@@ -305,29 +315,35 @@ export function DocumentUploadTab({

{t("drag_drop")}

{t("or_browse")}

- {files.length > 0 && ( -

- {t("remaining_capacity", { files: remainingFiles, sizeMB: remainingSizeMB })} -

- )} -
- )} - {!isFileCountLimitReached && ( -
-
)} +
+ + +
diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts index 1a3326bae..f5431aecb 100644 --- a/surfsense_web/contracts/types/document.types.ts +++ b/surfsense_web/contracts/types/document.types.ts @@ -39,6 +39,7 @@ export const document = z.object({ document_type: documentTypeEnum, document_metadata: z.record(z.string(), z.any()), content: z.string(), + content_preview: z.string().optional().default(""), content_hash: z.string(), unique_identifier_hash: z.string().nullable(), created_at: z.string(), @@ -69,6 +70,8 @@ export const documentWithChunks = document.extend({ created_at: z.string(), }) ), + total_chunks: z.number().optional().default(0), + chunk_start_index: z.number().optional().default(0), }); /** @@ -243,10 +246,36 @@ export const getDocumentTypeCountsResponse = z.record(z.string(), z.number()); */ export const getDocumentByChunkRequest = z.object({ chunk_id: z.number(), + chunk_window: z.number().optional(), }); export const getDocumentByChunkResponse = documentWithChunks; +/** + * Get paginated chunks for a document + */ +export const getDocumentChunksRequest = z.object({ + document_id: z.number(), + page: z.number().optional().default(0), + page_size: z.number().optional().default(20), + start_offset: z.number().optional(), +}); + +export const chunkRead = z.object({ + id: z.number(), + content: z.string(), + document_id: z.number(), + created_at: z.string(), +}); + +export const getDocumentChunksResponse = z.object({ + items: z.array(chunkRead), + total: z.number(), + page: z.number(), + page_size: z.number(), + has_more: z.boolean(), +}); + /** * Get Surfsense docs by chunk */ @@ -328,3 +357,6 @@ export type GetSurfsenseDocsByChunkRequest = z.infer; export type GetSurfsenseDocsRequest = z.infer; export type GetSurfsenseDocsResponse = z.infer; +export type GetDocumentChunksRequest = z.infer; +export type GetDocumentChunksResponse = z.infer; +export type ChunkRead = z.infer; diff --git 
a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts index 14a247032..71fa58852 100644 --- a/surfsense_web/lib/apis/documents-api.service.ts +++ b/surfsense_web/lib/apis/documents-api.service.ts @@ -6,6 +6,7 @@ import { deleteDocumentRequest, deleteDocumentResponse, type GetDocumentByChunkRequest, + type GetDocumentChunksRequest, type GetDocumentRequest, type GetDocumentsRequest, type GetDocumentsStatusRequest, @@ -13,6 +14,8 @@ import { type GetSurfsenseDocsRequest, getDocumentByChunkRequest, getDocumentByChunkResponse, + getDocumentChunksRequest, + getDocumentChunksResponse, getDocumentRequest, getDocumentResponse, getDocumentsRequest, @@ -295,23 +298,52 @@ class DocumentsApiService { }; /** - * Get document by chunk ID (includes all chunks) + * Get document by chunk ID (includes a window of chunks around the cited one) */ getDocumentByChunk = async (request: GetDocumentByChunkRequest) => { - // Validate the request const parsedRequest = getDocumentByChunkRequest.safeParse(request); if (!parsedRequest.success) { console.error("Invalid request:", parsedRequest.error); - // Format a user friendly error message const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", "); throw new ValidationError(`Invalid request: ${errorMessage}`); } + const params = new URLSearchParams(); + if (request.chunk_window != null) { + params.set("chunk_window", String(request.chunk_window)); + } + const qs = params.toString(); + const url = `/api/v1/documents/by-chunk/${request.chunk_id}${qs ? 
`?${qs}` : ""}`; + + return baseApiService.get(url, getDocumentByChunkResponse); + }; + + /** + * Get paginated chunks for a document + */ + getDocumentChunks = async (request: GetDocumentChunksRequest) => { + const parsedRequest = getDocumentChunksRequest.safeParse(request); + + if (!parsedRequest.success) { + console.error("Invalid request:", parsedRequest.error); + + const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", "); + throw new ValidationError(`Invalid request: ${errorMessage}`); + } + + const params = new URLSearchParams({ + page: String(parsedRequest.data.page), + page_size: String(parsedRequest.data.page_size), + }); + if (parsedRequest.data.start_offset != null) { + params.set("start_offset", String(parsedRequest.data.start_offset)); + } + return baseApiService.get( - `/api/v1/documents/by-chunk/${request.chunk_id}`, - getDocumentByChunkResponse + `/api/v1/documents/${parsedRequest.data.document_id}/chunks?${params}`, + getDocumentChunksResponse ); }; diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index 53f80ea5f..cacaec557 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -376,12 +376,13 @@ "upload_documents": { "title": "Upload Documents", "subtitle": "Upload your files to make them searchable and accessible through AI-powered conversations.", - "file_size_limit": "Maximum file size: 50MB per file.", - "upload_limits": "Upload limit: {maxFiles} files, {maxSizeMB}MB total.", - "drop_files": "Drop files here", - "drag_drop": "Drag & drop files here", - "or_browse": "or click to browse", + "file_size_limit": "Maximum file size: {maxMB}MB per file.", + "upload_limits": "Upload files or entire folders", + "drop_files": "Drop files or folders here", + "drag_drop": "Drag & drop files or folders here", + "or_browse": "or click to browse files and folders", "browse_files": "Browse Files", + "browse_folder": "Browse Folder", "selected_files": "Selected Files 
({count})", "total_size": "Total size", "clear_all": "Clear all", @@ -394,13 +395,9 @@ "upload_error_desc": "Error uploading files", "supported_file_types": "Supported File Types", "file_types_desc": "These file types are supported based on your current ETL service configuration.", - "max_files_exceeded": "File Limit Exceeded", - "max_files_exceeded_desc": "You can upload a maximum of {max} files at a time.", - "max_size_exceeded": "Size Limit Exceeded", - "max_size_exceeded_desc": "Total file size cannot exceed {max}MB.", - "file_limit_reached": "Maximum Files Reached", - "file_limit_reached_desc": "Remove some files to add more (max {max} files).", - "remaining_capacity": "{files} files remaining • {sizeMB}MB available" + "file_too_large": "File Too Large", + "file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.", + "no_supported_files_in_folder": "No supported file types found in the selected folder." }, "add_webpage": { "title": "Add Webpages for Crawling", diff --git a/surfsense_web/messages/es.json b/surfsense_web/messages/es.json index 36e627295..7670e76df 100644 --- a/surfsense_web/messages/es.json +++ b/surfsense_web/messages/es.json @@ -376,12 +376,13 @@ "upload_documents": { "title": "Subir documentos", "subtitle": "Sube tus archivos para hacerlos buscables y accesibles a través de conversaciones con IA.", - "file_size_limit": "Tamaño máximo de archivo: 50 MB por archivo.", - "upload_limits": "Límite de subida: {maxFiles} archivos, {maxSizeMB} MB en total.", - "drop_files": "Suelta los archivos aquí", - "drag_drop": "Arrastra y suelta archivos aquí", - "or_browse": "o haz clic para explorar", + "file_size_limit": "Tamaño máximo de archivo: {maxMB} MB por archivo.", + "upload_limits": "Sube archivos o carpetas enteras", + "drop_files": "Suelta archivos o carpetas aquí", + "drag_drop": "Arrastra y suelta archivos o carpetas aquí", + "or_browse": "o haz clic para explorar archivos y carpetas", "browse_files": "Explorar archivos", + 
"browse_folder": "Explorar carpeta", "selected_files": "Archivos seleccionados ({count})", "total_size": "Tamaño total", "clear_all": "Limpiar todo", @@ -394,13 +395,9 @@ "upload_error_desc": "Error al subir archivos", "supported_file_types": "Tipos de archivo soportados", "file_types_desc": "Estos tipos de archivo son soportados según la configuración actual de tu servicio ETL.", - "max_files_exceeded": "Límite de archivos excedido", - "max_files_exceeded_desc": "Puedes subir un máximo de {max} archivos a la vez.", - "max_size_exceeded": "Límite de tamaño excedido", - "max_size_exceeded_desc": "El tamaño total de los archivos no puede exceder {max} MB.", - "file_limit_reached": "Máximo de archivos alcanzado", - "file_limit_reached_desc": "Elimina algunos archivos para agregar más (máximo {max} archivos).", - "remaining_capacity": "{files} archivos restantes • {sizeMB} MB disponibles" + "file_too_large": "Archivo demasiado grande", + "file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.", + "no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada." 
}, "add_webpage": { "title": "Agregar páginas web para rastreo", diff --git a/surfsense_web/messages/hi.json b/surfsense_web/messages/hi.json index fd51acdc2..cbcff0b30 100644 --- a/surfsense_web/messages/hi.json +++ b/surfsense_web/messages/hi.json @@ -376,12 +376,13 @@ "upload_documents": { "title": "दस्तावेज़ अपलोड करें", "subtitle": "AI-संचालित बातचीत के माध्यम से अपनी फ़ाइलों को खोजने योग्य और सुलभ बनाने के लिए अपलोड करें।", - "file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल 50MB।", - "upload_limits": "अपलोड सीमा: {maxFiles} फ़ाइलें, कुल {maxSizeMB}MB।", - "drop_files": "फ़ाइलें यहां छोड़ें", - "drag_drop": "फ़ाइलें यहां खींचें और छोड़ें", - "or_browse": "या ब्राउज़ करने के लिए क्लिक करें", + "file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल {maxMB}MB।", + "upload_limits": "फ़ाइलें या पूरे फ़ोल्डर अपलोड करें", + "drop_files": "फ़ाइलें या फ़ोल्डर यहां छोड़ें", + "drag_drop": "फ़ाइलें या फ़ोल्डर यहां खींचें और छोड़ें", + "or_browse": "या फ़ाइलें और फ़ोल्डर ब्राउज़ करने के लिए क्लिक करें", "browse_files": "फ़ाइलें ब्राउज़ करें", + "browse_folder": "फ़ोल्डर ब्राउज़ करें", "selected_files": "चयनित फ़ाइलें ({count})", "total_size": "कुल आकार", "clear_all": "सभी साफ करें", @@ -394,13 +395,9 @@ "upload_error_desc": "फ़ाइलें अपलोड करने में त्रुटि", "supported_file_types": "समर्थित फ़ाइल प्रकार", "file_types_desc": "ये फ़ाइल प्रकार आपकी वर्तमान ETL सेवा कॉन्फ़िगरेशन के आधार पर समर्थित हैं।", - "max_files_exceeded": "फ़ाइल सीमा पार हो गई", - "max_files_exceeded_desc": "आप एक बार में अधिकतम {max} फ़ाइलें अपलोड कर सकते हैं।", - "max_size_exceeded": "आकार सीमा पार हो गई", - "max_size_exceeded_desc": "कुल फ़ाइल आकार {max}MB से अधिक नहीं हो सकता।", - "file_limit_reached": "अधिकतम फ़ाइलें पहुंच गई", - "file_limit_reached_desc": "और जोड़ने के लिए कुछ फ़ाइलें हटाएं (अधिकतम {max} फ़ाइलें)।", - "remaining_capacity": "{files} फ़ाइलें शेष • {sizeMB}MB उपलब्ध" + "file_too_large": "फ़ाइल बहुत बड़ी है", + "file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।", 
+ "no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।" }, "add_webpage": { "title": "क्रॉलिंग के लिए वेबपेज जोड़ें", diff --git a/surfsense_web/messages/pt.json b/surfsense_web/messages/pt.json index e26499f90..ec72ef0da 100644 --- a/surfsense_web/messages/pt.json +++ b/surfsense_web/messages/pt.json @@ -376,12 +376,13 @@ "upload_documents": { "title": "Enviar documentos", "subtitle": "Envie seus arquivos para torná-los pesquisáveis e acessíveis através de conversas com IA.", - "file_size_limit": "Tamanho máximo do arquivo: 50 MB por arquivo.", - "upload_limits": "Limite de envio: {maxFiles} arquivos, {maxSizeMB} MB no total.", - "drop_files": "Solte os arquivos aqui", - "drag_drop": "Arraste e solte arquivos aqui", - "or_browse": "ou clique para navegar", + "file_size_limit": "Tamanho máximo do arquivo: {maxMB} MB por arquivo.", + "upload_limits": "Envie arquivos ou pastas inteiras", + "drop_files": "Solte arquivos ou pastas aqui", + "drag_drop": "Arraste e solte arquivos ou pastas aqui", + "or_browse": "ou clique para navegar arquivos e pastas", "browse_files": "Navegar arquivos", + "browse_folder": "Navegar pasta", "selected_files": "Arquivos selecionados ({count})", "total_size": "Tamanho total", "clear_all": "Limpar tudo", @@ -394,13 +395,9 @@ "upload_error_desc": "Erro ao enviar arquivos", "supported_file_types": "Tipos de arquivo suportados", "file_types_desc": "Estes tipos de arquivo são suportados com base na configuração atual do seu serviço ETL.", - "max_files_exceeded": "Limite de arquivos excedido", - "max_files_exceeded_desc": "Você pode enviar no máximo {max} arquivos de uma vez.", - "max_size_exceeded": "Limite de tamanho excedido", - "max_size_exceeded_desc": "O tamanho total dos arquivos não pode exceder {max} MB.", - "file_limit_reached": "Máximo de arquivos atingido", - "file_limit_reached_desc": "Remova alguns arquivos para adicionar mais (máximo {max} arquivos).", - "remaining_capacity": "{files} arquivos 
restantes • {sizeMB} MB disponíveis" + "file_too_large": "Arquivo muito grande", + "file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.", + "no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada." }, "add_webpage": { "title": "Adicionar páginas web para rastreamento", diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json index 819432410..db634dfd9 100644 --- a/surfsense_web/messages/zh.json +++ b/surfsense_web/messages/zh.json @@ -360,12 +360,13 @@ "upload_documents": { "title": "上传文档", "subtitle": "上传您的文件,使其可通过 AI 对话进行搜索和访问。", - "file_size_limit": "最大文件大小:每个文件 50MB。", - "upload_limits": "上传限制:最多 {maxFiles} 个文件,总大小不超过 {maxSizeMB}MB。", - "drop_files": "放下文件到这里", - "drag_drop": "拖放文件到这里", - "or_browse": "或点击浏览", + "file_size_limit": "最大文件大小:每个文件 {maxMB}MB。", + "upload_limits": "上传文件或整个文件夹", + "drop_files": "将文件或文件夹拖放到此处", + "drag_drop": "将文件或文件夹拖放到此处", + "or_browse": "或点击浏览文件和文件夹", "browse_files": "浏览文件", + "browse_folder": "浏览文件夹", "selected_files": "已选择的文件 ({count})", "total_size": "总大小", "clear_all": "全部清除", @@ -378,13 +379,9 @@ "upload_error_desc": "上传文件时出错", "supported_file_types": "支持的文件类型", "file_types_desc": "根据您当前的 ETL 服务配置支持这些文件类型。", - "max_files_exceeded": "超过文件数量限制", - "max_files_exceeded_desc": "一次最多只能上传 {max} 个文件。", - "max_size_exceeded": "超过文件大小限制", - "max_size_exceeded_desc": "文件总大小不能超过 {max}MB。", - "file_limit_reached": "已达到最大文件数量", - "file_limit_reached_desc": "移除一些文件以添加更多(最多 {max} 个文件)。", - "remaining_capacity": "剩余 {files} 个文件名额 • 可用 {sizeMB}MB" + "file_too_large": "文件过大", + "file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。", + "no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。" }, "add_webpage": { "title": "添加网页爬取", From eb1785027471dbce67a0b9e09f8ce6791c57c869 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Thu, 2 Apr 2026 19:45:28 -0700 Subject: [PATCH 064/202] chore: linting --- .../components/DocumentsTableShell.tsx | 60 ++++---- 
.../new-chat/[[...chat_id]]/page.tsx | 2 +- surfsense_web/app/docs/[[...slug]]/page.tsx | 2 +- surfsense_web/app/error.tsx | 1 - .../comment-composer/comment-composer.tsx | 5 +- .../components/editor-panel/editor-panel.tsx | 4 +- .../layout/ui/tabs/DocumentTabContent.tsx | 4 +- surfsense_web/components/markdown-viewer.tsx | 3 +- .../new-chat/source-detail-panel.tsx | 141 ++++++++++-------- .../components/sources/DocumentUploadTab.tsx | 3 +- surfsense_web/components/ui/checkbox.tsx | 2 +- surfsense_web/components/ui/dropdown-menu.tsx | 2 +- surfsense_web/components/ui/toggle-group.tsx | 2 +- surfsense_web/components/ui/toggle.tsx | 2 +- .../query-client/query-client.provider.tsx | 2 +- 15 files changed, 127 insertions(+), 108 deletions(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index 1c246ed71..ceef9f2e1 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -947,36 +947,36 @@ export function DocumentsTableShell({ WebkitMaskImage: `linear-gradient(to bottom, ${previewScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${previewScrollPos === "bottom" ? "black" : "transparent"})`, }} > - {viewingLoading ? ( -
- -
- ) : ( - <> - - {viewingDoc && ( -
- -
- )} - - )} + {viewingLoading ? ( +
+ +
+ ) : ( + <> + + {viewingDoc && ( +
+ +
+ )} + + )}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx index 9e9374dd6..ac1fcdaf9 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/new-chat/[[...chat_id]]/page.tsx @@ -232,7 +232,7 @@ export default function NewChatPage() { const prevById = new Map(prev.map((m) => [m.id, m])); return syncedMessages.map((msg) => { - const member = msg.author_id ? memberById.get(msg.author_id) ?? null : null; + const member = msg.author_id ? (memberById.get(msg.author_id) ?? null) : null; // Preserve existing author info if member lookup fails (e.g., cloned chats) const existingMsg = prevById.get(`msg-${msg.id}`); diff --git a/surfsense_web/app/docs/[[...slug]]/page.tsx b/surfsense_web/app/docs/[[...slug]]/page.tsx index 166131f90..0905be8a0 100644 --- a/surfsense_web/app/docs/[[...slug]]/page.tsx +++ b/surfsense_web/app/docs/[[...slug]]/page.tsx @@ -1,8 +1,8 @@ import { DocsBody, DocsDescription, DocsPage, DocsTitle } from "fumadocs-ui/page"; import { notFound } from "next/navigation"; +import { cache } from "react"; import { source } from "@/lib/source"; import { getMDXComponents } from "@/mdx-components"; -import { cache } from "react"; const getDocPage = cache((slug?: string[]) => { return source.getPage(slug); diff --git a/surfsense_web/app/error.tsx b/surfsense_web/app/error.tsx index 3935f84d5..70c3d9632 100644 --- a/surfsense_web/app/error.tsx +++ b/surfsense_web/app/error.tsx @@ -1,6 +1,5 @@ "use client"; - import { useEffect } from "react"; export default function ErrorPage({ diff --git a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx index bee3f2da6..3d6ea384b 100644 --- a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx +++ 
b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx @@ -16,10 +16,7 @@ function convertDisplayToData(displayContent: string, mentions: InsertedMention[ const sortedMentions = [...mentions].sort((a, b) => b.displayName.length - a.displayName.length); const mentionPatterns = sortedMentions.map((mention) => ({ - pattern: new RegExp( - `@${escapeRegExp(mention.displayName)}(?=\\s|$|[.,!?;:])`, - "g" - ), + pattern: new RegExp(`@${escapeRegExp(mention.displayName)}(?=\\s|$|[.,!?;:])`, "g"), dataFormat: `@[${mention.id}]`, })); diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx index 4b7079aef..3f167dc24 100644 --- a/surfsense_web/components/editor-panel/editor-panel.tsx +++ b/surfsense_web/components/editor-panel/editor-panel.tsx @@ -223,7 +223,9 @@ export function EditorPanelContent({ - This document is too large for the editor ({Math.round((editorDoc.content_size_bytes ?? 0) / 1024 / 1024)}MB, {editorDoc.chunk_count ?? 0} chunks). Showing a preview below. + This document is too large for the editor ( + {Math.round((editorDoc.content_size_bytes ?? 0) / 1024 / 1024)}MB,{" "} + {editorDoc.chunk_count ?? 0} chunks). Showing a preview below.
diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx index c17616c53..bff088971 100644 --- a/surfsense_web/components/new-chat/source-detail-panel.tsx +++ b/surfsense_web/components/new-chat/source-detail-panel.tsx @@ -1,7 +1,17 @@ "use client"; import { useQuery } from "@tanstack/react-query"; -import { BookOpen, ChevronDown, ChevronUp, ExternalLink, FileText, Hash, Loader2, Sparkles, X } from "lucide-react"; +import { + BookOpen, + ChevronDown, + ChevronUp, + ExternalLink, + FileText, + Hash, + Loader2, + Sparkles, + X, +} from "lucide-react"; import { AnimatePresence, motion, useReducedMotion } from "motion/react"; import { useTranslations } from "next-intl"; import type React from "react"; @@ -56,48 +66,52 @@ interface ChunkCardProps { } const ChunkCard = memo( - forwardRef(({ chunk, localIndex, chunkNumber, totalChunks, isCited }, ref) => { - return ( -
- {isCited &&
} - -
-
-
- {chunkNumber} -
- Chunk {chunkNumber} of {totalChunks} -
- {isCited && ( - - - Cited Source - + forwardRef( + ({ chunk, localIndex, chunkNumber, totalChunks, isCited }, ref) => { + return ( +
+ > + {isCited &&
} -
- +
+
+
+ {chunkNumber} +
+ + Chunk {chunkNumber} of {totalChunks} + +
+ {isCited && ( + + + Cited Source + + )} +
+ +
+ +
-
- ); - }) + ); + } + ) ); ChunkCard.displayName = "ChunkCard"; @@ -142,11 +156,16 @@ export function SourceDetailPanel({ staleTime: 5 * 60 * 1000, }); - const totalChunks = (documentData && "total_chunks" in documentData) - ? (documentData.total_chunks ?? documentData.chunks.length) - : (documentData?.chunks?.length ?? 0); - const [beforeChunks, setBeforeChunks] = useState>([]); - const [afterChunks, setAfterChunks] = useState>([]); + const totalChunks = + documentData && "total_chunks" in documentData + ? (documentData.total_chunks ?? documentData.chunks.length) + : (documentData?.chunks?.length ?? 0); + const [beforeChunks, setBeforeChunks] = useState< + Array<{ id: number; content: string; created_at: string }> + >([]); + const [afterChunks, setAfterChunks] = useState< + Array<{ id: number; content: string; created_at: string }> + >([]); const [loadingBefore, setLoadingBefore] = useState(false); const [loadingAfter, setLoadingAfter] = useState(false); @@ -155,8 +174,8 @@ export function SourceDetailPanel({ setAfterChunks([]); }, [chunkId, open]); - const chunkStartIndex = (documentData && "chunk_start_index" in documentData) - ? (documentData.chunk_start_index ?? 0) : 0; + const chunkStartIndex = + documentData && "chunk_start_index" in documentData ? (documentData.chunk_start_index ?? 0) : 0; const initialChunks = documentData?.chunks ?? 
[]; const allChunks = [...beforeChunks, ...initialChunks, ...afterChunks]; const absoluteStart = chunkStartIndex - beforeChunks.length; @@ -177,11 +196,11 @@ export function SourceDetailPanel({ page_size: count, start_offset: absoluteStart - count, }); - const existingIds = new Set(allChunks.map(c => c.id)); + const existingIds = new Set(allChunks.map((c) => c.id)); const newChunks = result.items - .filter(c => !existingIds.has(c.id)) - .map(c => ({ id: c.id, content: c.content, created_at: c.created_at })); - setBeforeChunks(prev => [...newChunks, ...prev]); + .filter((c) => !existingIds.has(c.id)) + .map((c) => ({ id: c.id, content: c.content, created_at: c.created_at })); + setBeforeChunks((prev) => [...newChunks, ...prev]); } catch (err) { console.error("Failed to load earlier chunks:", err); } finally { @@ -199,11 +218,11 @@ export function SourceDetailPanel({ page_size: EXPAND_SIZE, start_offset: absoluteEnd, }); - const existingIds = new Set(allChunks.map(c => c.id)); + const existingIds = new Set(allChunks.map((c) => c.id)); const newChunks = result.items - .filter(c => !existingIds.has(c.id)) - .map(c => ({ id: c.id, content: c.content, created_at: c.created_at })); - setAfterChunks(prev => [...prev, ...newChunks]); + .filter((c) => !existingIds.has(c.id)) + .map((c) => ({ id: c.id, content: c.content, created_at: c.created_at })); + setAfterChunks((prev) => [...prev, ...newChunks]); } catch (err) { console.error("Failed to load later chunks:", err); } finally { @@ -400,12 +419,12 @@ export function SourceDetailPanel({ {documentData && "document_type" in documentData ? formatDocumentType(documentData.document_type) : sourceType && formatDocumentType(sourceType)} - {totalChunks > 0 && ( - - • {totalChunks} chunk{totalChunks !== 1 ? "s" : ""} - {allChunks.length < totalChunks && ` (showing ${allChunks.length})`} - - )} + {totalChunks > 0 && ( + + • {totalChunks} chunk{totalChunks !== 1 ? 
"s" : ""} + {allChunks.length < totalChunks && ` (showing ${allChunks.length})`} + + )}

diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index faa042d8e..723a3ad36 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -273,8 +273,7 @@ export function DocumentUploadTab({ - {t("file_size_limit", { maxMB: MAX_FILE_SIZE_MB })}{" "} - {t("upload_limits")} + {t("file_size_limit", { maxMB: MAX_FILE_SIZE_MB })} {t("upload_limits")} diff --git a/surfsense_web/components/ui/checkbox.tsx b/surfsense_web/components/ui/checkbox.tsx index 586e3e602..056761547 100644 --- a/surfsense_web/components/ui/checkbox.tsx +++ b/surfsense_web/components/ui/checkbox.tsx @@ -1,7 +1,7 @@ "use client"; -import { CheckIcon } from "lucide-react"; import * as CheckboxPrimitive from "@radix-ui/react-checkbox"; +import { CheckIcon } from "lucide-react"; import type * as React from "react"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/dropdown-menu.tsx b/surfsense_web/components/ui/dropdown-menu.tsx index 2904b93dd..d387a4592 100644 --- a/surfsense_web/components/ui/dropdown-menu.tsx +++ b/surfsense_web/components/ui/dropdown-menu.tsx @@ -1,7 +1,7 @@ "use client"; -import { CheckIcon, ChevronRightIcon, CircleIcon } from "lucide-react"; import * as DropdownMenuPrimitive from "@radix-ui/react-dropdown-menu"; +import { CheckIcon, ChevronRightIcon, CircleIcon } from "lucide-react"; import type * as React from "react"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/toggle-group.tsx b/surfsense_web/components/ui/toggle-group.tsx index 33aa433b2..6740fade0 100644 --- a/surfsense_web/components/ui/toggle-group.tsx +++ b/surfsense_web/components/ui/toggle-group.tsx @@ -1,7 +1,7 @@ "use client"; -import type { VariantProps } from "class-variance-authority"; import * as ToggleGroupPrimitive from "@radix-ui/react-toggle-group"; +import type { VariantProps } from "class-variance-authority"; 
import * as React from "react"; import { toggleVariants } from "@/components/ui/toggle"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/components/ui/toggle.tsx b/surfsense_web/components/ui/toggle.tsx index f0e68cec9..6a6e24025 100644 --- a/surfsense_web/components/ui/toggle.tsx +++ b/surfsense_web/components/ui/toggle.tsx @@ -1,7 +1,7 @@ "use client"; -import { cva, type VariantProps } from "class-variance-authority"; import * as TogglePrimitive from "@radix-ui/react-toggle"; +import { cva, type VariantProps } from "class-variance-authority"; import type * as React from "react"; import { cn } from "@/lib/utils"; diff --git a/surfsense_web/lib/query-client/query-client.provider.tsx b/surfsense_web/lib/query-client/query-client.provider.tsx index 6dc2a4258..30c6d9767 100644 --- a/surfsense_web/lib/query-client/query-client.provider.tsx +++ b/surfsense_web/lib/query-client/query-client.provider.tsx @@ -1,6 +1,6 @@ "use client"; -import dynamic from "next/dynamic"; import { QueryClientAtomProvider } from "jotai-tanstack-query/react"; +import dynamic from "next/dynamic"; import { queryClient } from "./client"; const ReactQueryDevtools = dynamic( From 9a370a37d4b485ca9f25e1f7db74bab951e1ce04 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Thu, 2 Apr 2026 20:06:02 -0700 Subject: [PATCH 065/202] docs: update README files to highlight SurfSense advantages over NotebookLM - Expanded sections detailing limitations of NotebookLM. - Added comparison table between SurfSense and Google NotebookLM. - Enhanced descriptions of SurfSense features and capabilities. - Updated example links and images for better clarity and relevance. 
--- README.es.md | 64 +++++++++++++++++++++++++++++++++---------------- README.hi.md | 64 +++++++++++++++++++++++++++++++++---------------- README.md | 62 ++++++++++++++++++++++++++++++++--------------- README.pt-BR.md | 64 +++++++++++++++++++++++++++++++++---------------- README.zh-CN.md | 64 +++++++++++++++++++++++++++++++++---------------- 5 files changed, 219 insertions(+), 99 deletions(-) diff --git a/README.es.md b/README.es.md index d61504cd5..b62d2cece 100644 --- a/README.es.md +++ b/README.es.md @@ -21,9 +21,28 @@
# SurfSense -Conecta cualquier LLM a tus fuentes de conocimiento internas y chatea con él en tiempo real junto a tu equipo. Alternativa de código abierto a NotebookLM, Perplexity y Glean. -SurfSense es un agente de investigación de IA altamente personalizable, conectado a fuentes externas como motores de búsqueda (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian y más por venir. +NotebookLM es una de las mejores y más útiles plataformas de IA que existen, pero una vez que comienzas a usarla regularmente también sientes sus limitaciones dejando algo que desear. + +1. Hay límites en la cantidad de fuentes que puedes agregar en un notebook. +2. Hay límites en la cantidad de notebooks que puedes tener. +3. No puedes tener fuentes que excedan 500,000 palabras y más de 200MB. +4. Estás bloqueado con los servicios de Google (LLMs, modelos de uso, etc.) sin opción de configurarlos. +5. Fuentes de datos externas e integraciones de servicios limitadas. +6. El agente de NotebookLM está específicamente optimizado solo para estudiar e investigar, pero puedes hacer mucho más con los datos de origen. +7. Falta de soporte multijugador. + +...y más. + +**SurfSense está específicamente hecho para resolver estos problemas.** SurfSense te permite: + +- **Controla Tu Flujo de Datos** - Mantén tus datos privados y seguros. +- **Sin Límites de Datos** - Agrega una cantidad ilimitada de fuentes y notebooks. +- **Sin Dependencia de Proveedores** - Configura cualquier modelo LLM, de imagen, TTS y STT. +- **25+ Fuentes de Datos Externas** - Agrega tus fuentes desde Google Drive, OneDrive, Dropbox, Notion y muchos otros servicios externos. +- **Soporte Multijugador en Tiempo Real** - Trabaja fácilmente con los miembros de tu equipo en un notebook compartido. + +...y más por venir. 
@@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1 ## Ejemplo de Agente de Video -https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562 +https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a @@ -133,24 +152,29 @@ Para Docker Compose, instalación manual y otras opciones de despliegue, consult

Comentarios en Tiempo Real

-## Funcionalidades Principales +## SurfSense vs Google NotebookLM -| Funcionalidad | Descripción | -|----------------|-------------| -| Alternativa OSS | Reemplazo directo de NotebookLM, Perplexity y Glean con colaboración en equipo en tiempo real | -| 50+ Formatos de Archivo | Sube documentos, imágenes, videos vía LlamaCloud, Unstructured o Docling (local) | -| Búsqueda Híbrida | Semántica + Texto completo con Índices Jerárquicos y Reciprocal Rank Fusion | -| Respuestas con Citas | Chatea con tu base de conocimiento y obtén respuestas citadas al estilo Perplexity | -| Arquitectura de Agentes Profundos | Impulsado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) con planificación, subagentes y acceso al sistema de archivos | -| Soporte Universal de LLM | 100+ LLMs, 6000+ modelos de embeddings, todos los principales rerankers vía OpenAI spec y LiteLLM | -| Privacidad Primero | Soporte completo de LLM local (vLLM, Ollama) tus datos son tuyos | -| Colaboración en Equipo | RBAC con roles de Propietario / Admin / Editor / Visor, chat en tiempo real e hilos de comentarios | -| Generación de Videos | Genera videos con narración y visuales | -| Generación de Presentaciones | Crea presentaciones editables basadas en diapositivas | -| Generación de Podcasts | Podcast de 3 min en menos de 20 segundos; múltiples proveedores TTS (OpenAI, Azure, Kokoro) | -| Extensión de Navegador | Extensión multi-navegador para guardar cualquier página web, incluyendo páginas protegidas por autenticación | -| 27+ Conectores | Motores de búsqueda, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord y [más](#fuentes-externas) | -| Auto-Hospedable | Código abierto, Docker en un solo comando o Docker Compose completo para producción | +| Característica | Google NotebookLM | SurfSense | +|---------|-------------------|-----------| +| **Fuentes por Notebook** | 50 (Gratis) a 600 (Ultra, $249.99/mes) | Ilimitadas | +| **Número de 
Notebooks** | 100 (Gratis) a 500 (planes de pago) | Ilimitados | +| **Límite de Tamaño de Fuente** | 500,000 palabras / 200MB por fuente | Sin límite | +| **Precios** | Nivel gratuito disponible; Pro $19.99/mes, Ultra $249.99/mes | Gratuito y de código abierto, auto-hospedable en tu propia infra | +| **Soporte de LLM** | Solo Google Gemini | 100+ LLMs vía OpenAI spec y LiteLLM | +| **Modelos de Embeddings** | Solo Google | 6,000+ modelos de embeddings, todos los principales rerankers | +| **LLMs Locales / Privados** | No disponible | Soporte completo (vLLM, Ollama) - tus datos son tuyos | +| **Auto-Hospedable** | No | Sí - Docker en un solo comando o Docker Compose completo | +| **Código Abierto** | No | Sí | +| **Conectores Externos** | Google Drive, YouTube, sitios web | 27+ conectores - Motores de búsqueda, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord y [más](#fuentes-externas) | +| **Soporte de Formatos de Archivo** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, imágenes, URLs web, YouTube | 50+ formatos - documentos, imágenes, videos vía LlamaCloud, Unstructured o Docling (local) | +| **Búsqueda** | Búsqueda semántica | Búsqueda Híbrida - Semántica + Texto completo con Índices Jerárquicos y Reciprocal Rank Fusion | +| **Respuestas con Citas** | Sí | Sí - Respuestas citadas al estilo Perplexity | +| **Arquitectura de Agentes** | No | Sí - impulsado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) con planificación, subagentes y acceso al sistema de archivos | +| **Multijugador en Tiempo Real** | Notebooks compartidos con roles de Visor/Editor (sin chat en tiempo real) | RBAC con roles de Propietario / Admin / Editor / Visor, chat en tiempo real e hilos de comentarios | +| **Generación de Videos** | Resúmenes en video cinemáticos vía Veo 3 (solo Ultra) | Disponible (NotebookLM es mejor aquí, mejorando activamente) | +| **Generación de Presentaciones** | Diapositivas más atractivas pero no 
editables | Crea presentaciones editables basadas en diapositivas | +| **Generación de Podcasts** | Resúmenes de audio con hosts e idiomas personalizables | Disponible con múltiples proveedores TTS (NotebookLM es mejor aquí, mejorando activamente) | +| **Extensión de Navegador** | No | Extensión multi-navegador para guardar cualquier página web, incluyendo páginas protegidas por autenticación |
Lista completa de Fuentes Externas diff --git a/README.hi.md b/README.hi.md index 011dbf5db..b49bddc72 100644 --- a/README.hi.md +++ b/README.hi.md @@ -21,9 +21,28 @@
# SurfSense -किसी भी LLM को अपने आंतरिक ज्ञान स्रोतों से जोड़ें और अपनी टीम के साथ रीयल-टाइम में चैट करें। NotebookLM, Perplexity और Glean का ओपन सोर्स विकल्प। -SurfSense एक अत्यधिक अनुकूलन योग्य AI शोध एजेंट है, जो बाहरी स्रोतों से जुड़ा है जैसे सर्च इंजन (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian और भी बहुत कुछ आने वाला है। +NotebookLM वहाँ उपलब्ध सबसे अच्छे और सबसे उपयोगी AI प्लेटफ़ॉर्म में से एक है, लेकिन जब आप इसे नियमित रूप से उपयोग करना शुरू करते हैं तो आप इसकी सीमाओं को भी महसूस करते हैं जो कुछ और की चाह छोड़ती हैं। + +1. एक notebook में जोड़े जा सकने वाले स्रोतों की मात्रा पर सीमाएं हैं। +2. आपके पास कितने notebooks हो सकते हैं इस पर सीमाएं हैं। +3. आपके पास ऐसे स्रोत नहीं हो सकते जो 500,000 शब्दों और 200MB से अधिक हों। +4. आप Google सेवाओं (LLMs, उपयोग मॉडल, आदि) में बंद हैं और उन्हें कॉन्फ़िगर करने का कोई विकल्प नहीं है। +5. सीमित बाहरी डेटा स्रोत और सेवा एकीकरण। +6. NotebookLM एजेंट विशेष रूप से केवल अध्ययन और शोध के लिए अनुकूलित है, लेकिन आप स्रोत डेटा के साथ और भी बहुत कुछ कर सकते हैं। +7. 
मल्टीप्लेयर सपोर्ट की कमी। + +...और भी बहुत कुछ। + +**SurfSense विशेष रूप से इन समस्याओं को हल करने के लिए बनाया गया है।** SurfSense आपको सक्षम बनाता है: + +- **अपने डेटा प्रवाह को नियंत्रित करें** - अपने डेटा को निजी और सुरक्षित रखें। +- **कोई डेटा सीमा नहीं** - असीमित मात्रा में स्रोत और notebooks जोड़ें। +- **कोई विक्रेता लॉक-इन नहीं** - किसी भी LLM, इमेज, TTS और STT मॉडल को कॉन्फ़िगर करें। +- **25+ बाहरी डेटा स्रोत** - Google Drive, OneDrive, Dropbox, Notion और कई अन्य बाहरी सेवाओं से अपने स्रोत जोड़ें। +- **रीयल-टाइम मल्टीप्लेयर सपोर्ट** - एक साझा notebook में अपनी टीम के सदस्यों के साथ आसानी से काम करें। + +...और भी बहुत कुछ आने वाला है। @@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1 ## वीडियो एजेंट नमूना -https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562 +https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a @@ -133,24 +152,29 @@ Docker Compose, मैनुअल इंस्टॉलेशन और अन

रीयल-टाइम कमेंट्स

-## प्रमुख विशेषताएं +## SurfSense vs Google NotebookLM -| विशेषता | विवरण | -|----------|--------| -| OSS विकल्प | रीयल-टाइम टीम सहयोग के साथ NotebookLM, Perplexity और Glean का सीधा प्रतिस्थापन | -| 50+ फ़ाइल फ़ॉर्मेट | LlamaCloud, Unstructured या Docling (लोकल) के माध्यम से दस्तावेज़, चित्र, वीडियो अपलोड करें | -| हाइब्रिड सर्च | हायरार्किकल इंडाइसेस और Reciprocal Rank Fusion के साथ सिमैंटिक + फुल टेक्स्ट सर्च | -| उद्धृत उत्तर | अपने ज्ञान आधार के साथ चैट करें और Perplexity शैली के उद्धृत उत्तर पाएं | -| डीप एजेंट आर्किटेक्चर | [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) द्वारा संचालित, योजना, सब-एजेंट और फ़ाइल सिस्टम एक्सेस | -| यूनिवर्सल LLM सपोर्ट | 100+ LLMs, 6000+ एम्बेडिंग मॉडल, सभी प्रमुख रीरैंकर्स OpenAI spec और LiteLLM के माध्यम से | -| प्राइवेसी फर्स्ट | पूर्ण लोकल LLM सपोर्ट (vLLM, Ollama) आपका डेटा आपका रहता है | -| टीम सहयोग | मालिक / एडमिन / संपादक / दर्शक भूमिकाओं के साथ RBAC, रीयल-टाइम चैट और कमेंट थ्रेड | -| वीडियो जनरेशन | नैरेशन और विज़ुअल के साथ वीडियो बनाएं | -| प्रेजेंटेशन जनरेशन | संपादन योग्य, स्लाइड आधारित प्रेजेंटेशन बनाएं | -| पॉडकास्ट जनरेशन | 20 सेकंड से कम में 3 मिनट का पॉडकास्ट; कई TTS प्रदाता (OpenAI, Azure, Kokoro) | -| ब्राउज़र एक्सटेंशन | किसी भी वेबपेज को सहेजने के लिए क्रॉस-ब्राउज़र एक्सटेंशन, प्रमाणीकरण सुरक्षित पेज सहित | -| 27+ कनेक्टर्स | सर्च इंजन, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord और [अधिक](#बाहरी-स्रोत) | -| सेल्फ-होस्ट करने योग्य | ओपन सोर्स, Docker एक कमांड या प्रोडक्शन के लिए पूर्ण Docker Compose | +| विशेषता | Google NotebookLM | SurfSense | +|---------|-------------------|-----------| +| **प्रति Notebook स्रोत** | 50 (मुफ़्त) से 600 (Ultra, $249.99/माह) | असीमित | +| **Notebooks की संख्या** | 100 (मुफ़्त) से 500 (सशुल्क योजनाएं) | असीमित | +| **स्रोत आकार सीमा** | 500,000 शब्द / 200MB प्रति स्रोत | कोई सीमा नहीं | +| **मूल्य निर्धारण** | मुफ़्त स्तर उपलब्ध; Pro $19.99/माह, Ultra $249.99/माह | मुफ़्त और ओपन सोर्स, अपनी इंफ्रा पर 
सेल्फ-होस्ट करें | +| **LLM सपोर्ट** | केवल Google Gemini | 100+ LLMs OpenAI spec और LiteLLM के माध्यम से | +| **एम्बेडिंग मॉडल** | केवल Google | 6,000+ एम्बेडिंग मॉडल, सभी प्रमुख रीरैंकर्स | +| **लोकल / प्राइवेट LLMs** | उपलब्ध नहीं | पूर्ण सपोर्ट (vLLM, Ollama) - आपका डेटा आपका रहता है | +| **सेल्फ-होस्ट करने योग्य** | नहीं | हाँ - Docker एक कमांड या पूर्ण Docker Compose | +| **ओपन सोर्स** | नहीं | हाँ | +| **बाहरी कनेक्टर्स** | Google Drive, YouTube, वेबसाइटें | 27+ कनेक्टर्स - सर्च इंजन, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord और [अधिक](#बाहरी-स्रोत) | +| **फ़ाइल फ़ॉर्मेट सपोर्ट** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, इमेज, वेब URLs, YouTube | 50+ फ़ॉर्मेट - दस्तावेज़, इमेज, वीडियो LlamaCloud, Unstructured या Docling (लोकल) के माध्यम से | +| **सर्च** | सिमैंटिक सर्च | हाइब्रिड सर्च - हायरार्किकल इंडाइसेस और Reciprocal Rank Fusion के साथ सिमैंटिक + फुल टेक्स्ट | +| **उद्धृत उत्तर** | हाँ | हाँ - Perplexity शैली के उद्धृत उत्तर | +| **एजेंट आर्किटेक्चर** | नहीं | हाँ - [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) द्वारा संचालित, योजना, सब-एजेंट और फ़ाइल सिस्टम एक्सेस | +| **रीयल-टाइम मल्टीप्लेयर** | दर्शक/संपादक भूमिकाओं के साथ साझा notebooks (कोई रीयल-टाइम चैट नहीं) | मालिक / एडमिन / संपादक / दर्शक भूमिकाओं के साथ RBAC, रीयल-टाइम चैट और कमेंट थ्रेड | +| **वीडियो जनरेशन** | Veo 3 के माध्यम से सिनेमैटिक वीडियो ओवरव्यू (केवल Ultra) | उपलब्ध (NotebookLM यहाँ बेहतर है, सक्रिय रूप से सुधार हो रहा है) | +| **प्रेजेंटेशन जनरेशन** | बेहतर दिखने वाली स्लाइड्स लेकिन संपादन योग्य नहीं | संपादन योग्य, स्लाइड आधारित प्रेजेंटेशन बनाएं | +| **पॉडकास्ट जनरेशन** | कस्टमाइज़ेबल होस्ट और भाषाओं के साथ ऑडियो ओवरव्यू | कई TTS प्रदाताओं के साथ उपलब्ध (NotebookLM यहाँ बेहतर है, सक्रिय रूप से सुधार हो रहा है) | +| **ब्राउज़र एक्सटेंशन** | नहीं | किसी भी वेबपेज को सहेजने के लिए क्रॉस-ब्राउज़र एक्सटेंशन, प्रमाणीकरण सुरक्षित पेज सहित |
बाहरी स्रोतों की पूरी सूची diff --git a/README.md b/README.md index f007fd43c..12ea4912a 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,28 @@
# SurfSense -Connect any LLM to your internal knowledge sources and chat with it in real time alongside your team. OSS alternative to NotebookLM, Perplexity, and Glean. -SurfSense is a highly customizable AI research agent, connected to external sources such as Search Engines (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian and more to come. +NotebookLM is one of the best and most useful AI platforms out there, but once you start using it regularly you also feel its limitations, which leave something to be desired. + +1. There are limits on the number of sources you can add in a notebook. +2. There are limits on the number of notebooks you can have. +3. You cannot have sources that exceed 500,000 words or 200MB. +4. You are locked in to Google services (LLMs, usage models, etc.) with no option to configure them. +5. Limited external data sources and service integrations. +6. NotebookLM Agent is specifically optimised for just studying and researching, but you can do so much more with the source data. +7. Lack of multiplayer support. + +...and more. + +**SurfSense is specifically made to solve these problems.** SurfSense empowers you to: + +- **Control Your Data Flow** - Keep your data private and secure. +- **No Data Limits** - Add an unlimited number of sources and notebooks. +- **No Vendor Lock-in** - Configure whichever LLM, image, TTS, and STT models you want to use. +- **25+ External Data Sources** - Add your sources from Google Drive, OneDrive, Dropbox, Notion, and many other external services. +- **Real-Time Multiplayer Support** - Work easily with your team members in a shared notebook. + +...and more to come. @@ -134,24 +153,29 @@ For Docker Compose, manual installation, and other deployment options, see the [

Realtime Comments

-## Key Features +## SurfSense vs Google NotebookLM -| Feature | Description | -|---------|-------------| -| OSS Alternative | Drop in replacement for NotebookLM, Perplexity, and Glean with real time team collaboration | -| 50+ File Formats | Upload documents, images, videos via LlamaCloud, Unstructured, or Docling (local) | -| Hybrid Search | Semantic + Full Text Search with Hierarchical Indices and Reciprocal Rank Fusion | -| Cited Answers | Chat with your knowledge base and get Perplexity style cited responses | -| Deep Agent Architecture | Powered by [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) planning, subagents, and file system access | -| Universal LLM Support | 100+ LLMs, 6000+ embedding models, all major rerankers via OpenAI spec & LiteLLM | -| Privacy First | Full local LLM support (vLLM, Ollama) your data stays yours | -| Team Collaboration | RBAC with Owner / Admin / Editor / Viewer roles, real time chat & comment threads | -| Video Generation | Generate videos with narration and visuals | -| Presentation Generation | Create editable, slide based presentations | -| Podcast Generation | 3 min podcast in under 20 seconds; multiple TTS providers (OpenAI, Azure, Kokoro) | -| Browser Extension | Cross browser extension to save any webpage, including auth protected pages | -| 27+ Connectors | Search Engines, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord & [more](#external-sources) | -| Self Hostable | Open source, Docker one liner or full Docker Compose for production | +| Feature | Google NotebookLM | SurfSense | +|---------|-------------------|-----------| +| **Sources per Notebook** | 50 (Free) to 600 (Ultra, $249.99/mo) | Unlimited | +| **Number of Notebooks** | 100 (Free) to 500 (paid tiers) | Unlimited | +| **Source Size Limit** | 500,000 words / 200MB per source | No limit | +| **Pricing** | Free tier available; Pro $19.99/mo, Ultra $249.99/mo | Free and open source, self-host on 
your own infra | +| **LLM Support** | Google Gemini only | 100+ LLMs via OpenAI spec & LiteLLM | +| **Embedding Models** | Google only | 6,000+ embedding models, all major rerankers | +| **Local / Private LLMs** | Not available | Full support (vLLM, Ollama) - your data stays yours | +| **Self Hostable** | No | Yes - Docker one-liner or full Docker Compose | +| **Open Source** | No | Yes | +| **External Connectors** | Google Drive, YouTube, websites | 27+ connectors - Search Engines, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord & [more](#external-sources) | +| **File Format Support** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, images, web URLs, YouTube | 50+ formats - documents, images, videos via LlamaCloud, Unstructured, or Docling (local) | +| **Search** | Semantic search | Hybrid Search - Semantic + Full Text with Hierarchical Indices & Reciprocal Rank Fusion | +| **Cited Answers** | Yes | Yes - Perplexity-style cited responses | +| **Agentic Architecture** | No | Yes - powered by [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) with planning, subagents, and file system access | +| **Real-Time Multiplayer** | Shared notebooks with Viewer/Editor roles (no real-time chat) | RBAC with Owner / Admin / Editor / Viewer roles, real-time chat & comment threads | +| **Video Generation** | Cinematic Video Overviews via Veo 3 (Ultra only) | Available (NotebookLM is better here, actively improving) | +| **Presentation Generation** | Better looking slides but not editable | Create editable, slide-based presentations | +| **Podcast Generation** | Audio Overviews with customizable hosts and languages | Available with multiple TTS providers (NotebookLM is better here, actively improving) | +| **Browser Extension** | No | Cross-browser extension to save any webpage, including auth-protected pages |
Full list of External Sources diff --git a/README.pt-BR.md b/README.pt-BR.md index 4306b0767..50a8b739e 100644 --- a/README.pt-BR.md +++ b/README.pt-BR.md @@ -21,9 +21,28 @@
# SurfSense -Conecte qualquer LLM às suas fontes de conhecimento internas e converse com ele em tempo real junto com sua equipe. Alternativa de código aberto ao NotebookLM, Perplexity e Glean. -SurfSense é um agente de pesquisa de IA altamente personalizável, conectado a fontes externas como mecanismos de busca (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian e mais por vir. +O NotebookLM é uma das melhores e mais úteis plataformas de IA disponíveis, mas quando você começa a usá-lo regularmente também sente suas limitações deixando algo a desejar. + +1. Há limites na quantidade de fontes que você pode adicionar em um notebook. +2. Há limites no número de notebooks que você pode ter. +3. Você não pode ter fontes que excedam 500.000 palavras e mais de 200MB. +4. Você fica preso aos serviços do Google (LLMs, modelos de uso, etc.) sem opção de configurá-los. +5. Fontes de dados externas e integrações de serviços limitadas. +6. O agente do NotebookLM é especificamente otimizado apenas para estudar e pesquisar, mas você pode fazer muito mais com os dados de origem. +7. Falta de suporte multiplayer. + +...e mais. + +**O SurfSense foi feito especificamente para resolver esses problemas.** O SurfSense permite que você: + +- **Controle Seu Fluxo de Dados** - Mantenha seus dados privados e seguros. +- **Sem Limites de Dados** - Adicione uma quantidade ilimitada de fontes e notebooks. +- **Sem Dependência de Fornecedor** - Configure qualquer modelo LLM, de imagem, TTS e STT. +- **25+ Fontes de Dados Externas** - Adicione suas fontes do Google Drive, OneDrive, Dropbox, Notion e muitos outros serviços externos. +- **Suporte Multiplayer em Tempo Real** - Trabalhe facilmente com os membros da sua equipe em um notebook compartilhado. + +...e mais por vir. 
@@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1 ## Exemplo de Agente de Vídeo -https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562 +https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a @@ -133,24 +152,29 @@ Para Docker Compose, instalação manual e outras opções de implantação, con

Comentários em Tempo Real

-## Funcionalidades Principais +## SurfSense vs Google NotebookLM -| Funcionalidade | Descrição | -|----------------|-----------| -| Alternativa OSS | Substituto direto do NotebookLM, Perplexity e Glean com colaboração em equipe em tempo real | -| 50+ Formatos de Arquivo | Faça upload de documentos, imagens, vídeos via LlamaCloud, Unstructured ou Docling (local) | -| Busca Híbrida | Semântica + Texto completo com Índices Hierárquicos e Reciprocal Rank Fusion | -| Respostas com Citações | Converse com sua base de conhecimento e obtenha respostas citadas no estilo Perplexity | -| Arquitetura de Agentes Profundos | Alimentado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) com planejamento, subagentes e acesso ao sistema de arquivos | -| Suporte Universal de LLM | 100+ LLMs, 6000+ modelos de embeddings, todos os principais rerankers via OpenAI spec e LiteLLM | -| Privacidade em Primeiro Lugar | Suporte completo a LLM local (vLLM, Ollama) seus dados ficam com você | -| Colaboração em Equipe | RBAC com papéis de Proprietário / Admin / Editor / Visualizador, chat em tempo real e threads de comentários | -| Geração de Vídeos | Gera vídeos com narração e visuais | -| Geração de Apresentações | Cria apresentações editáveis baseadas em slides | -| Geração de Podcasts | Podcast de 3 min em menos de 20 segundos; múltiplos provedores TTS (OpenAI, Azure, Kokoro) | -| Extensão de Navegador | Extensão multi-navegador para salvar qualquer página web, incluindo páginas protegidas por autenticação | -| 27+ Conectores | Mecanismos de busca, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord e [mais](#fontes-externas) | -| Auto-Hospedável | Código aberto, Docker em um único comando ou Docker Compose completo para produção | +| Recurso | Google NotebookLM | SurfSense | +|---------|-------------------|-----------| +| **Fontes por Notebook** | 50 (Grátis) a 600 (Ultra, $249.99/mês) | Ilimitadas | +| **Número de Notebooks** | 
100 (Grátis) a 500 (planos pagos) | Ilimitados | +| **Limite de Tamanho da Fonte** | 500.000 palavras / 200MB por fonte | Sem limite | +| **Preços** | Nível gratuito disponível; Pro $19.99/mês, Ultra $249.99/mês | Gratuito e de código aberto, auto-hospedável na sua própria infra | +| **Suporte a LLM** | Apenas Google Gemini | 100+ LLMs via OpenAI spec e LiteLLM | +| **Modelos de Embeddings** | Apenas Google | 6.000+ modelos de embeddings, todos os principais rerankers | +| **LLMs Locais / Privados** | Não disponível | Suporte completo (vLLM, Ollama) - seus dados ficam com você | +| **Auto-Hospedável** | Não | Sim - Docker em um único comando ou Docker Compose completo | +| **Código Aberto** | Não | Sim | +| **Conectores Externos** | Google Drive, YouTube, sites | 27+ conectores - Mecanismos de busca, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord e [mais](#fontes-externas) | +| **Suporte a Formatos de Arquivo** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, imagens, URLs web, YouTube | 50+ formatos - documentos, imagens, vídeos via LlamaCloud, Unstructured ou Docling (local) | +| **Busca** | Busca semântica | Busca Híbrida - Semântica + Texto completo com Índices Hierárquicos e Reciprocal Rank Fusion | +| **Respostas com Citações** | Sim | Sim - Respostas citadas no estilo Perplexity | +| **Arquitetura de Agentes** | Não | Sim - alimentado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) com planejamento, subagentes e acesso ao sistema de arquivos | +| **Multiplayer em Tempo Real** | Notebooks compartilhados com papéis de Visualizador/Editor (sem chat em tempo real) | RBAC com papéis de Proprietário / Admin / Editor / Visualizador, chat em tempo real e threads de comentários | +| **Geração de Vídeos** | Visões gerais cinemáticas via Veo 3 (apenas Ultra) | Disponível (NotebookLM é melhor aqui, melhorando ativamente) | +| **Geração de Apresentações** | Slides mais bonitos mas não editáveis | Cria 
apresentações editáveis baseadas em slides | +| **Geração de Podcasts** | Visões gerais em áudio com hosts e idiomas personalizáveis | Disponível com múltiplos provedores TTS (NotebookLM é melhor aqui, melhorando ativamente) | +| **Extensão de Navegador** | Não | Extensão multi-navegador para salvar qualquer página web, incluindo páginas protegidas por autenticação |
Lista completa de Fontes Externas diff --git a/README.zh-CN.md b/README.zh-CN.md index 96ebb25ad..419a831ae 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -21,9 +21,28 @@
# SurfSense -将任何 LLM 连接到您的内部知识源,并与团队成员实时聊天。NotebookLM、Perplexity 和 Glean 的开源替代方案。 -SurfSense 是一个高度可定制的 AI 研究助手,可以连接外部数据源,如搜索引擎(SearxNG、Tavily、LinkUp)、Google Drive、OneDrive、Dropbox、Slack、Microsoft Teams、Linear、Jira、ClickUp、Confluence、BookStack、Gmail、Notion、YouTube、GitHub、Discord、Airtable、Google Calendar、Luma、Circleback、Elasticsearch、Obsidian 等,未来还会支持更多。 +NotebookLM 是目前最好、最实用的 AI 平台之一,但当你开始经常使用它时,你也会感受到它的局限性,总觉得还有不足之处。 + +1. 一个笔记本中可以添加的来源数量有限制。 +2. 可以拥有的笔记本数量有限制。 +3. 来源不能超过 500,000 个单词和 200MB。 +4. 你被锁定在 Google 服务中(LLM、使用模型等),没有配置选项。 +5. 有限的外部数据源和服务集成。 +6. NotebookLM 代理专门针对学习和研究进行了优化,但你可以用源数据做更多事情。 +7. 缺乏多人协作支持。 + +...还有更多。 + +**SurfSense 正是为了解决这些问题而生。** SurfSense 赋予你: + +- **控制你的数据流** - 保持数据私密和安全。 +- **无数据限制** - 添加无限数量的来源和笔记本。 +- **无供应商锁定** - 配置任何 LLM、图像、TTS 和 STT 模型。 +- **25+ 外部数据源** - 从 Google Drive、OneDrive、Dropbox、Notion 和许多其他外部服务添加你的来源。 +- **实时多人协作支持** - 在共享笔记本中轻松与团队成员协作。 + +...更多功能即将推出。 @@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1 ## 视频代理示例 -https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562 +https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a @@ -133,24 +152,29 @@ irm https://raw.githubusercontent.com/MODSetter/SurfSense/main/docker/scripts/in

实时评论

-## 核心功能 +## SurfSense vs Google NotebookLM -| 功能 | 描述 | -|------|------| -| 开源替代方案 | 支持实时团队协作的 NotebookLM、Perplexity 和 Glean 替代品 | -| 50+ 文件格式 | 通过 LlamaCloud、Unstructured 或 Docling(本地)上传文档、图像、视频 | -| 混合搜索 | 语义搜索 + 全文搜索,结合层次化索引和倒数排名融合 | -| 引用回答 | 与知识库对话,获得 Perplexity 风格的引用回答 | -| 深度代理架构 | 基于 [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) 构建,支持规划、子代理和文件系统访问 | -| 通用 LLM 支持 | 100+ LLM、6000+ 嵌入模型、所有主流重排序器,通过 OpenAI spec 和 LiteLLM | -| 隐私优先 | 完整本地 LLM 支持(vLLM、Ollama),您的数据由您掌控 | -| 团队协作 | RBAC 角色控制(所有者/管理员/编辑者/查看者),实时聊天和评论线程 | -| 视频生成 | 生成带有旁白和视觉效果的视频 | -| 演示文稿生成 | 创建可编辑的幻灯片式演示文稿 | -| 播客生成 | 20 秒内生成 3 分钟播客;多种 TTS 提供商(OpenAI、Azure、Kokoro) | -| 浏览器扩展 | 跨浏览器扩展,保存任何网页,包括需要身份验证的页面 | -| 27+ 连接器 | 搜索引擎、Google Drive、OneDrive、Dropbox、Slack、Teams、Jira、Notion、GitHub、Discord 等[更多](#外部数据源) | -| 可自托管 | 开源,Docker 一行命令或完整 Docker Compose 用于生产环境 | +| 功能 | Google NotebookLM | SurfSense | +|---------|-------------------|-----------| +| **每个笔记本的来源数** | 50(免费)到 600(Ultra,$249.99/月) | 无限制 | +| **笔记本数量** | 100(免费)到 500(付费方案) | 无限制 | +| **来源大小限制** | 500,000 词 / 200MB 每个来源 | 无限制 | +| **定价** | 免费版可用;Pro $19.99/月,Ultra $249.99/月 | 免费开源,在自己的基础设施上自托管 | +| **LLM 支持** | 仅 Google Gemini | 100+ LLM,通过 OpenAI spec 和 LiteLLM | +| **嵌入模型** | 仅 Google | 6,000+ 嵌入模型,所有主流重排序器 | +| **本地 / 私有 LLM** | 不可用 | 完整支持(vLLM、Ollama)- 您的数据由您掌控 | +| **可自托管** | 否 | 是 - Docker 一行命令或完整 Docker Compose | +| **开源** | 否 | 是 | +| **外部连接器** | Google Drive、YouTube、网站 | 27+ 连接器 - 搜索引擎、Google Drive、OneDrive、Dropbox、Slack、Teams、Jira、Notion、GitHub、Discord 等[更多](#外部数据源) | +| **文件格式支持** | PDF、Docs、Slides、Sheets、CSV、Word、EPUB、图像、网页 URL、YouTube | 50+ 格式 - 文档、图像、视频,通过 LlamaCloud、Unstructured 或 Docling(本地) | +| **搜索** | 语义搜索 | 混合搜索 - 语义 + 全文搜索,结合层次化索引和倒数排名融合 | +| **引用回答** | 是 | 是 - Perplexity 风格的引用回答 | +| **代理架构** | 否 | 是 - 基于 [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) 构建,支持规划、子代理和文件系统访问 | +| **实时多人协作** | 共享笔记本,支持查看者/编辑者角色(无实时聊天) | RBAC 
角色控制(所有者/管理员/编辑者/查看者),实时聊天和评论线程 | +| **视频生成** | 通过 Veo 3 的电影级视频概览(仅 Ultra) | 可用(NotebookLM 在此方面更好,正在积极改进) | +| **演示文稿生成** | 更美观的幻灯片但不可编辑 | 创建可编辑的幻灯片式演示文稿 | +| **播客生成** | 可自定义主持人和语言的音频概览 | 可用,支持多种 TTS 提供商(NotebookLM 在此方面更好,正在积极改进) | +| **浏览器扩展** | 否 | 跨浏览器扩展,保存任何网页,包括需要身份验证的页面 |
外部数据源完整列表 From e13ca675d97c7308ed77b754fdcc97afc393dd59 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Thu, 2 Apr 2026 20:26:34 -0700 Subject: [PATCH 066/202] chore: optimize zero publication column migration process - Updated migration instructions to emphasize the importance of stopping zero-cache before and after running the migration. - Added a function to terminate blocked PIDs that could interfere with the migration. - Set a lock timeout to prevent deadlocks during the migration process. --- ..._optimize_zero_publication_column_lists.py | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py index 3c2d34c76..78a26a381 100644 --- a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py +++ b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py @@ -11,10 +11,11 @@ to FULL for the old Electric SQL setup (migration 66/75/76). With DEFAULT (primary-key) identity, column-list publications only need to include the PK — not every column. -After running this migration you MUST: - 1. Stop zero-cache - 2. Delete / reset the zero-cache data volume - 3. Restart zero-cache (it will do a fresh initial sync) +IMPORTANT — before AND after running this migration: + 1. Stop zero-cache (it holds replication locks that will deadlock DDL) + 2. Run: alembic upgrade head + 3. Delete / reset the zero-cache data volume + 4. 
Restart zero-cache (it will do a fresh initial sync) Revision ID: 117 Revises: 116 @@ -62,9 +63,29 @@ CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE """ +def _terminate_blocked_pids(conn, table: str) -> None: + """Kill backends whose locks on *table* would block our AccessExclusiveLock.""" + conn.execute( + sa.text( + "SELECT pg_terminate_backend(l.pid) " + "FROM pg_locks l " + "JOIN pg_class c ON c.oid = l.relation " + "WHERE c.relname = :tbl " + " AND l.pid != pg_backend_pid()" + ), + {"tbl": table}, + ) + + def upgrade() -> None: conn = op.get_bind() + conn.execute(sa.text("SET lock_timeout = '10s'")) + + for tbl in sorted(TABLES_WITH_FULL_IDENTITY): + _terminate_blocked_pids(conn, tbl) + conn.execute(sa.text(f'LOCK TABLE "{tbl}" IN ACCESS EXCLUSIVE MODE')) + for tbl in TABLES_WITH_FULL_IDENTITY: conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY DEFAULT')) From 6b06d3abb1ce5674339db4d18d52a49f5cc9eb76 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Thu, 2 Apr 2026 20:38:10 -0700 Subject: [PATCH 067/202] chore: optimize zero publication column migration process - Updated migration instructions to emphasize the importance of stopping zero-cache before and after running the migration. - Added a function to terminate blocked PIDs that could interfere with the migration. - Set a lock timeout to prevent deadlocks during the migration process. --- ..._optimize_zero_publication_column_lists.py | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py index 3c2d34c76..78a26a381 100644 --- a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py +++ b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py @@ -11,10 +11,11 @@ to FULL for the old Electric SQL setup (migration 66/75/76). 
With DEFAULT (primary-key) identity, column-list publications only need to include the PK — not every column. -After running this migration you MUST: - 1. Stop zero-cache - 2. Delete / reset the zero-cache data volume - 3. Restart zero-cache (it will do a fresh initial sync) +IMPORTANT — before AND after running this migration: + 1. Stop zero-cache (it holds replication locks that will deadlock DDL) + 2. Run: alembic upgrade head + 3. Delete / reset the zero-cache data volume + 4. Restart zero-cache (it will do a fresh initial sync) Revision ID: 117 Revises: 116 @@ -62,9 +63,29 @@ CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE """ +def _terminate_blocked_pids(conn, table: str) -> None: + """Kill backends whose locks on *table* would block our AccessExclusiveLock.""" + conn.execute( + sa.text( + "SELECT pg_terminate_backend(l.pid) " + "FROM pg_locks l " + "JOIN pg_class c ON c.oid = l.relation " + "WHERE c.relname = :tbl " + " AND l.pid != pg_backend_pid()" + ), + {"tbl": table}, + ) + + def upgrade() -> None: conn = op.get_bind() + conn.execute(sa.text("SET lock_timeout = '10s'")) + + for tbl in sorted(TABLES_WITH_FULL_IDENTITY): + _terminate_blocked_pids(conn, tbl) + conn.execute(sa.text(f'LOCK TABLE "{tbl}" IN ACCESS EXCLUSIVE MODE')) + for tbl in TABLES_WITH_FULL_IDENTITY: conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY DEFAULT')) From 9a65163fe4d89d35e6891015c1b46b555f11f321 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 09:20:44 +0530 Subject: [PATCH 068/202] feat: improve DocumentUploadTab UI with updated styles, enhanced file size limit messages --- .../components/sources/DocumentUploadTab.tsx | 89 ++++++++++--------- surfsense_web/messages/en.json | 4 +- surfsense_web/messages/es.json | 4 +- surfsense_web/messages/hi.json | 4 +- surfsense_web/messages/pt.json | 4 +- surfsense_web/messages/zh.json | 4 +- 6 files changed, 55 insertions(+), 54 deletions(-) diff --git 
a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 7176afae5..9a32e5a59 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -1,7 +1,7 @@ "use client"; import { useAtom } from "jotai"; -import { CheckCircle2, ChevronDown, File as FileIcon, FileType, FolderOpen, Plus, Upload, X } from "lucide-react"; +import { ChevronDown, Dot, File as FileIcon, FolderOpen, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; import { useCallback, useMemo, useRef, useState } from "react"; @@ -22,7 +22,6 @@ import { DropdownMenuItem, DropdownMenuTrigger, } from "@/components/ui/dropdown-menu"; -import { Label } from "@/components/ui/label"; import { Progress } from "@/components/ui/progress"; import { Spinner } from "@/components/ui/spinner"; import { Switch } from "@/components/ui/switch"; @@ -362,12 +361,12 @@ export function DocumentUploadTab({ return ( e.stopPropagation()}> - - e.stopPropagation()}> + e.stopPropagation()}> Files @@ -416,32 +415,31 @@ export function DocumentUploadTab({ {renderBrowseButton({ compact: true, fullWidth: true })}
) : ( - + ) ) ) : (
{ if (!isElectron) fileInputRef.current?.click(); }} > - -
-

+ +

+

{isElectron ? "Select files or folder" : "Tap to select files"}

-

- {t("file_size_limit")}{" "} - {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} +

+ {t("file_size_limit")} + + {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })}

{isElectron && ( @@ -491,9 +489,10 @@ export function DocumentUploadTab({

{t("drag_drop")}

-

- {t("file_size_limit")}{" "} - {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} +

+ {t("file_size_limit")} + + {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })}

{renderBrowseButton()}
@@ -520,28 +519,29 @@ export function DocumentUploadTab({
-
- - -
- -
-
-

Enable AI Summary

-

- Improves search quality but adds latency -

+
+
+
+

Watch folder

+

+ Auto-sync when files change +

+
+ +
+
+
+

Enable AI Summary

+

+ Improves search quality but adds latency +

+
+
-
- - - - Version History - + + + Version History - - + + ); } +export function VersionHistoryDialog({ + open, + onOpenChange, + documentId, +}: { + open: boolean; + onOpenChange: (open: boolean) => void; + documentId: number; +}) { + return ( + + + Version History + {open && } + + + ); +} + +function formatRelativeTime(dateStr: string): string { + const now = Date.now(); + const then = new Date(dateStr).getTime(); + const diffMs = now - then; + const diffMin = Math.floor(diffMs / 60_000); + if (diffMin < 1) return "Just now"; + if (diffMin < 60) return `${diffMin} minute${diffMin !== 1 ? "s" : ""} ago`; + const diffHr = Math.floor(diffMin / 60); + if (diffHr < 24) return `${diffHr} hour${diffHr !== 1 ? "s" : ""} ago`; + return new Date(dateStr).toLocaleDateString(undefined, { + weekday: "short", + month: "short", + day: "numeric", + year: "numeric", + hour: "numeric", + minute: "2-digit", + }); +} + function VersionHistoryPanel({ documentId }: { documentId: number }) { const [versions, setVersions] = useState([]); const [loading, setLoading] = useState(true); @@ -55,6 +100,7 @@ function VersionHistoryPanel({ documentId }: { documentId: number }) { const [versionContent, setVersionContent] = useState(""); const [contentLoading, setContentLoading] = useState(false); const [restoring, setRestoring] = useState(false); + const [copied, setCopied] = useState(false); const loadVersions = useCallback(async () => { setLoading(true); @@ -73,6 +119,7 @@ function VersionHistoryPanel({ documentId }: { documentId: number }) { }, [loadVersions]); const handleSelectVersion = async (versionNumber: number) => { + if (selectedVersion === versionNumber) return; setSelectedVersion(versionNumber); setContentLoading(true); try { @@ -101,9 +148,15 @@ function VersionHistoryPanel({ documentId }: { documentId: number }) { } }; + const handleCopy = () => { + navigator.clipboard.writeText(versionContent); + setCopied(true); + setTimeout(() => setCopied(false), 2000); + }; + if (loading) { 
return ( -
+
); @@ -111,75 +164,111 @@ function VersionHistoryPanel({ documentId }: { documentId: number }) { if (versions.length === 0) { return ( -
- +

No version history available yet.

Versions are created when file content changes.

); } - return ( -
-
- {versions.map((v) => ( -
handleSelectVersion(v.version_number)} - > -
-
-

Version {v.version_number}

- {v.created_at && ( -

- {new Date(v.created_at).toLocaleString()} -

- )} - {v.title && ( -

- {v.title} -

- )} -
- -
-
- ))} -
+ const selectedVersionData = versions.find((v) => v.version_number === selectedVersion); - {selectedVersion !== null && ( -
-

- Preview — Version {selectedVersion} -

- {contentLoading ? ( -
- -
- ) : ( -
-							{versionContent || "(empty)"}
-						
- )} + return ( + <> + {/* Left panel — version list */} +
+
+
+ {versions.map((v) => ( + + ))} +
+
+ + + {/* Right panel — content preview */} +
+ {selectedVersion !== null && selectedVersionData ? ( + <> +
+

+ {selectedVersionData.title || `Version ${selectedVersion}`} +

+
+ + +
+
+ +
+ {contentLoading ? ( +
+ +
+ ) : ( +
+									{versionContent || "(empty)"}
+								
+ )} +
+ + ) : ( +
+

Select a version to preview

+
+ )} +
+ ); } diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 7aa518361..031390c9e 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -21,6 +21,7 @@ import type { DocumentNodeDoc } from "@/components/documents/DocumentNode"; import type { FolderDisplay } from "@/components/documents/FolderNode"; import { FolderPickerDialog } from "@/components/documents/FolderPickerDialog"; import { FolderTreeView } from "@/components/documents/FolderTreeView"; +import { VersionHistoryDialog } from "@/components/documents/version-history"; import { JsonMetadataViewer } from "@/components/json-metadata-viewer"; import { EXPORT_FILE_EXTENSIONS } from "@/components/shared/ExportMenuItems"; import { @@ -579,6 +580,7 @@ export function DocumentsSidebar({ const [bulkDeleteConfirmOpen, setBulkDeleteConfirmOpen] = useState(false); const [isBulkDeleting, setIsBulkDeleting] = useState(false); + const [versionDocId, setVersionDocId] = useState(null); const handleBulkDeleteSelected = useCallback(async () => { if (deletableSelectedIds.length === 0) return; @@ -826,6 +828,7 @@ export function DocumentsSidebar({ onDeleteDocument={(doc) => handleDeleteDocument(doc.id)} onMoveDocument={handleMoveDocument} onExportDocument={handleExportDocument} + onVersionHistory={(doc) => setVersionDocId(doc.id)} activeTypes={activeTypes} onDropIntoFolder={handleDropIntoFolder} onReorderFolder={handleReorderFolder} @@ -850,6 +853,14 @@ export function DocumentsSidebar({ }} /> + {versionDocId !== null && ( + { if (!open) setVersionDocId(null); }} + documentId={versionDocId} + /> + )} + Date: Fri, 3 Apr 2026 10:56:43 +0530 Subject: [PATCH 074/202] fix: update version history messages for clarity by removing unnecessary punctuation --- surfsense_web/components/documents/version-history.tsx | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/surfsense_web/components/documents/version-history.tsx b/surfsense_web/components/documents/version-history.tsx index f438a7190..7aba92b47 100644 --- a/surfsense_web/components/documents/version-history.tsx +++ b/surfsense_web/components/documents/version-history.tsx @@ -165,8 +165,8 @@ function VersionHistoryPanel({ documentId }: { documentId: number }) { if (versions.length === 0) { return (
-

No version history available yet.

-

Versions are created when file content changes.

+

No version history available yet

+

Versions are created when file content changes

); } From 79f19b9bc637d13bf89a8c08ae2e815123093491 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 11:10:46 +0530 Subject: [PATCH 075/202] fix: adjust layout in DocumentsSidebar for improved UI responsiveness and interaction, including changes to button styles and positioning --- .../components/layout/ui/sidebar/DocumentsSidebar.tsx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 031390c9e..24f6666c9 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -784,12 +784,13 @@ export function DocumentsSidebar({ />
+
{deletableSelectedIds.length > 0 && ( -
+
)} - +
Date: Fri, 3 Apr 2026 11:19:54 +0530 Subject: [PATCH 076/202] feat: trigger document reindexing after restoring a document version to ensure content is up-to-date --- surfsense_backend/app/routes/documents_routes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index e6eed7836..083ed2b89 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1285,6 +1285,9 @@ async def restore_document_version( document.content_needs_reindexing = True await session.commit() + from app.tasks.celery_tasks.document_reindex_tasks import reindex_document_task + reindex_document_task.delay(document_id, str(user.id)) + return { "message": f"Restored version {version_number}", "document_id": document_id, From 3833084dad3444d7bf534a7b39dc62ebf24a89fd Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 11:45:53 +0530 Subject: [PATCH 077/202] feat: changed the revision number of folder alembic migration --- ...ing.py => 118_add_local_folder_sync_and_versioning.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename surfsense_backend/alembic/versions/{117_add_local_folder_sync_and_versioning.py => 118_add_local_folder_sync_and_versioning.py} (98%) diff --git a/surfsense_backend/alembic/versions/117_add_local_folder_sync_and_versioning.py b/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py similarity index 98% rename from surfsense_backend/alembic/versions/117_add_local_folder_sync_and_versioning.py rename to surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py index e322a608d..1fef9fbcb 100644 --- a/surfsense_backend/alembic/versions/117_add_local_folder_sync_and_versioning.py +++ b/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py @@ -1,7 +1,7 @@ """Add 
LOCAL_FOLDER_FILE document type, folder metadata, and document_versions table -Revision ID: 117 -Revises: 116 +Revision ID: 118 +Revises: 117 """ from collections.abc import Sequence @@ -10,8 +10,8 @@ import sqlalchemy as sa from alembic import op -revision: str = "117" -down_revision: str | None = "116" +revision: str = "118" +down_revision: str | None = "117" branch_labels: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None From 3621951f2aeceebaff70328ffe8a6c91e9bad83d Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Thu, 2 Apr 2026 23:21:57 -0700 Subject: [PATCH 078/202] perf: throttle scroll handlers with requestAnimationFrame Wrap scroll handlers in thread.tsx, InboxSidebar.tsx, and DocumentsTableShell.tsx with requestAnimationFrame batching so scroll position state updates fire at most once per animation frame instead of on every scroll event (up to 60/sec at 60fps). Add cleanup useEffect to cancel pending frames on unmount. 
Fixes #1103 --- .../(manage)/components/DocumentsTableShell.tsx | 12 +++++++++--- surfsense_web/components/assistant-ui/thread.tsx | 12 +++++++++--- .../components/layout/ui/sidebar/InboxSidebar.tsx | 12 +++++++++--- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index ceef9f2e1..dc8966571 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -267,12 +267,18 @@ export function DocumentsTableShell({ const [metadataJson, setMetadataJson] = useState | null>(null); const [metadataLoading, setMetadataLoading] = useState(false); const [previewScrollPos, setPreviewScrollPos] = useState<"top" | "middle" | "bottom">("top"); + const previewRafRef = useRef(); const handlePreviewScroll = useCallback((e: React.UIEvent) => { const el = e.currentTarget; - const atTop = el.scrollTop <= 2; - const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2; - setPreviewScrollPos(atTop ? "top" : atBottom ? "bottom" : "middle"); + if (previewRafRef.current) return; + previewRafRef.current = requestAnimationFrame(() => { + const atTop = el.scrollTop <= 2; + const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2; + setPreviewScrollPos(atTop ? "top" : atBottom ? 
"bottom" : "middle"); + previewRafRef.current = undefined; + }); }, []); + useEffect(() => () => { if (previewRafRef.current) cancelAnimationFrame(previewRafRef.current); }, []); const [deleteDoc, setDeleteDoc] = useState(null); const [isDeleting, setIsDeleting] = useState(false); diff --git a/surfsense_web/components/assistant-ui/thread.tsx b/surfsense_web/components/assistant-ui/thread.tsx index 0d0163d8a..0f230cec3 100644 --- a/surfsense_web/components/assistant-ui/thread.tsx +++ b/surfsense_web/components/assistant-ui/thread.tsx @@ -816,12 +816,18 @@ const ComposerAction: FC = ({ isBlockedByOtherUser = false const isDesktop = useMediaQuery("(min-width: 640px)"); const { openDialog: openUploadDialog } = useDocumentUploadDialog(); const [toolsScrollPos, setToolsScrollPos] = useState<"top" | "middle" | "bottom">("top"); + const toolsRafRef = useRef(); const handleToolsScroll = useCallback((e: React.UIEvent) => { const el = e.currentTarget; - const atTop = el.scrollTop <= 2; - const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2; - setToolsScrollPos(atTop ? "top" : atBottom ? "bottom" : "middle"); + if (toolsRafRef.current) return; + toolsRafRef.current = requestAnimationFrame(() => { + const atTop = el.scrollTop <= 2; + const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2; + setToolsScrollPos(atTop ? "top" : atBottom ? 
"bottom" : "middle"); + toolsRafRef.current = undefined; + }); }, []); + useEffect(() => () => { if (toolsRafRef.current) cancelAnimationFrame(toolsRafRef.current); }, []); const isComposerTextEmpty = useAuiState(({ composer }) => { const text = composer.text?.trim() || ""; return text.length === 0; diff --git a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx index 72400a589..4aa8d4c60 100644 --- a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx @@ -178,12 +178,18 @@ export function InboxSidebarContent({ const [mounted, setMounted] = useState(false); const [openDropdown, setOpenDropdown] = useState<"filter" | null>(null); const [connectorScrollPos, setConnectorScrollPos] = useState<"top" | "middle" | "bottom">("top"); + const connectorRafRef = useRef(); const handleConnectorScroll = useCallback((e: React.UIEvent) => { const el = e.currentTarget; - const atTop = el.scrollTop <= 2; - const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2; - setConnectorScrollPos(atTop ? "top" : atBottom ? "bottom" : "middle"); + if (connectorRafRef.current) return; + connectorRafRef.current = requestAnimationFrame(() => { + const atTop = el.scrollTop <= 2; + const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2; + setConnectorScrollPos(atTop ? "top" : atBottom ? 
"bottom" : "middle"); + connectorRafRef.current = undefined; + }); }, []); + useEffect(() => () => { if (connectorRafRef.current) cancelAnimationFrame(connectorRafRef.current); }, []); const [filterDrawerOpen, setFilterDrawerOpen] = useState(false); const [markingAsReadId, setMarkingAsReadId] = useState(null); From e38a0ff7c345cb83121f7983eec838cfdf579f66 Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Thu, 2 Apr 2026 23:43:19 -0700 Subject: [PATCH 079/202] style: format useEffect cleanup to satisfy biome --- .../documents/(manage)/components/DocumentsTableShell.tsx | 7 ++++++- surfsense_web/components/assistant-ui/thread.tsx | 7 ++++++- .../components/layout/ui/sidebar/InboxSidebar.tsx | 7 ++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index dc8966571..748fb1911 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -278,7 +278,12 @@ export function DocumentsTableShell({ previewRafRef.current = undefined; }); }, []); - useEffect(() => () => { if (previewRafRef.current) cancelAnimationFrame(previewRafRef.current); }, []); + useEffect( + () => () => { + if (previewRafRef.current) cancelAnimationFrame(previewRafRef.current); + }, + [] + ); const [deleteDoc, setDeleteDoc] = useState(null); const [isDeleting, setIsDeleting] = useState(false); diff --git a/surfsense_web/components/assistant-ui/thread.tsx b/surfsense_web/components/assistant-ui/thread.tsx index 0f230cec3..718bf3961 100644 --- a/surfsense_web/components/assistant-ui/thread.tsx +++ b/surfsense_web/components/assistant-ui/thread.tsx @@ -827,7 +827,12 @@ const ComposerAction: 
FC = ({ isBlockedByOtherUser = false toolsRafRef.current = undefined; }); }, []); - useEffect(() => () => { if (toolsRafRef.current) cancelAnimationFrame(toolsRafRef.current); }, []); + useEffect( + () => () => { + if (toolsRafRef.current) cancelAnimationFrame(toolsRafRef.current); + }, + [] + ); const isComposerTextEmpty = useAuiState(({ composer }) => { const text = composer.text?.trim() || ""; return text.length === 0; diff --git a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx index 4aa8d4c60..525b7cf74 100644 --- a/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/InboxSidebar.tsx @@ -189,7 +189,12 @@ export function InboxSidebarContent({ connectorRafRef.current = undefined; }); }, []); - useEffect(() => () => { if (connectorRafRef.current) cancelAnimationFrame(connectorRafRef.current); }, []); + useEffect( + () => () => { + if (connectorRafRef.current) cancelAnimationFrame(connectorRafRef.current); + }, + [] + ); const [filterDrawerOpen, setFilterDrawerOpen] = useState(false); const [markingAsReadId, setMarkingAsReadId] = useState(null); From b9b2bac16f89203e16b637ab12e3edb5ef3b4589 Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Thu, 2 Apr 2026 23:59:15 -0700 Subject: [PATCH 080/202] fix: clean up onboarding tour timer leaks Fix two timer cleanup bugs in onboarding-tour.tsx: 1. Remove cleanup return from useCallback (only works in useEffect). Clear retryTimerRef at the start of updateTarget and in a dedicated useEffect cleanup instead. 2. Track recursive setTimeout calls via startCheckTimerRef so they are properly cancelled on unmount instead of leaking. 
Fixes #1091 --- surfsense_web/components/onboarding-tour.tsx | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/surfsense_web/components/onboarding-tour.tsx b/surfsense_web/components/onboarding-tour.tsx index 1c52169cb..d762d9c15 100644 --- a/surfsense_web/components/onboarding-tour.tsx +++ b/surfsense_web/components/onboarding-tour.tsx @@ -429,6 +429,7 @@ export function OnboardingTour() { const pathname = usePathname(); const retryCountRef = useRef(0); const retryTimerRef = useRef | null>(null); + const startCheckTimerRef = useRef | null>(null); const maxRetries = 10; // Track previous user ID to detect user changes const previousUserIdRef = useRef(null); @@ -460,6 +461,7 @@ export function OnboardingTour() { // Find and track target element with retry logic const updateTarget = useCallback(() => { + if (retryTimerRef.current) clearTimeout(retryTimerRef.current); if (!currentStep) return; const el = document.querySelector(currentStep.target); @@ -480,11 +482,13 @@ export function OnboardingTour() { } }, 200); } + }, [currentStep]); + useEffect(() => { return () => { if (retryTimerRef.current) clearTimeout(retryTimerRef.current); }; - }, [currentStep]); + }, []); // Check if tour should run: localStorage + data validation with user ID tracking useEffect(() => { @@ -573,15 +577,15 @@ export function OnboardingTour() { setPosition(calculatePosition(connectorEl, TOUR_STEPS[0].placement)); } else { // Retry after delay - setTimeout(checkAndStartTour, 200); + startCheckTimerRef.current = setTimeout(checkAndStartTour, 200); } }; // Start checking after initial delay - const timer = setTimeout(checkAndStartTour, 500); + startCheckTimerRef.current = setTimeout(checkAndStartTour, 500); return () => { cancelled = true; - clearTimeout(timer); + if (startCheckTimerRef.current) clearTimeout(startCheckTimerRef.current); }; }, [mounted, user?.id, searchSpaceId, pathname, threadsData, documentTypeCounts, connectors]); From 
388811194e720a28dce7a92e47757d01f9f2820f Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 12:33:47 +0530 Subject: [PATCH 081/202] feat: update DocumentUploadTab to use a dropdown for file and folder selection, enhancing user experience; also update upload limits and file size messages in multiple languages --- .../components/sources/DocumentUploadTab.tsx | 73 ++++++------------- surfsense_web/messages/en.json | 15 +++- surfsense_web/messages/es.json | 15 +++- surfsense_web/messages/hi.json | 15 +++- surfsense_web/messages/pt.json | 15 +++- surfsense_web/messages/zh.json | 17 +++-- 6 files changed, 80 insertions(+), 70 deletions(-) diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index f3b9166dc..9733bd2e6 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -413,18 +413,24 @@ export function DocumentUploadTab({ } return ( - + + e.stopPropagation()}> + + + e.stopPropagation()}> + fileInputRef.current?.click()}> + + {t("browse_files")} + + folderInputRef.current?.click()}> + + {t("browse_folder")} + + + ); }; @@ -476,7 +482,7 @@ export function DocumentUploadTab({

- {isElectron ? "Select files or folder" : "Tap to select files"} + {isElectron ? "Select files or folder" : "Tap to select files or folder"}

{t("file_size_limit")} @@ -484,40 +490,9 @@ export function DocumentUploadTab({ {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })}

- {isElectron && ( -
e.stopPropagation()}> - {renderBrowseButton({ fullWidth: true })} -
- )} - {!isElectron && ( -
- - -
- )} +
e.stopPropagation()}> + {renderBrowseButton({ fullWidth: true })} +
)}
@@ -570,8 +545,8 @@ export function DocumentUploadTab({ )}
- {/* FOLDER SELECTED */} - {selectedFolder && ( + {/* FOLDER SELECTED (Electron only — web flattens folder contents into file list) */} + {isElectron && selectedFolder && (
diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index 47d08e921..3a8c0c632 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -376,11 +376,11 @@ "upload_documents": { "title": "Upload Documents", "subtitle": "Upload your files to make them searchable and accessible through AI-powered conversations.", - "file_size_limit": "Maximum file size: {maxMB}MB per file", - "upload_limits": "Upload files or entire folders", + "file_size_limit": "Maximum file size: 50MB per file", + "upload_limits": "Upload limit: {maxFiles} files, {maxSizeMB}MB total", "drop_files": "Drop files or folders here", "drag_drop": "Drag & drop files or folders here", - "or_browse": "or click to browse files and folders", + "or_browse": "or click to browse", "browse_files": "Browse Files", "browse_folder": "Browse Folder", "selected_files": "Selected Files ({count})", @@ -397,7 +397,14 @@ "file_types_desc": "These file types are supported based on your current ETL service configuration.", "file_too_large": "File Too Large", "file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.", - "no_supported_files_in_folder": "No supported file types found in the selected folder." 
+ "no_supported_files_in_folder": "No supported file types found in the selected folder.", + "remaining_capacity": "{files} files, {sizeMB}MB remaining", + "file_limit_reached": "File limit reached", + "file_limit_reached_desc": "Maximum of {max} files allowed", + "max_files_exceeded": "Too many files", + "max_files_exceeded_desc": "You can upload a maximum of {max} files at once", + "max_size_exceeded": "Total size exceeded", + "max_size_exceeded_desc": "Total upload size cannot exceed {max}MB" }, "add_webpage": { "title": "Add Webpages for Crawling", diff --git a/surfsense_web/messages/es.json b/surfsense_web/messages/es.json index e7761ba25..2de30d29d 100644 --- a/surfsense_web/messages/es.json +++ b/surfsense_web/messages/es.json @@ -376,11 +376,11 @@ "upload_documents": { "title": "Subir documentos", "subtitle": "Sube tus archivos para hacerlos buscables y accesibles a través de conversaciones con IA.", - "file_size_limit": "Tamaño máximo de archivo: {maxMB} MB por archivo", - "upload_limits": "Sube archivos o carpetas enteras", + "file_size_limit": "Tamaño máximo de archivo: 50 MB por archivo", + "upload_limits": "Límite de subida: {maxFiles} archivos, {maxSizeMB} MB en total", "drop_files": "Suelta archivos o carpetas aquí", "drag_drop": "Arrastra y suelta archivos o carpetas aquí", - "or_browse": "o haz clic para explorar archivos y carpetas", + "or_browse": "o haz clic para explorar", "browse_files": "Explorar archivos", "browse_folder": "Explorar carpeta", "selected_files": "Archivos seleccionados ({count})", @@ -397,7 +397,14 @@ "file_types_desc": "Estos tipos de archivo son soportados según la configuración actual de tu servicio ETL.", "file_too_large": "Archivo demasiado grande", "file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.", - "no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada." 
+ "no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada.", + "remaining_capacity": "{files} archivos, {sizeMB}MB restante", + "file_limit_reached": "Límite de archivos alcanzado", + "file_limit_reached_desc": "Máximo de {max} archivos permitidos", + "max_files_exceeded": "Demasiados archivos", + "max_files_exceeded_desc": "Puedes subir un máximo de {max} archivos a la vez", + "max_size_exceeded": "Tamaño total excedido", + "max_size_exceeded_desc": "El tamaño total de subida no puede exceder {max}MB" }, "add_webpage": { "title": "Agregar páginas web para rastreo", diff --git a/surfsense_web/messages/hi.json b/surfsense_web/messages/hi.json index 957533206..c27291e3b 100644 --- a/surfsense_web/messages/hi.json +++ b/surfsense_web/messages/hi.json @@ -376,11 +376,11 @@ "upload_documents": { "title": "दस्तावेज़ अपलोड करें", "subtitle": "AI-संचालित बातचीत के माध्यम से अपनी फ़ाइलों को खोजने योग्य और सुलभ बनाने के लिए अपलोड करें।", - "file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल {maxMB}MB", - "upload_limits": "फ़ाइलें या पूरे फ़ोल्डर अपलोड करें", + "file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल 50MB", + "upload_limits": "अपलोड सीमा: {maxFiles} फ़ाइलें, कुल {maxSizeMB}MB", "drop_files": "फ़ाइलें या फ़ोल्डर यहां छोड़ें", "drag_drop": "फ़ाइलें या फ़ोल्डर यहां खींचें और छोड़ें", - "or_browse": "या फ़ाइलें और फ़ोल्डर ब्राउज़ करने के लिए क्लिक करें", + "or_browse": "या ब्राउज़ करने के लिए क्लिक करें", "browse_files": "फ़ाइलें ब्राउज़ करें", "browse_folder": "फ़ोल्डर ब्राउज़ करें", "selected_files": "चयनित फ़ाइलें ({count})", @@ -397,7 +397,14 @@ "file_types_desc": "ये फ़ाइल प्रकार आपकी वर्तमान ETL सेवा कॉन्फ़िगरेशन के आधार पर समर्थित हैं।", "file_too_large": "फ़ाइल बहुत बड़ी है", "file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।", - "no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।" + "no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल 
प्रकार नहीं मिला।", + "remaining_capacity": "{files} फ़ाइलें, {sizeMB}MB शेष", + "file_limit_reached": "फ़ाइल सीमा पूरी हो गई", + "file_limit_reached_desc": "अधिकतम {max} फ़ाइलें अनुमत हैं", + "max_files_exceeded": "बहुत सारी फ़ाइलें", + "max_files_exceeded_desc": "आप एक बार में अधिकतम {max} फ़ाइलें अपलोड कर सकते हैं", + "max_size_exceeded": "कुल आकार सीमा पार", + "max_size_exceeded_desc": "कुल अपलोड आकार {max}MB से अधिक नहीं हो सकता" }, "add_webpage": { "title": "क्रॉलिंग के लिए वेबपेज जोड़ें", diff --git a/surfsense_web/messages/pt.json b/surfsense_web/messages/pt.json index 9aec7af48..eeb417a27 100644 --- a/surfsense_web/messages/pt.json +++ b/surfsense_web/messages/pt.json @@ -376,11 +376,11 @@ "upload_documents": { "title": "Enviar documentos", "subtitle": "Envie seus arquivos para torná-los pesquisáveis e acessíveis através de conversas com IA.", - "file_size_limit": "Tamanho máximo do arquivo: {maxMB} MB por arquivo", - "upload_limits": "Envie arquivos ou pastas inteiras", + "file_size_limit": "Tamanho máximo do arquivo: 50 MB por arquivo", + "upload_limits": "Limite de envio: {maxFiles} arquivos, {maxSizeMB} MB no total", "drop_files": "Solte arquivos ou pastas aqui", "drag_drop": "Arraste e solte arquivos ou pastas aqui", - "or_browse": "ou clique para navegar arquivos e pastas", + "or_browse": "ou clique para navegar", "browse_files": "Navegar arquivos", "browse_folder": "Navegar pasta", "selected_files": "Arquivos selecionados ({count})", @@ -397,7 +397,14 @@ "file_types_desc": "Estes tipos de arquivo são suportados com base na configuração atual do seu serviço ETL.", "file_too_large": "Arquivo muito grande", "file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.", - "no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada." 
+ "no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada.", + "remaining_capacity": "{files} arquivos, {sizeMB}MB restante", + "file_limit_reached": "Limite de arquivos atingido", + "file_limit_reached_desc": "Máximo de {max} arquivos permitidos", + "max_files_exceeded": "Muitos arquivos", + "max_files_exceeded_desc": "Você pode enviar no máximo {max} arquivos de uma vez", + "max_size_exceeded": "Tamanho total excedido", + "max_size_exceeded_desc": "O tamanho total do envio não pode exceder {max}MB" }, "add_webpage": { "title": "Adicionar páginas web para rastreamento", diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json index 3ceab2443..2ee18a346 100644 --- a/surfsense_web/messages/zh.json +++ b/surfsense_web/messages/zh.json @@ -360,11 +360,11 @@ "upload_documents": { "title": "上传文档", "subtitle": "上传您的文件,使其可通过 AI 对话进行搜索和访问。", - "file_size_limit": "最大文件大小:每个文件 {maxMB}MB", - "upload_limits": "上传文件或整个文件夹", + "file_size_limit": "最大文件大小:每个文件 50MB", + "upload_limits": "上传限制:最多 {maxFiles} 个文件,总大小不超过 {maxSizeMB}MB", "drop_files": "将文件或文件夹拖放到此处", - "drag_drop": "将文件或文件夹拖放到此处", - "or_browse": "或点击浏览文件和文件夹", + "drag_drop": "拖放文件或文件夹到这里", + "or_browse": "或点击浏览", "browse_files": "浏览文件", "browse_folder": "浏览文件夹", "selected_files": "已选择的文件 ({count})", @@ -381,7 +381,14 @@ "file_types_desc": "根据您当前的 ETL 服务配置支持这些文件类型。", "file_too_large": "文件过大", "file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。", - "no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。" + "no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。", + "remaining_capacity": "剩余 {files} 个文件,{sizeMB}MB", + "file_limit_reached": "已达文件数量上限", + "file_limit_reached_desc": "最多允许 {max} 个文件", + "max_files_exceeded": "文件数量过多", + "max_files_exceeded_desc": "一次最多上传 {max} 个文件", + "max_size_exceeded": "总大小超出限制", + "max_size_exceeded_desc": "总上传大小不能超过 {max}MB" }, "add_webpage": { "title": "添加网页爬取", From 134beec3920c9c2cc2c84e3588828b8294d856c9 Mon Sep 17 00:00:00 
2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Fri, 3 Apr 2026 00:05:06 -0700 Subject: [PATCH 082/202] fix: clear upload progress interval on unmount Store the progress setInterval ID in a ref and clear it in a useEffect cleanup. Previously the interval was stored in a local variable and only cleared in onSuccess/onError callbacks, leaking if the component unmounted mid-upload. Fixes #1090 --- .../components/sources/DocumentUploadTab.tsx | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 723a3ad36..5c8ec83a5 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -4,7 +4,7 @@ import { useAtom } from "jotai"; import { CheckCircle2, FileType, FolderOpen, Info, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; -import { type ChangeEvent, useCallback, useMemo, useRef, useState } from "react"; +import { type ChangeEvent, useCallback, useEffect, useMemo, useRef, useState } from "react"; import { useDropzone } from "react-dropzone"; import { toast } from "sonner"; import { uploadDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms"; @@ -132,6 +132,15 @@ export function DocumentUploadTab({ const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation; const fileInputRef = useRef(null); const folderInputRef = useRef(null); + const progressIntervalRef = useRef | null>(null); + + useEffect(() => { + return () => { + if (progressIntervalRef.current) { + clearInterval(progressIntervalRef.current); + } + }; + }, []); const acceptedFileTypes = useMemo(() => { const etlService = process.env.NEXT_PUBLIC_ETL_SERVICE; @@ -236,7 +245,7 @@ export function DocumentUploadTab({ setUploadProgress(0); trackDocumentUploadStarted(Number(searchSpaceId), files.length, 
totalFileSize); - const progressInterval = setInterval(() => { + progressIntervalRef.current = setInterval(() => { setUploadProgress((prev) => (prev >= 90 ? prev : prev + Math.random() * 10)); }, 200); @@ -249,14 +258,14 @@ export function DocumentUploadTab({ }, { onSuccess: () => { - clearInterval(progressInterval); + if (progressIntervalRef.current) clearInterval(progressIntervalRef.current); setUploadProgress(100); trackDocumentUploadSuccess(Number(searchSpaceId), files.length); toast(t("upload_initiated"), { description: t("upload_initiated_desc") }); onSuccess?.(); }, onError: (error: unknown) => { - clearInterval(progressInterval); + if (progressIntervalRef.current) clearInterval(progressIntervalRef.current); setUploadProgress(0); const message = error instanceof Error ? error.message : "Upload failed"; trackDocumentUploadFailure(Number(searchSpaceId), message); From 8171605fae6737b4e95cc07d05fe5467493265dd Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 13:05:42 +0530 Subject: [PATCH 083/202] refactor: remove metadata viewing functionality from FolderNode, FolderTreeView, and DocumentsSidebar components --- .../components/documents/FolderNode.tsx | 10 +----- .../components/documents/FolderTreeView.tsx | 3 -- .../layout/ui/sidebar/DocumentsSidebar.tsx | 34 ------------------- 3 files changed, 1 insertion(+), 46 deletions(-) diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx index 41c1d8f73..909f965f9 100644 --- a/surfsense_web/components/documents/FolderNode.tsx +++ b/surfsense_web/components/documents/FolderNode.tsx @@ -80,7 +80,6 @@ interface FolderNodeProps { isWatched?: boolean; onRescan?: (folder: FolderDisplay) => void; onStopWatching?: (folder: FolderDisplay) => void; - onViewMetadata?: (folder: FolderDisplay) => void; } function getDropZone( @@ -122,7 +121,6 @@ export const FolderNode = React.memo(function FolderNode({ 
isWatched, onRescan, onStopWatching, - onViewMetadata, }: FolderNodeProps) { const [renameValue, setRenameValue] = useState(folder.name); const inputRef = useRef(null); @@ -258,13 +256,7 @@ export const FolderNode = React.memo(function FolderNode({ isOver && !canDrop && "cursor-not-allowed" )} style={{ paddingLeft: `${depth * 16 + 4}px` }} - onClick={(e) => { - if ((e.ctrlKey || e.metaKey) && onViewMetadata) { - e.preventDefault(); - e.stopPropagation(); - onViewMetadata(folder); - return; - } + onClick={() => { onToggleExpand(folder.id); }} onKeyDown={(e) => { diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx index 6f64d6258..3aa8ce9d8 100644 --- a/surfsense_web/components/documents/FolderTreeView.tsx +++ b/surfsense_web/components/documents/FolderTreeView.tsx @@ -44,7 +44,6 @@ interface FolderTreeViewProps { watchedFolderIds?: Set; onRescanFolder?: (folder: FolderDisplay) => void; onStopWatchingFolder?: (folder: FolderDisplay) => void; - onViewFolderMetadata?: (folder: FolderDisplay) => void; } function groupBy(items: T[], keyFn: (item: T) => string | number): Record { @@ -82,7 +81,6 @@ export function FolderTreeView({ watchedFolderIds, onRescanFolder, onStopWatchingFolder, - onViewFolderMetadata, }: FolderTreeViewProps) { const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? 
"root"), [folders]); @@ -247,7 +245,6 @@ export function FolderTreeView({ isWatched={watchedFolderIds?.has(f.id)} onRescan={onRescanFolder} onStopWatching={onStopWatchingFolder} - onViewMetadata={onViewFolderMetadata} /> ); diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index 24f6666c9..c10c5dc82 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -22,7 +22,6 @@ import type { FolderDisplay } from "@/components/documents/FolderNode"; import { FolderPickerDialog } from "@/components/documents/FolderPickerDialog"; import { FolderTreeView } from "@/components/documents/FolderTreeView"; import { VersionHistoryDialog } from "@/components/documents/version-history"; -import { JsonMetadataViewer } from "@/components/json-metadata-viewer"; import { EXPORT_FILE_EXTENSIONS } from "@/components/shared/ExportMenuItems"; import { AlertDialog, @@ -97,10 +96,6 @@ export function DocumentsSidebar({ const [activeTypes, setActiveTypes] = useState([]); const [watchedFolderIds, setWatchedFolderIds] = useState>(new Set()); - const [metadataFolder, setMetadataFolder] = useState(null); - const [metadataJson, setMetadataJson] = useState | null>(null); - const [metadataLoading, setMetadataLoading] = useState(false); - useEffect(() => { const api = typeof window !== "undefined" ? window.electronAPI : null; if (!api?.getWatchedFolders) return; @@ -333,20 +328,6 @@ export function DocumentsSidebar({ [] ); - const handleViewFolderMetadata = useCallback(async (folder: FolderDisplay) => { - setMetadataFolder(folder); - setMetadataLoading(true); - try { - const fullFolder = await foldersApiService.getFolder(folder.id); - setMetadataJson((fullFolder.metadata as Record) ?? 
{}); - } catch (err) { - console.error("[DocumentsSidebar] Failed to fetch folder metadata:", err); - setMetadataJson({ error: "Failed to load folder metadata" }); - } finally { - setMetadataLoading(false); - } - }, []); - const handleRenameFolder = useCallback(async (folder: FolderDisplay, newName: string) => { try { await foldersApiService.updateFolder(folder.id, { name: newName }); @@ -836,25 +817,10 @@ export function DocumentsSidebar({ watchedFolderIds={watchedFolderIds} onRescanFolder={handleRescanFolder} onStopWatchingFolder={handleStopWatching} - onViewFolderMetadata={handleViewFolderMetadata} />
- { - if (!open) { - setMetadataFolder(null); - setMetadataJson(null); - setMetadataLoading(false); - } - }} - /> - {versionDocId !== null && ( Date: Fri, 3 Apr 2026 13:10:25 +0530 Subject: [PATCH 084/202] fix: update button alignment in InlineCitation component for consistent styling --- surfsense_web/components/assistant-ui/inline-citation.tsx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/surfsense_web/components/assistant-ui/inline-citation.tsx b/surfsense_web/components/assistant-ui/inline-citation.tsx index 15ad11d94..42144f1d6 100644 --- a/surfsense_web/components/assistant-ui/inline-citation.tsx +++ b/surfsense_web/components/assistant-ui/inline-citation.tsx @@ -32,8 +32,7 @@ export const InlineCitation: FC = ({ chunkId, isDocsChunk = From 746c730b2e03ae23a718d53023bf473fa673d3f7 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 13:14:40 +0530 Subject: [PATCH 085/202] chore: ran linting --- .../app/routes/documents_routes.py | 63 ++--- surfsense_backend/app/routes/editor_routes.py | 6 +- .../routes/search_source_connectors_routes.py | 1 - surfsense_backend/app/schemas/folders.py | 7 +- .../app/tasks/celery_tasks/document_tasks.py | 15 +- .../local_folder_indexer.py | 215 +++++++++++++----- .../app/utils/document_versioning.py | 6 +- .../tests/integration/conftest.py | 2 - .../test_local_folder_pipeline.py | 198 +++++++++------- .../integration/test_document_versioning.py | 55 ++--- .../test_local_folder_scan.py | 4 +- .../app/(home)/login/LocalLoginForm.tsx | 20 +- surfsense_web/app/(home)/register/page.tsx | 131 +++++------ .../(manage)/components/DocumentsFilters.tsx | 22 +- .../components/PromptsContent.tsx | 7 +- .../assistant-ui/connector-popup.tsx | 5 +- .../views/connector-edit-view.tsx | 58 ++--- .../assistant-ui/document-upload-popup.tsx | 5 +- .../assistant-ui/inline-citation.tsx | 3 +- .../components/documents/DocumentNode.tsx | 17 +- 
.../components/documents/FolderNode.tsx | 186 +++++++-------- .../components/documents/FolderTreeView.tsx | 8 +- .../components/documents/version-history.tsx | 42 ++-- .../components/editor-panel/editor-panel.tsx | 39 ++-- .../layout/ui/sidebar/DocumentsSidebar.tsx | 185 ++++++++------- .../layout/ui/tabs/DocumentTabContent.tsx | 4 +- .../new-chat/source-detail-panel.tsx | 4 +- .../components/settings/llm-role-manager.tsx | 41 ++-- .../components/sources/DocumentUploadTab.tsx | 73 +++--- .../lib/apis/connectors-api.service.ts | 1 - .../lib/apis/documents-api.service.ts | 38 +++- 31 files changed, 801 insertions(+), 660 deletions(-) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index c86cdab3f..5008b1a10 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -977,15 +977,19 @@ async def get_watched_folders( ) folders = ( - await session.execute( - select(Folder).where( - Folder.search_space_id == search_space_id, - Folder.parent_id.is_(None), - Folder.folder_metadata.isnot(None), - Folder.folder_metadata["watched"].astext == "true", + ( + await session.execute( + select(Folder).where( + Folder.search_space_id == search_space_id, + Folder.parent_id.is_(None), + Folder.folder_metadata.isnot(None), + Folder.folder_metadata["watched"].astext == "true", + ) ) ) - ).scalars().all() + .scalars() + .all() + ) return folders @@ -1265,15 +1269,21 @@ async def list_document_versions( if not document: raise HTTPException(status_code=404, detail="Document not found") - await check_permission(session, user, document.search_space_id, Permission.DOCUMENTS_READ.value) + await check_permission( + session, user, document.search_space_id, Permission.DOCUMENTS_READ.value + ) versions = ( - await session.execute( - select(DocumentVersion) - .where(DocumentVersion.document_id == document_id) - .order_by(DocumentVersion.version_number.desc()) + ( + await 
session.execute( + select(DocumentVersion) + .where(DocumentVersion.document_id == document_id) + .order_by(DocumentVersion.version_number.desc()) + ) ) - ).scalars().all() + .scalars() + .all() + ) return [ { @@ -1300,7 +1310,9 @@ async def get_document_version( if not document: raise HTTPException(status_code=404, detail="Document not found") - await check_permission(session, user, document.search_space_id, Permission.DOCUMENTS_READ.value) + await check_permission( + session, user, document.search_space_id, Permission.DOCUMENTS_READ.value + ) version = ( await session.execute( @@ -1331,14 +1343,14 @@ async def restore_document_version( ): """Restore a previous version: snapshot current state, then overwrite document content.""" document = ( - await session.execute( - select(Document).where(Document.id == document_id) - ) + await session.execute(select(Document).where(Document.id == document_id)) ).scalar_one_or_none() if not document: raise HTTPException(status_code=404, detail="Document not found") - await check_permission(session, user, document.search_space_id, Permission.DOCUMENTS_UPDATE.value) + await check_permission( + session, user, document.search_space_id, Permission.DOCUMENTS_UPDATE.value + ) version = ( await session.execute( @@ -1363,6 +1375,7 @@ async def restore_document_version( await session.commit() from app.tasks.celery_tasks.document_reindex_tasks import reindex_document_task + reindex_document_task.delay(document_id, str(user.id)) return { @@ -1430,9 +1443,7 @@ async def folder_index( root_folder_id = request.root_folder_id if root_folder_id: existing = ( - await session.execute( - select(Folder).where(Folder.id == root_folder_id) - ) + await session.execute(select(Folder).where(Folder.id == root_folder_id)) ).scalar_one_or_none() if not existing: root_folder_id = None @@ -1492,7 +1503,9 @@ async def folder_index_files( ) if not request.target_file_paths: - raise HTTPException(status_code=400, detail="target_file_paths must not be empty") + 
raise HTTPException( + status_code=400, detail="target_file_paths must not be empty" + ) await check_permission( session, @@ -1507,11 +1520,11 @@ async def folder_index_files( for fp in request.target_file_paths: try: Path(fp).relative_to(request.folder_path) - except ValueError: + except ValueError as err: raise HTTPException( status_code=400, detail=f"target_file_path {fp} must be inside folder_path", - ) + ) from err from app.tasks.celery_tasks.document_tasks import index_local_folder_task @@ -1530,5 +1543,3 @@ async def folder_index_files( "status": "processing", "file_count": len(request.target_file_paths), } - - diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py index a0505f62f..829b2cf69 100644 --- a/surfsense_backend/app/routes/editor_routes.py +++ b/surfsense_backend/app/routes/editor_routes.py @@ -129,7 +129,11 @@ async def get_editor_content( if not chunk_contents: doc_status = document.status or {} - state = doc_status.get("state", "ready") if isinstance(doc_status, dict) else "ready" + state = ( + doc_status.get("state", "ready") + if isinstance(doc_status, dict) + else "ready" + ) if state in ("pending", "processing"): raise HTTPException( status_code=409, diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index f49ba2d5d..d208ff910 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -20,7 +20,6 @@ Non-OAuth connectors (BookStack, GitHub, etc.) 
are limited to one per search spa import asyncio import logging -import os from contextlib import suppress from datetime import UTC, datetime, timedelta from typing import Any diff --git a/surfsense_backend/app/schemas/folders.py b/surfsense_backend/app/schemas/folders.py index e8bdf3821..a7e065144 100644 --- a/surfsense_backend/app/schemas/folders.py +++ b/surfsense_backend/app/schemas/folders.py @@ -1,9 +1,8 @@ """Pydantic schemas for folder CRUD, move, and reorder operations.""" from datetime import datetime -from uuid import UUID - from typing import Any +from uuid import UUID from pydantic import BaseModel, ConfigDict, Field @@ -36,7 +35,9 @@ class FolderRead(BaseModel): created_by_id: UUID | None created_at: datetime updated_at: datetime - metadata: dict[str, Any] | None = Field(default=None, validation_alias="folder_metadata") + metadata: dict[str, Any] | None = Field( + default=None, validation_alias="folder_metadata" + ) model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index 506f8118c..4e9249d34 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -1,6 +1,7 @@ """Celery tasks for document processing.""" import asyncio +import contextlib import logging import os from uuid import UUID @@ -1337,9 +1338,7 @@ async def _index_local_folder_async( ) notification_id = notification.id _start_heartbeat(notification_id) - heartbeat_task = asyncio.create_task( - _run_heartbeat_loop(notification_id) - ) + heartbeat_task = asyncio.create_task(_run_heartbeat_loop(notification_id)) except Exception: logger.warning( "Failed to create notification for local folder indexing", @@ -1349,18 +1348,16 @@ async def _index_local_folder_async( async def _heartbeat_progress(completed_count: int) -> None: """Refresh heartbeat and optionally update notification 
progress.""" if notification: - try: + with contextlib.suppress(Exception): await NotificationService.document_processing.notify_processing_progress( session=session, notification=notification, stage="indexing", stage_message=f"Syncing files ({completed_count}/{file_count or '?'})", ) - except Exception: - pass try: - indexed, skipped_or_failed, _rfid, err = await index_local_folder( + _indexed, _skipped_or_failed, _rfid, err = await index_local_folder( session=session, search_space_id=search_space_id, user_id=user_id, @@ -1371,7 +1368,9 @@ async def _index_local_folder_async( root_folder_id=root_folder_id, enable_summary=enable_summary, target_file_paths=target_file_paths, - on_heartbeat_callback=_heartbeat_progress if (is_batch or is_full_scan) else None, + on_heartbeat_callback=_heartbeat_progress + if (is_batch or is_full_scan) + else None, ) if notification: diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py index 4ac8cc594..539cfdd32 100644 --- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -43,30 +43,110 @@ from .base import ( logger, ) -PLAINTEXT_EXTENSIONS = frozenset({ - ".md", ".markdown", ".txt", ".text", ".csv", ".tsv", - ".json", ".jsonl", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf", - ".xml", ".html", ".htm", ".css", ".scss", ".less", ".sass", - ".py", ".pyw", ".pyi", ".pyx", - ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs", - ".java", ".kt", ".kts", ".scala", ".groovy", - ".c", ".h", ".cpp", ".cxx", ".cc", ".hpp", ".hxx", - ".cs", ".fs", ".fsx", - ".go", ".rs", ".rb", ".php", ".pl", ".pm", ".lua", - ".swift", ".m", ".mm", - ".r", ".R", ".jl", - ".sh", ".bash", ".zsh", ".fish", ".bat", ".cmd", ".ps1", - ".sql", ".graphql", ".gql", - ".env", ".gitignore", ".dockerignore", ".editorconfig", - ".makefile", ".cmake", - ".log", ".rst", ".tex", 
".bib", ".org", ".adoc", ".asciidoc", - ".vue", ".svelte", ".astro", - ".tf", ".hcl", ".proto", -}) +PLAINTEXT_EXTENSIONS = frozenset( + { + ".md", + ".markdown", + ".txt", + ".text", + ".csv", + ".tsv", + ".json", + ".jsonl", + ".yaml", + ".yml", + ".toml", + ".ini", + ".cfg", + ".conf", + ".xml", + ".html", + ".htm", + ".css", + ".scss", + ".less", + ".sass", + ".py", + ".pyw", + ".pyi", + ".pyx", + ".js", + ".jsx", + ".ts", + ".tsx", + ".mjs", + ".cjs", + ".java", + ".kt", + ".kts", + ".scala", + ".groovy", + ".c", + ".h", + ".cpp", + ".cxx", + ".cc", + ".hpp", + ".hxx", + ".cs", + ".fs", + ".fsx", + ".go", + ".rs", + ".rb", + ".php", + ".pl", + ".pm", + ".lua", + ".swift", + ".m", + ".mm", + ".r", + ".R", + ".jl", + ".sh", + ".bash", + ".zsh", + ".fish", + ".bat", + ".cmd", + ".ps1", + ".sql", + ".graphql", + ".gql", + ".env", + ".gitignore", + ".dockerignore", + ".editorconfig", + ".makefile", + ".cmake", + ".log", + ".rst", + ".tex", + ".bib", + ".org", + ".adoc", + ".asciidoc", + ".vue", + ".svelte", + ".astro", + ".tf", + ".hcl", + ".proto", + } +) -AUDIO_EXTENSIONS = frozenset({ - ".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", -}) +AUDIO_EXTENSIONS = frozenset( + { + ".mp3", + ".mp4", + ".mpeg", + ".mpga", + ".m4a", + ".wav", + ".webm", + } +) def _is_plaintext_file(filename: str) -> bool: @@ -81,6 +161,7 @@ def _needs_etl(filename: str) -> bool: """File is not plaintext and not audio — requires ETL service to parse.""" return not _is_plaintext_file(filename) and not _is_audio_file(filename) + HeartbeatCallbackType = Callable[[int], Awaitable[None]] DEFAULT_EXCLUDE_PATTERNS = [ @@ -121,9 +202,7 @@ def scan_folder( for dirpath, dirnames, filenames in os.walk(root): rel_dir = Path(dirpath).relative_to(root) - dirnames[:] = [ - d for d in dirnames if d not in exclude_patterns - ] + dirnames[:] = [d for d in dirnames if d not in exclude_patterns] if any(part in exclude_patterns for part in rel_dir.parts): continue @@ -134,9 +213,11 @@ def 
scan_folder( full = Path(dirpath) / fname - if file_extensions is not None: - if full.suffix.lower() not in file_extensions: - continue + if ( + file_extensions is not None + and full.suffix.lower() not in file_extensions + ): + continue try: stat = full.stat() @@ -209,11 +290,14 @@ def _content_hash(content: str, search_space_id: int) -> str: pipeline so that dedup checks are consistent. """ import hashlib - return hashlib.sha256(f"{search_space_id}:{content}".encode("utf-8")).hexdigest() + + return hashlib.sha256(f"{search_space_id}:{content}".encode()).hexdigest() async def _compute_file_content_hash( - file_path: str, filename: str, search_space_id: int, + file_path: str, + filename: str, + search_space_id: int, ) -> tuple[str, str]: """Read a file (via ETL if needed) and compute its content hash. @@ -257,9 +341,7 @@ async def _mirror_folder_structure( if root_folder_id: existing = ( - await session.execute( - select(Folder).where(Folder.id == root_folder_id) - ) + await session.execute(select(Folder).where(Folder.id == root_folder_id)) ).scalar_one_or_none() if existing: mapping[""] = existing.id @@ -412,13 +494,17 @@ async def _cleanup_empty_folders( id_to_rel: dict[int, str] = {fid: rel for rel, fid in folder_mapping.items() if rel} all_folders = ( - await session.execute( - select(Folder).where( - Folder.search_space_id == search_space_id, - Folder.id != root_folder_id, + ( + await session.execute( + select(Folder).where( + Folder.search_space_id == search_space_id, + Folder.id != root_folder_id, + ) ) ) - ).scalars().all() + .scalars() + .all() + ) candidates: list[Folder] = [] for folder in all_folders: @@ -520,7 +606,9 @@ async def index_local_folder( metadata={ "folder_path": folder_path, "user_id": str(user_id), - "target_file_paths_count": len(target_file_paths) if target_file_paths else None, + "target_file_paths_count": len(target_file_paths) + if target_file_paths + else None, }, ) @@ -532,7 +620,12 @@ async def index_local_folder( "Folder not 
found", {}, ) - return 0, 0, root_folder_id, f"Folder path missing or does not exist: {folder_path}" + return ( + 0, + 0, + root_folder_id, + f"Folder path missing or does not exist: {folder_path}", + ) if exclude_patterns is None: exclude_patterns = DEFAULT_EXCLUDE_PATTERNS @@ -639,7 +732,9 @@ async def index_local_folder( ) if existing_document: - stored_mtime = (existing_document.document_metadata or {}).get("mtime") + stored_mtime = (existing_document.document_metadata or {}).get( + "mtime" + ) current_mtime = file_info["modified_at"].timestamp() if stored_mtime and abs(current_mtime - stored_mtime) < 1.0: @@ -709,23 +804,31 @@ async def index_local_folder( # ================================================================ all_root_folder_ids = set(folder_mapping.values()) all_db_folders = ( - await session.execute( - select(Folder.id).where( - Folder.search_space_id == search_space_id, + ( + await session.execute( + select(Folder.id).where( + Folder.search_space_id == search_space_id, + ) ) ) - ).scalars().all() + .scalars() + .all() + ) all_root_folder_ids.update(all_db_folders) all_folder_docs = ( - await session.execute( - select(Document).where( - Document.document_type == DocumentType.LOCAL_FOLDER_FILE, - Document.search_space_id == search_space_id, - Document.folder_id.in_(list(all_root_folder_ids)), + ( + await session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == search_space_id, + Document.folder_id.in_(list(all_root_folder_ids)), + ) ) ) - ).scalars().all() + .scalars() + .all() + ) for doc in all_folder_docs: if doc.unique_identifier_hash not in seen_unique_hashes: @@ -742,9 +845,7 @@ async def index_local_folder( ) pipeline = IndexingPipelineService(session) - doc_map = { - compute_unique_identifier_hash(cd): cd for cd in connector_docs - } + doc_map = {compute_unique_identifier_hash(cd): cd for cd in connector_docs} documents = await 
pipeline.prepare_for_indexing(connector_docs) # Assign folder_id immediately so docs appear in the correct @@ -1033,7 +1134,9 @@ async def _index_single_file( db_doc.document_metadata = doc_meta await session.commit() - indexed = 1 if DocumentStatus.is_state(db_doc.status, DocumentStatus.READY) else 0 + indexed = ( + 1 if DocumentStatus.is_state(db_doc.status, DocumentStatus.READY) else 0 + ) failed_msg = None if indexed else "Indexing failed" if indexed: diff --git a/surfsense_backend/app/utils/document_versioning.py b/surfsense_backend/app/utils/document_versioning.py index 889bc4a3a..e6ad1fb06 100644 --- a/surfsense_backend/app/utils/document_versioning.py +++ b/surfsense_backend/app/utils/document_versioning.py @@ -83,9 +83,9 @@ async def create_version_snapshot( # Cleanup: cap at MAX_VERSIONS_PER_DOCUMENT count = ( await session.execute( - select(func.count()).select_from(DocumentVersion).where( - DocumentVersion.document_id == document.id - ) + select(func.count()) + .select_from(DocumentVersion) + .where(DocumentVersion.document_id == document.id) ) ).scalar_one() diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py index 9c91011ae..d9d7cacae 100644 --- a/surfsense_backend/tests/integration/conftest.py +++ b/surfsense_backend/tests/integration/conftest.py @@ -166,5 +166,3 @@ def make_connector_document(db_connector, db_user): return ConnectorDocument(**defaults) return _make - - diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py index 67254ec93..4062c3a3b 100644 --- a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py @@ -21,7 +21,9 @@ from app.db import ( pytestmark = pytest.mark.integration UNIFIED_FIXTURES = ( - "patched_summarize", 
"patched_embed_texts", "patched_chunk_text", + "patched_summarize", + "patched_embed_texts", + "patched_chunk_text", ) @@ -37,6 +39,7 @@ class _FakeSessionMaker: @asynccontextmanager async def _ctx(): yield self._session + return _ctx() @@ -59,7 +62,6 @@ def patched_batch_sessions(monkeypatch, db_session): class TestFullIndexer: - @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_i1_new_file_indexed( self, @@ -73,7 +75,7 @@ class TestFullIndexer: (tmp_path / "note.md").write_text("# Hello World\n\nContent here.") - count, skipped, root_folder_id, err = await index_local_folder( + count, _skipped, _root_folder_id, err = await index_local_folder( session=db_session, search_space_id=db_search_space.id, user_id=str(db_user.id), @@ -85,13 +87,17 @@ class TestFullIndexer: assert count == 1 docs = ( - await db_session.execute( - select(Document).where( - Document.document_type == DocumentType.LOCAL_FOLDER_FILE, - Document.search_space_id == db_search_space.id, + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) ) ) - ).scalars().all() + .scalars() + .all() + ) assert len(docs) == 1 assert docs[0].document_type == DocumentType.LOCAL_FOLDER_FILE assert DocumentStatus.is_state(docs[0].status, DocumentStatus.READY) @@ -130,7 +136,9 @@ class TestFullIndexer: total = ( await db_session.execute( - select(func.count()).select_from(Document).where( + select(func.count()) + .select_from(Document) + .where( Document.document_type == DocumentType.LOCAL_FOLDER_FILE, Document.search_space_id == db_search_space.id, ) @@ -174,13 +182,19 @@ class TestFullIndexer: assert count == 1 versions = ( - await db_session.execute( - select(DocumentVersion).join(Document).where( - Document.document_type == DocumentType.LOCAL_FOLDER_FILE, - Document.search_space_id == db_search_space.id, + ( + await db_session.execute( + select(DocumentVersion) + .join(Document) + .where( + 
Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) ) ) - ).scalars().all() + .scalars() + .all() + ) assert len(versions) >= 1 @pytest.mark.usefixtures(*UNIFIED_FIXTURES) @@ -207,7 +221,9 @@ class TestFullIndexer: docs_before = ( await db_session.execute( - select(func.count()).select_from(Document).where( + select(func.count()) + .select_from(Document) + .where( Document.document_type == DocumentType.LOCAL_FOLDER_FILE, Document.search_space_id == db_search_space.id, ) @@ -228,7 +244,9 @@ class TestFullIndexer: docs_after = ( await db_session.execute( - select(func.count()).select_from(Document).where( + select(func.count()) + .select_from(Document) + .where( Document.document_type == DocumentType.LOCAL_FOLDER_FILE, Document.search_space_id == db_search_space.id, ) @@ -262,13 +280,17 @@ class TestFullIndexer: assert count == 1 docs = ( - await db_session.execute( - select(Document).where( - Document.document_type == DocumentType.LOCAL_FOLDER_FILE, - Document.search_space_id == db_search_space.id, + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) ) ) - ).scalars().all() + .scalars() + .all() + ) assert len(docs) == 1 assert docs[0].title == "b.md" @@ -279,7 +301,6 @@ class TestFullIndexer: class TestFolderMirroring: - @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_f1_root_folder_created( self, @@ -335,10 +356,14 @@ class TestFolderMirroring: ) folders = ( - await db_session.execute( - select(Folder).where(Folder.search_space_id == db_search_space.id) + ( + await db_session.execute( + select(Folder).where(Folder.search_space_id == db_search_space.id) + ) ) - ).scalars().all() + .scalars() + .all() + ) folder_names = {f.name for f in folders} assert "notes" in folder_names @@ -376,10 +401,14 @@ class TestFolderMirroring: ) folders_before = ( - await db_session.execute( - 
select(Folder).where(Folder.search_space_id == db_search_space.id) + ( + await db_session.execute( + select(Folder).where(Folder.search_space_id == db_search_space.id) + ) ) - ).scalars().all() + .scalars() + .all() + ) ids_before = {f.id for f in folders_before} await index_local_folder( @@ -392,10 +421,14 @@ class TestFolderMirroring: ) folders_after = ( - await db_session.execute( - select(Folder).where(Folder.search_space_id == db_search_space.id) + ( + await db_session.execute( + select(Folder).where(Folder.search_space_id == db_search_space.id) + ) ) - ).scalars().all() + .scalars() + .all() + ) ids_after = {f.id for f in folders_after} assert ids_before == ids_after @@ -425,21 +458,23 @@ class TestFolderMirroring: ) docs = ( - await db_session.execute( - select(Document).where( - Document.document_type == DocumentType.LOCAL_FOLDER_FILE, - Document.search_space_id == db_search_space.id, + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) ) ) - ).scalars().all() + .scalars() + .all() + ) today_doc = next(d for d in docs if d.title == "today.md") root_doc = next(d for d in docs if d.title == "root.md") daily_folder = ( - await db_session.execute( - select(Folder).where(Folder.name == "daily") - ) + await db_session.execute(select(Folder).where(Folder.name == "daily")) ).scalar_one() assert today_doc.folder_id == daily_folder.id @@ -455,9 +490,10 @@ class TestFolderMirroring: tmp_path: Path, ): """F5: Deleted dir's empty Folder row is cleaned up on re-sync.""" - from app.tasks.connector_indexers.local_folder_indexer import index_local_folder import shutil + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + daily = tmp_path / "notes" / "daily" daily.mkdir(parents=True) weekly = tmp_path / "notes" / "weekly" @@ -474,9 +510,7 @@ class TestFolderMirroring: ) weekly_folder = ( - await db_session.execute( - 
select(Folder).where(Folder.name == "weekly") - ) + await db_session.execute(select(Folder).where(Folder.name == "weekly")) ).scalar_one_or_none() assert weekly_folder is not None @@ -492,16 +526,12 @@ class TestFolderMirroring: ) weekly_after = ( - await db_session.execute( - select(Folder).where(Folder.name == "weekly") - ) + await db_session.execute(select(Folder).where(Folder.name == "weekly")) ).scalar_one_or_none() assert weekly_after is None daily_after = ( - await db_session.execute( - select(Folder).where(Folder.name == "daily") - ) + await db_session.execute(select(Folder).where(Folder.name == "daily")) ).scalar_one_or_none() assert daily_after is not None @@ -551,18 +581,14 @@ class TestFolderMirroring: ).scalar_one() daily_folder = ( - await db_session.execute( - select(Folder).where(Folder.name == "daily") - ) + await db_session.execute(select(Folder).where(Folder.name == "daily")) ).scalar_one() assert doc.folder_id == daily_folder.id assert daily_folder.parent_id is not None notes_folder = ( - await db_session.execute( - select(Folder).where(Folder.name == "notes") - ) + await db_session.execute(select(Folder).where(Folder.name == "notes")) ).scalar_one() assert daily_folder.parent_id == notes_folder.id assert notes_folder.parent_id == root_folder_id @@ -592,9 +618,7 @@ class TestFolderMirroring: ) eph_folder = ( - await db_session.execute( - select(Folder).where(Folder.name == "ephemeral") - ) + await db_session.execute(select(Folder).where(Folder.name == "ephemeral")) ).scalar_one_or_none() assert eph_folder is not None @@ -612,16 +636,12 @@ class TestFolderMirroring: ) eph_after = ( - await db_session.execute( - select(Folder).where(Folder.name == "ephemeral") - ) + await db_session.execute(select(Folder).where(Folder.name == "ephemeral")) ).scalar_one_or_none() assert eph_after is None notes_after = ( - await db_session.execute( - select(Folder).where(Folder.name == "notes") - ) + await db_session.execute(select(Folder).where(Folder.name == 
"notes")) ).scalar_one_or_none() assert notes_after is None @@ -632,7 +652,6 @@ class TestFolderMirroring: class TestBatchMode: - @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_b1_batch_indexes_multiple_files( self, @@ -649,7 +668,7 @@ class TestBatchMode: (tmp_path / "b.md").write_text("File B content") (tmp_path / "c.md").write_text("File C content") - count, failed, root_folder_id, err = await index_local_folder( + count, failed, _root_folder_id, err = await index_local_folder( session=db_session, search_space_id=db_search_space.id, user_id=str(db_user.id), @@ -667,13 +686,17 @@ class TestBatchMode: assert err is None docs = ( - await db_session.execute( - select(Document).where( - Document.document_type == DocumentType.LOCAL_FOLDER_FILE, - Document.search_space_id == db_search_space.id, + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) ) ) - ).scalars().all() + .scalars() + .all() + ) assert len(docs) == 3 assert {d.title for d in docs} == {"a.md", "b.md", "c.md"} assert all( @@ -714,13 +737,17 @@ class TestBatchMode: assert err is not None docs = ( - await db_session.execute( - select(Document).where( - Document.document_type == DocumentType.LOCAL_FOLDER_FILE, - Document.search_space_id == db_search_space.id, + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) ) ) - ).scalars().all() + .scalars() + .all() + ) assert len(docs) == 2 assert {d.title for d in docs} == {"good1.md", "good2.md"} @@ -731,7 +758,6 @@ class TestBatchMode: class TestPipelineIntegration: - @pytest.mark.usefixtures(*UNIFIED_FIXTURES) async def test_p1_local_folder_file_through_pipeline( self, @@ -742,7 +768,9 @@ class TestPipelineIntegration: ): """P1: LOCAL_FOLDER_FILE ConnectorDocument through prepare+index to READY.""" from 
app.indexing_pipeline.connector_document import ConnectorDocument - from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService + from app.indexing_pipeline.indexing_pipeline_service import ( + IndexingPipelineService, + ) doc = ConnectorDocument( title="Test Local File", @@ -763,12 +791,16 @@ class TestPipelineIntegration: assert result is not None docs = ( - await db_session.execute( - select(Document).where( - Document.document_type == DocumentType.LOCAL_FOLDER_FILE, - Document.search_space_id == db_search_space.id, + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) ) ) - ).scalars().all() + .scalars() + .all() + ) assert len(docs) == 1 assert DocumentStatus.is_state(docs[0].status, DocumentStatus.READY) diff --git a/surfsense_backend/tests/integration/test_document_versioning.py b/surfsense_backend/tests/integration/test_document_versioning.py index 87e3c490c..9bd03d219 100644 --- a/surfsense_backend/tests/integration/test_document_versioning.py +++ b/surfsense_backend/tests/integration/test_document_versioning.py @@ -34,14 +34,16 @@ async def db_document( async def _version_count(session: AsyncSession, document_id: int) -> int: result = await session.execute( - select(func.count()).select_from(DocumentVersion).where( - DocumentVersion.document_id == document_id - ) + select(func.count()) + .select_from(DocumentVersion) + .where(DocumentVersion.document_id == document_id) ) return result.scalar_one() -async def _get_versions(session: AsyncSession, document_id: int) -> list[DocumentVersion]: +async def _get_versions( + session: AsyncSession, document_id: int +) -> list[DocumentVersion]: result = await session.execute( select(DocumentVersion) .where(DocumentVersion.document_id == document_id) @@ -74,18 +76,14 @@ class TestCreateVersionSnapshot: from app.utils.document_versioning import create_version_snapshot t0 = 
datetime(2025, 1, 1, 12, 0, 0, tzinfo=UTC) - monkeypatch.setattr( - "app.utils.document_versioning._now", lambda: t0 - ) + monkeypatch.setattr("app.utils.document_versioning._now", lambda: t0) await create_version_snapshot(db_session, db_document) # Simulate content change and time passing db_document.source_markdown = "# Test\n\nUpdated content." db_document.content_hash = "def456" t1 = t0 + timedelta(minutes=31) - monkeypatch.setattr( - "app.utils.document_versioning._now", lambda: t1 - ) + monkeypatch.setattr("app.utils.document_versioning._now", lambda: t1) await create_version_snapshot(db_session, db_document) versions = await _get_versions(db_session, db_document.id) @@ -101,9 +99,7 @@ class TestCreateVersionSnapshot: from app.utils.document_versioning import create_version_snapshot t0 = datetime(2025, 1, 1, 12, 0, 0, tzinfo=UTC) - monkeypatch.setattr( - "app.utils.document_versioning._now", lambda: t0 - ) + monkeypatch.setattr("app.utils.document_versioning._now", lambda: t0) await create_version_snapshot(db_session, db_document) count_after_first = await _version_count(db_session, db_document.id) assert count_after_first == 1 @@ -112,9 +108,7 @@ class TestCreateVersionSnapshot: db_document.source_markdown = "# Test\n\nQuick edit." 
db_document.content_hash = "quick123" t1 = t0 + timedelta(minutes=10) - monkeypatch.setattr( - "app.utils.document_versioning._now", lambda: t1 - ) + monkeypatch.setattr("app.utils.document_versioning._now", lambda: t1) await create_version_snapshot(db_session, db_document) count_after_second = await _version_count(db_session, db_document.id) @@ -134,22 +128,15 @@ class TestCreateVersionSnapshot: # Create 5 versions spread across time: 3 older than 90 days, 2 recent for i in range(5): - db_document.source_markdown = f"Content v{i+1}" - db_document.content_hash = f"hash_{i+1}" - if i < 3: - t = base + timedelta(days=i) # old - else: - t = base + timedelta(days=100 + i) # recent - monkeypatch.setattr( - "app.utils.document_versioning._now", lambda _t=t: _t - ) + db_document.source_markdown = f"Content v{i + 1}" + db_document.content_hash = f"hash_{i + 1}" + t = base + timedelta(days=i) if i < 3 else base + timedelta(days=100 + i) + monkeypatch.setattr("app.utils.document_versioning._now", lambda _t=t: _t) await create_version_snapshot(db_session, db_document) # Now trigger cleanup from a "current" time that makes the first 3 versions > 90 days old now = base + timedelta(days=200) - monkeypatch.setattr( - "app.utils.document_versioning._now", lambda: now - ) + monkeypatch.setattr("app.utils.document_versioning._now", lambda: now) db_document.source_markdown = "Content v6" db_document.content_hash = "hash_6" await create_version_snapshot(db_session, db_document) @@ -160,9 +147,7 @@ class TestCreateVersionSnapshot: age = now - v.created_at.replace(tzinfo=UTC) assert age <= timedelta(days=90), f"Version {v.version_number} is too old" - async def test_v5_cap_at_20_versions( - self, db_session, db_document, monkeypatch - ): + async def test_v5_cap_at_20_versions(self, db_session, db_document, monkeypatch): """V5: More than 20 versions triggers cap — oldest gets deleted.""" from app.utils.document_versioning import create_version_snapshot @@ -170,12 +155,10 @@ class 
TestCreateVersionSnapshot: # Create 21 versions (all within 90 days, each 31 min apart) for i in range(21): - db_document.source_markdown = f"Content v{i+1}" - db_document.content_hash = f"hash_{i+1}" + db_document.source_markdown = f"Content v{i + 1}" + db_document.content_hash = f"hash_{i + 1}" t = base + timedelta(minutes=31 * i) - monkeypatch.setattr( - "app.utils.document_versioning._now", lambda _t=t: _t - ) + monkeypatch.setattr("app.utils.document_versioning._now", lambda _t=t: _t) await create_version_snapshot(db_session, db_document) versions = await _get_versions(db_session, db_document.id) diff --git a/surfsense_backend/tests/unit/connector_indexers/test_local_folder_scan.py b/surfsense_backend/tests/unit/connector_indexers/test_local_folder_scan.py index 9b4c73f25..c6e7b160c 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_local_folder_scan.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_local_folder_scan.py @@ -51,9 +51,7 @@ class TestScanFolder: git.mkdir() (git / "config").write_text("gitconfig") - results = scan_folder( - str(tmp_path), exclude_patterns=["node_modules", ".git"] - ) + results = scan_folder(str(tmp_path), exclude_patterns=["node_modules", ".git"]) names = {r["relative_path"] for r in results} assert "good.md" in names diff --git a/surfsense_web/app/(home)/login/LocalLoginForm.tsx b/surfsense_web/app/(home)/login/LocalLoginForm.tsx index 1ebbf46b6..e94857334 100644 --- a/surfsense_web/app/(home)/login/LocalLoginForm.tsx +++ b/surfsense_web/app/(home)/login/LocalLoginForm.tsx @@ -160,11 +160,11 @@ export function LocalLoginForm() { placeholder="you@example.com" value={username} onChange={(e) => setUsername(e.target.value)} - className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ - error.title - ? 
"border-destructive focus:border-destructive focus:ring-destructive/40" - : "border-border focus:border-primary focus:ring-primary/40" - }`} + className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ + error.title + ? "border-destructive focus:border-destructive focus:ring-destructive/40" + : "border-border focus:border-primary focus:ring-primary/40" + }`} disabled={isLoggingIn} />
@@ -181,11 +181,11 @@ export function LocalLoginForm() { placeholder="Enter your password" value={password} onChange={(e) => setPassword(e.target.value)} - className={`mt-1 block w-full rounded-md border pr-10 px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ - error.title - ? "border-destructive focus:border-destructive focus:ring-destructive/40" - : "border-border focus:border-primary focus:ring-primary/40" - }`} + className={`mt-1 block w-full rounded-md border pr-10 px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ + error.title + ? "border-destructive focus:border-destructive focus:ring-destructive/40" + : "border-border focus:border-primary focus:ring-primary/40" + }`} disabled={isLoggingIn} />
-
- - setPassword(e.target.value)} - className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ - error.title - ? "border-destructive focus:border-destructive focus:ring-destructive/40" - : "border-border focus:border-primary focus:ring-primary/40" - }`} - disabled={isRegistering} - /> -
+
+ + setPassword(e.target.value)} + className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ + error.title + ? "border-destructive focus:border-destructive focus:ring-destructive/40" + : "border-border focus:border-primary focus:ring-primary/40" + }`} + disabled={isRegistering} + /> +
-
- - setConfirmPassword(e.target.value)} - className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ - error.title - ? "border-destructive focus:border-destructive focus:ring-destructive/40" - : "border-border focus:border-primary focus:ring-primary/40" - }`} - disabled={isRegistering} - /> +
+ + setConfirmPassword(e.target.value)} + className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ + error.title + ? "border-destructive focus:border-destructive focus:ring-destructive/40" + : "border-border focus:border-primary focus:ring-primary/40" + }`} + disabled={isRegistering} + />
+ {/* Upload Button */} +
); diff --git a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx index 39362d244..1e7087afc 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/user-settings/components/PromptsContent.tsx @@ -2,7 +2,6 @@ import { useAtomValue } from "jotai"; import { AlertTriangle, Globe, Lock, PenLine, Sparkles, Trash2 } from "lucide-react"; -import { ShortcutKbd } from "@/components/ui/shortcut-kbd"; import { useCallback, useState } from "react"; import { toast } from "sonner"; import { @@ -24,6 +23,7 @@ import { import { Button } from "@/components/ui/button"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; +import { ShortcutKbd } from "@/components/ui/shortcut-kbd"; import { Spinner } from "@/components/ui/spinner"; import { Switch } from "@/components/ui/switch"; import type { PromptRead } from "@/contracts/types/prompts.types"; @@ -145,9 +145,8 @@ export function PromptsContent() {

- Create prompt templates triggered with{" "} - in the - chat composer. + Create prompt templates triggered with in + the chat composer.

{!showForm && ( diff --git a/surfsense_web/components/documents/DocumentNode.tsx b/surfsense_web/components/documents/DocumentNode.tsx index 31d1bc7ca..919f904d4 100644 --- a/surfsense_web/components/documents/DocumentNode.tsx +++ b/surfsense_web/components/documents/DocumentNode.tsx @@ -39,8 +39,8 @@ import { Spinner } from "@/components/ui/spinner"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import type { DocumentTypeEnum } from "@/contracts/types/document.types"; import { cn } from "@/lib/utils"; -import { isVersionableType } from "./version-history"; import { DND_TYPES } from "./FolderNode"; +import { isVersionableType } from "./version-history"; const EDITABLE_DOCUMENT_TYPES = new Set(["FILE", "NOTE"]); @@ -199,7 +199,10 @@ export const DocumentNode = React.memo(function DocumentNode({ {doc.title} - {getDocumentTypeIcon(doc.document_type as DocumentTypeEnum, "h-3.5 w-3.5 text-muted-foreground") && ( + {getDocumentTypeIcon( + doc.document_type as DocumentTypeEnum, + "h-3.5 w-3.5 text-muted-foreground" + ) && ( {getDocumentTypeIcon( doc.document_type as DocumentTypeEnum, @@ -251,10 +254,7 @@ export const DocumentNode = React.memo(function DocumentNode({ )} {onVersionHistory && isVersionableType(doc.document_type) && ( - onVersionHistory(doc)} - > + onVersionHistory(doc)}> Versions @@ -300,10 +300,7 @@ export const DocumentNode = React.memo(function DocumentNode({ )} {onVersionHistory && isVersionableType(doc.document_type) && ( - onVersionHistory(doc)} - > + onVersionHistory(doc)}> Versions diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx index 909f965f9..88cc76c69 100644 --- a/surfsense_web/components/documents/FolderNode.tsx +++ b/surfsense_web/components/documents/FolderNode.tsx @@ -256,15 +256,15 @@ export const FolderNode = React.memo(function FolderNode({ isOver && !canDrop && "cursor-not-allowed" )} style={{ paddingLeft: `${depth * 16 + 4}px` }} - 
onClick={() => { - onToggleExpand(folder.id); - }} - onKeyDown={(e) => { - if (e.key === "Enter" || e.key === " ") { - e.preventDefault(); + onClick={() => { onToggleExpand(folder.id); - } - }} + }} + onKeyDown={(e) => { + if (e.key === "Enter" || e.key === " ") { + e.preventDefault(); + onToggleExpand(folder.id); + } + }} onDoubleClick={(e) => { e.stopPropagation(); startRename(); @@ -306,7 +306,11 @@ export const FolderNode = React.memo(function FolderNode({ ) : ( e.stopPropagation()} @@ -350,107 +354,107 @@ export const FolderNode = React.memo(function FolderNode({ - - {isWatched && onRescan && ( + + {isWatched && onRescan && ( + { + e.stopPropagation(); + onRescan(folder); + }} + > + + Re-scan + + )} + {isWatched && onStopWatching && ( + { + e.stopPropagation(); + onStopWatching(folder); + }} + > + + Stop watching + + )} { e.stopPropagation(); - onRescan(folder); + onCreateSubfolder(folder.id); }} > - - Re-scan + + New subfolder - )} - {isWatched && onStopWatching && ( { e.stopPropagation(); - onStopWatching(folder); + startRename(); }} > - - Stop watching + + Rename - )} - { - e.stopPropagation(); - onCreateSubfolder(folder.id); - }} - > - - New subfolder - - { - e.stopPropagation(); - startRename(); - }} - > - - Rename - - { - e.stopPropagation(); - onMove(folder); - }} - > - - Move to... - - { - e.stopPropagation(); - onDelete(folder); - }} - > - - Delete - - + { + e.stopPropagation(); + onMove(folder); + }} + > + + Move to... + + { + e.stopPropagation(); + onDelete(folder); + }} + > + + Delete + + )}
- {!isRenaming && contextMenuOpen && ( - - {isWatched && onRescan && ( - onRescan(folder)}> - - Re-scan + {!isRenaming && contextMenuOpen && ( + + {isWatched && onRescan && ( + onRescan(folder)}> + + Re-scan + + )} + {isWatched && onStopWatching && ( + onStopWatching(folder)}> + + Stop watching + + )} + onCreateSubfolder(folder.id)}> + + New subfolder - )} - {isWatched && onStopWatching && ( - onStopWatching(folder)}> - - Stop watching + startRename()}> + + Rename - )} - onCreateSubfolder(folder.id)}> - - New subfolder - - startRename()}> - - Rename - - onMove(folder)}> - - Move to... - - onDelete(folder)} - > - - Delete - - - )} + onMove(folder)}> + + Move to... + + onDelete(folder)} + > + + Delete + + + )} ); }); diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx index 3aa8ce9d8..1df007c0b 100644 --- a/surfsense_web/components/documents/FolderTreeView.tsx +++ b/surfsense_web/components/documents/FolderTreeView.tsx @@ -242,10 +242,10 @@ export function FolderTreeView({ siblingPositions={siblingPositions} contextMenuOpen={openContextMenuId === `folder-${f.id}`} onContextMenuOpenChange={(open) => setOpenContextMenuId(open ? 
`folder-${f.id}` : null)} - isWatched={watchedFolderIds?.has(f.id)} - onRescan={onRescanFolder} - onStopWatching={onStopWatchingFolder} - /> + isWatched={watchedFolderIds?.has(f.id)} + onRescan={onRescanFolder} + onStopWatching={onStopWatchingFolder} + /> ); if (isExpanded) { diff --git a/surfsense_web/components/documents/version-history.tsx b/surfsense_web/components/documents/version-history.tsx index 7aba92b47..27343dc6a 100644 --- a/surfsense_web/components/documents/version-history.tsx +++ b/surfsense_web/components/documents/version-history.tsx @@ -1,19 +1,14 @@ "use client"; -import { useCallback, useEffect, useState } from "react"; import { Check, ChevronRight, Clock, Copy, RotateCcw } from "lucide-react"; +import { useCallback, useEffect, useState } from "react"; +import { toast } from "sonner"; import { Button } from "@/components/ui/button"; -import { - Dialog, - DialogContent, - DialogTitle, - DialogTrigger, -} from "@/components/ui/dialog"; +import { Dialog, DialogContent, DialogTitle, DialogTrigger } from "@/components/ui/dialog"; import { Separator } from "@/components/ui/separator"; import { Spinner } from "@/components/ui/spinner"; -import { cn } from "@/lib/utils"; import { documentsApiService } from "@/lib/apis/documents-api.service"; -import { toast } from "sonner"; +import { cn } from "@/lib/utils"; interface DocumentVersionSummary { version_number: number; @@ -123,10 +118,9 @@ function VersionHistoryPanel({ documentId }: { documentId: number }) { setSelectedVersion(versionNumber); setContentLoading(true); try { - const data = (await documentsApiService.getDocumentVersion( - documentId, - versionNumber - )) as { source_markdown: string }; + const data = (await documentsApiService.getDocumentVersion(documentId, versionNumber)) as { + source_markdown: string; + }; setVersionContent(data.source_markdown || ""); } catch { toast.error("Failed to load version content"); @@ -196,13 +190,11 @@ function VersionHistoryPanel({ documentId }: { documentId: 
number }) { >

- {v.created_at ? formatRelativeTime(v.created_at) : `Version ${v.version_number}`} + {v.created_at + ? formatRelativeTime(v.created_at) + : `Version ${v.version_number}`}

- {v.title && ( -

- {v.title} -

- )} + {v.title &&

{v.title}

}
@@ -227,11 +219,7 @@ function VersionHistoryPanel({ documentId }: { documentId: number }) { onClick={handleCopy} disabled={contentLoading || copied} > - {copied ? ( - - ) : ( - - )} + {copied ? : } {copied ? "Copied" : "Copy"}
diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx index 59af0ee8d..05bcd2dc0 100644 --- a/surfsense_web/components/editor-panel/editor-panel.tsx +++ b/surfsense_web/components/editor-panel/editor-panel.tsx @@ -54,7 +54,6 @@ function EditorPanelSkeleton() { ); } - export function EditorPanelContent({ documentId, searchSpaceId, @@ -194,24 +193,24 @@ export function EditorPanelContent({ return ( <>
-
-

{displayTitle}

- {isEditableType && editedMarkdown !== null && ( -

Unsaved changes

- )} +
+

{displayTitle}

+ {isEditableType && editedMarkdown !== null && ( +

Unsaved changes

+ )} +
+
+ {editorDoc?.document_type && ( + + )} + {onClose && ( + + )} +
-
- {editorDoc?.document_type && ( - - )} - {onClose && ( - - )} -
-
{isLoading ? ( @@ -233,7 +232,9 @@ export function EditorPanelContent({ ? "Document is processing" : "Document unavailable"}

-

{error || "An unknown error occurred"}

+

+ {error || "An unknown error occurred"} +

) : isLargeDocument ? ( diff --git a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx index c10c5dc82..aa409e179 100644 --- a/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx +++ b/surfsense_web/components/layout/ui/sidebar/DocumentsSidebar.tsx @@ -121,9 +121,7 @@ export function DocumentsSidebar({ } const recovered = await api!.getWatchedFolders(); const ids = new Set( - recovered - .filter((f) => f.rootFolderId != null) - .map((f) => f.rootFolderId as number) + recovered.filter((f) => f.rootFolderId != null).map((f) => f.rootFolderId as number) ); setWatchedFolderIds(ids); return; @@ -133,9 +131,7 @@ export function DocumentsSidebar({ } const ids = new Set( - folders - .filter((f) => f.rootFolderId != null) - .map((f) => f.rootFolderId as number) + folders.filter((f) => f.rootFolderId != null).map((f) => f.rootFolderId as number) ); setWatchedFolderIds(ids); } @@ -305,28 +301,25 @@ export function DocumentsSidebar({ [searchSpaceId] ); - const handleStopWatching = useCallback( - async (folder: FolderDisplay) => { - const api = window.electronAPI; - if (!api) return; + const handleStopWatching = useCallback(async (folder: FolderDisplay) => { + const api = window.electronAPI; + if (!api) return; - const watchedFolders = await api.getWatchedFolders(); - const matched = watchedFolders.find((wf) => wf.rootFolderId === folder.id); - if (!matched) { - toast.error("This folder is not being watched"); - return; - } + const watchedFolders = await api.getWatchedFolders(); + const matched = watchedFolders.find((wf) => wf.rootFolderId === folder.id); + if (!matched) { + toast.error("This folder is not being watched"); + return; + } - await api.removeWatchedFolder(matched.path); - try { - await foldersApiService.stopWatching(folder.id); - } catch (err) { - console.error("[DocumentsSidebar] Failed to clear watched metadata:", err); - } - toast.success(`Stopped watching: 
${matched.name}`); - }, - [] - ); + await api.removeWatchedFolder(matched.path); + try { + await foldersApiService.stopWatching(folder.id); + } catch (err) { + console.error("[DocumentsSidebar] Failed to clear watched metadata:", err); + } + toast.success(`Stopped watching: ${matched.name}`); + }, []); const handleRenameFolder = useCallback(async (folder: FolderDisplay, newName: string) => { try { @@ -755,81 +748,83 @@ export function DocumentsSidebar({
- handleCreateFolder(null)} - /> + handleCreateFolder(null)} + />
-
- {deletableSelectedIds.length > 0 && ( -
- -
- )} +
+ {deletableSelectedIds.length > 0 && ( +
+ +
+ )} - { - openEditorPanel({ - documentId: doc.id, - searchSpaceId, - title: doc.title, - }); - }} - onEditDocument={(doc) => { - openEditorPanel({ - documentId: doc.id, - searchSpaceId, - title: doc.title, - }); - }} - onDeleteDocument={(doc) => handleDeleteDocument(doc.id)} - onMoveDocument={handleMoveDocument} - onExportDocument={handleExportDocument} - onVersionHistory={(doc) => setVersionDocId(doc.id)} - activeTypes={activeTypes} - onDropIntoFolder={handleDropIntoFolder} - onReorderFolder={handleReorderFolder} - watchedFolderIds={watchedFolderIds} - onRescanFolder={handleRescanFolder} - onStopWatchingFolder={handleStopWatching} - /> + { + openEditorPanel({ + documentId: doc.id, + searchSpaceId, + title: doc.title, + }); + }} + onEditDocument={(doc) => { + openEditorPanel({ + documentId: doc.id, + searchSpaceId, + title: doc.title, + }); + }} + onDeleteDocument={(doc) => handleDeleteDocument(doc.id)} + onMoveDocument={handleMoveDocument} + onExportDocument={handleExportDocument} + onVersionHistory={(doc) => setVersionDocId(doc.id)} + activeTypes={activeTypes} + onDropIntoFolder={handleDropIntoFolder} + onReorderFolder={handleReorderFolder} + watchedFolderIds={watchedFolderIds} + onRescanFolder={handleRescanFolder} + onStopWatchingFolder={handleStopWatching} + /> +
-
- {versionDocId !== null && ( - { if (!open) setVersionDocId(null); }} - documentId={versionDocId} - /> - )} + {versionDocId !== null && ( + { + if (!open) setVersionDocId(null); + }} + documentId={versionDocId} + /> + )} - {isProcessing ? "Document is processing" : "Document unavailable"}

-

- {error || "An unknown error occurred"} -

+

{error || "An unknown error occurred"}

{!isProcessing && (
-

- Document unavailable -

+

Document unavailable

{documentByChunkFetchingError.message || "An unexpected error occurred. Please try again."} diff --git a/surfsense_web/components/settings/llm-role-manager.tsx b/surfsense_web/components/settings/llm-role-manager.tsx index d1651b7f0..718503318 100644 --- a/surfsense_web/components/settings/llm-role-manager.tsx +++ b/surfsense_web/components/settings/llm-role-manager.tsx @@ -134,24 +134,27 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { preferences?.image_generation_config_id, ]); - const handleRoleAssignment = useCallback(async (prefKey: string, configId: string) => { - const value = configId === "unassigned" ? "" : parseInt(configId); + const handleRoleAssignment = useCallback( + async (prefKey: string, configId: string) => { + const value = configId === "unassigned" ? "" : parseInt(configId); - setAssignments((prev) => ({ ...prev, [prefKey]: value })); - setSavingRole(prefKey); - savingRef.current = true; + setAssignments((prev) => ({ ...prev, [prefKey]: value })); + setSavingRole(prefKey); + savingRef.current = true; - try { - await updatePreferences({ - search_space_id: searchSpaceId, - data: { [prefKey]: value || undefined }, - }); - toast.success("Role assignment updated"); - } finally { - setSavingRole(null); - savingRef.current = false; - } - }, [updatePreferences, searchSpaceId]); + try { + await updatePreferences({ + search_space_id: searchSpaceId, + data: { [prefKey]: value || undefined }, + }); + toast.success("Role assignment updated"); + } finally { + setSavingRole(null); + savingRef.current = false; + } + }, + [updatePreferences, searchSpaceId] + ); // Combine global and custom LLM configs const allLLMConfigs = [ @@ -199,10 +202,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { Refresh {isAssignmentComplete && !isLoading && !hasError && ( - + All roles assigned @@ -483,7 +483,6 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { })}

)} -
); } diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 9733bd2e6..f1162f57c 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -128,7 +128,8 @@ const MAX_TOTAL_SIZE_BYTES = MAX_TOTAL_SIZE_MB * 1024 * 1024; const MAX_FILE_SIZE_MB = 500; const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024; -const toggleRowClass = "flex items-center justify-between rounded-lg bg-slate-400/5 dark:bg-white/5 p-3"; +const toggleRowClass = + "flex items-center justify-between rounded-lg bg-slate-400/5 dark:bg-white/5 p-3"; export function DocumentUploadTab({ searchSpaceId, @@ -326,7 +327,14 @@ export function DocumentUploadTab({ await api.addWatchedFolder({ path: selectedFolder.path, name: selectedFolder.name, - excludePatterns: [".git", "node_modules", "__pycache__", ".DS_Store", ".obsidian", ".trash"], + excludePatterns: [ + ".git", + "node_modules", + "__pycache__", + ".DS_Store", + ".obsidian", + ".trash", + ], fileExtensions: null, rootFolderId, searchSpaceId: Number(searchSpaceId), @@ -393,12 +401,20 @@ export function DocumentUploadTab({ return ( e.stopPropagation()}> - - e.stopPropagation()}> + e.stopPropagation()} + > Files @@ -415,7 +431,11 @@ export function DocumentUploadTab({ return ( e.stopPropagation()}> - @@ -457,21 +477,19 @@ export function DocumentUploadTab({ {/* MOBILE DROP ZONE */}
{hasContent ? ( - !selectedFolder && !isFileCountLimitReached && ( - isElectron ? ( -
- {renderBrowseButton({ compact: true, fullWidth: true })} -
- ) : ( - - ) - ) + !selectedFolder && + !isFileCountLimitReached && + (isElectron ? ( +
{renderBrowseButton({ compact: true, fullWidth: true })}
+ ) : ( + + )) ) : (
{t("file_size_limit")} - {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} + + {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} +

e.stopPropagation()}> @@ -538,7 +558,9 @@ export function DocumentUploadTab({

{t("file_size_limit")} - {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} + + {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} +

{renderBrowseButton()}
@@ -569,9 +591,7 @@ export function DocumentUploadTab({

Watch folder

-

- Auto-sync when files change -

+

Auto-sync when files change

- {t("selected_files", { count: files.length })} · {formatFileSize(totalFileSize)} + {t("selected_files", { count: files.length })} ·{" "} + {formatFileSize(totalFileSize)}

From c964b47f99a53cef556342061d42f8c1f4c78a91 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 3 Apr 2026 13:59:33 +0530 Subject: [PATCH 087/202] style: enhance folder selection UI in DocumentUploadTab --- surfsense_web/components/sources/DocumentUploadTab.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index f1162f57c..940d1560a 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -570,7 +570,7 @@ export function DocumentUploadTab({ {/* FOLDER SELECTED (Electron only — web flattens folder contents into file list) */} {isElectron && selectedFolder && (
-
+

{selectedFolder.name}

From 8a8e5fcd76c5dcebb6fbc2142212408553ee7d6d Mon Sep 17 00:00:00 2001 From: SohamBhattacharjee2003 <125297948+SohamBhattacharjee2003@users.noreply.github.com> Date: Fri, 3 Apr 2026 14:29:41 +0530 Subject: [PATCH 088/202] fix(hooks): add AbortController to properly cancel fetch requests on unmount --- package-lock.json | 6 ++++++ .../components/circleback-config.tsx | 16 ++++++++++++---- .../components/editor-panel/editor-panel.tsx | 18 ++++++++---------- .../layout/ui/tabs/DocumentTabContent.tsx | 18 ++++++++---------- 4 files changed, 34 insertions(+), 24 deletions(-) create mode 100644 package-lock.json diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 000000000..9703ac09f --- /dev/null +++ b/package-lock.json @@ -0,0 +1,6 @@ +{ + "name": "SurfSense", + "lockfileVersion": 3, + "requires": true, + "packages": {} +} diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/circleback-config.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/circleback-config.tsx index 99e26c542..268ab0f98 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/circleback-config.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/components/circleback-config.tsx @@ -34,9 +34,12 @@ export const CirclebackConfig: FC = ({ connector, onNameC const [isLoading, setIsLoading] = useState(true); const [copied, setCopied] = useState(false); + // Fetch webhook info // Fetch webhook info useEffect(() => { - const fetchWebhookInfo = async () => { + const controller = new AbortController(); + + const doFetch = async () => { if (!connector.search_space_id) return; const baseUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL; @@ -49,8 +52,11 @@ export const CirclebackConfig: FC = ({ connector, onNameC setIsLoading(true); try { const response = await authenticatedFetch( - 
`${baseUrl}/api/v1/webhooks/circleback/${connector.search_space_id}/info` + `${baseUrl}/api/v1/webhooks/circleback/${connector.search_space_id}/info`, + { signal: controller.signal } ); + if (controller.signal.aborted) return; + if (response.ok) { const data: unknown = await response.json(); // Runtime validation with zod schema @@ -59,16 +65,18 @@ export const CirclebackConfig: FC = ({ connector, onNameC setWebhookUrl(validatedData.webhook_url); } } catch (error) { + if (controller.signal.aborted) return; console.error("Failed to fetch webhook info:", error); // Reset state on error setWebhookInfo(null); setWebhookUrl(""); } finally { - setIsLoading(false); + if (!controller.signal.aborted) setIsLoading(false); } }; - fetchWebhookInfo(); + doFetch().catch(() => {}); + return () => controller.abort(); }, [connector.search_space_id]); const handleNameChange = (value: string) => { diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx index 3ea36f800..3c204f1bb 100644 --- a/surfsense_web/components/editor-panel/editor-panel.tsx +++ b/surfsense_web/components/editor-panel/editor-panel.tsx @@ -70,7 +70,7 @@ export function EditorPanelContent({ const [displayTitle, setDisplayTitle] = useState(title || "Untitled"); useEffect(() => { - let cancelled = false; + const controller = new AbortController(); setIsLoading(true); setError(null); setEditorDoc(null); @@ -78,7 +78,7 @@ export function EditorPanelContent({ initialLoadDone.current = false; changeCountRef.current = 0; - const fetchContent = async () => { + const doFetch = async () => { const token = getBearerToken(); if (!token) { redirectToLogin(); @@ -88,10 +88,10 @@ export function EditorPanelContent({ try { const response = await authenticatedFetch( `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`, - { method: "GET" } + { method: "GET", signal: controller.signal } ); - if 
(cancelled) return; + if (controller.signal.aborted) return; if (!response.ok) { const errorData = await response @@ -115,18 +115,16 @@ export function EditorPanelContent({ setEditorDoc(data); initialLoadDone.current = true; } catch (err) { - if (cancelled) return; + if (controller.signal.aborted) return; console.error("Error fetching document:", err); setError(err instanceof Error ? err.message : "Failed to fetch document"); } finally { - if (!cancelled) setIsLoading(false); + if (!controller.signal.aborted) setIsLoading(false); } }; - fetchContent(); - return () => { - cancelled = true; - }; + doFetch().catch(() => {}); + return () => controller.abort(); }, [documentId, searchSpaceId, title]); const handleMarkdownChange = useCallback((md: string) => { diff --git a/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx b/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx index ac279cd4d..a645bfbd5 100644 --- a/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx +++ b/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx @@ -55,7 +55,7 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen const changeCountRef = useRef(0); useEffect(() => { - let cancelled = false; + const controller = new AbortController(); setIsLoading(true); setError(null); setDoc(null); @@ -64,7 +64,7 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen initialLoadDone.current = false; changeCountRef.current = 0; - const fetchContent = async () => { + const doFetch = async () => { const token = getBearerToken(); if (!token) { redirectToLogin(); @@ -74,10 +74,10 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen try { const response = await authenticatedFetch( `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`, - { method: "GET" } + { method: "GET", signal: controller.signal } ); - if 
(cancelled) return; + if (controller.signal.aborted) return; if (!response.ok) { const errorData = await response @@ -98,18 +98,16 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen setDoc(data); initialLoadDone.current = true; } catch (err) { - if (cancelled) return; + if (controller.signal.aborted) return; console.error("Error fetching document:", err); setError(err instanceof Error ? err.message : "Failed to fetch document"); } finally { - if (!cancelled) setIsLoading(false); + if (!controller.signal.aborted) setIsLoading(false); } }; - fetchContent(); - return () => { - cancelled = true; - }; + doFetch().catch(() => {}); + return () => controller.abort(); }, [documentId, searchSpaceId]); const handleMarkdownChange = useCallback((md: string) => { From 416b3635bf90d9c5618745f9069d2cab816839c0 Mon Sep 17 00:00:00 2001 From: sukarxn Date: Fri, 3 Apr 2026 17:09:35 +0530 Subject: [PATCH 089/202] fix: optimize image components with next/image - Replace raw with Next.js Image in markdown-viewer.tsx - Use next/image with fill + sizes in assistant-ui image.tsx - Optimize favicons with explicit dimensions in citation components - Set unoptimized=true for data/blob URLs and external favicons --- .../components/assistant-ui/image.tsx | 121 +++++++++++++----- .../components/homepage/use-cases-grid.tsx | 10 ++ surfsense_web/components/markdown-viewer.tsx | 37 ++++-- .../tool-ui/citation/citation-list.tsx | 42 +++--- .../components/tool-ui/citation/citation.tsx | 25 ++-- 5 files changed, 160 insertions(+), 75 deletions(-) diff --git a/surfsense_web/components/assistant-ui/image.tsx b/surfsense_web/components/assistant-ui/image.tsx index 65059bcdc..c147eede4 100644 --- a/surfsense_web/components/assistant-ui/image.tsx +++ b/surfsense_web/components/assistant-ui/image.tsx @@ -6,6 +6,7 @@ import { ImageIcon, ImageOffIcon } from "lucide-react"; import { memo, type PropsWithChildren, useEffect, useRef, useState } from "react"; import { createPortal 
} from "react-dom"; import { cn } from "@/lib/utils"; +import NextImage from 'next/image'; const imageVariants = cva("aui-image-root relative overflow-hidden rounded-lg", { variants: { @@ -86,23 +87,57 @@ function ImagePreview({ >
- ) : ( + ) : isDataOrBlobUrl(src) ? ( + // biome-ignore lint/performance/noImgElement: data/blob URLs need plain img + {alt} { + if (typeof src === "string") setLoadedSrc(src); + onLoad?.(e); + }} + onError={(e) => { + if (typeof src === "string") setErrorSrc(src); + onError?.(e); + }} + {...props} + /> + ) : ( // biome-ignore lint/performance/noImgElement: intentional for dynamic external URLs - {alt} { - if (typeof src === "string") setLoadedSrc(src); - onLoad?.(e); - }} - onError={(e) => { - if (typeof src === "string") setErrorSrc(src); - onError?.(e); - }} - {...props} - /> + // {alt} { + // if (typeof src === "string") setLoadedSrc(src); + // onLoad?.(e); + // }} + // onError={(e) => { + // if (typeof src === "string") setErrorSrc(src); + // onError?.(e); + // }} + // {...props} + // /> + { + if (typeof src === "string") setLoadedSrc(src); + onLoad?.(); + }} + onError={() => { + if (typeof src === "string") setErrorSrc(src); + onError?.(); + }} + unoptimized={false} + {...props} + /> )}
); @@ -126,7 +161,10 @@ type ImageZoomProps = PropsWithChildren<{ src: string; alt?: string; }>; - +function isDataOrBlobUrl(src: string | undefined): boolean { + if (!src || typeof src !== "string") return false; + return src.startsWith("data:") || src.startsWith("blob:"); +} function ImageZoom({ src, alt = "Image preview", children }: ImageZoomProps) { const [isMounted, setIsMounted] = useState(false); const [isOpen, setIsOpen] = useState(false); @@ -177,22 +215,39 @@ function ImageZoom({ src, alt = "Image preview", children }: ImageZoomProps) { aria-label="Close zoomed image" > {/** biome-ignore lint/performance/noImgElement: */} - {alt} { - e.stopPropagation(); - handleClose(); - }} - onKeyDown={(e) => { - if (e.key === "Enter") { - e.stopPropagation(); - handleClose(); - } - }} - /> + {isDataOrBlobUrl(src) ? ( + // biome-ignore lint/performance/noImgElement: data/blob URLs need plain img + {alt} { + e.stopPropagation(); + handleClose(); + }} + onKeyDown={(e) => { + if (e.key === "Enter") { + e.stopPropagation(); + handleClose(); + } + }} + /> + ) : ( + { + e.stopPropagation(); + handleClose(); + }} + unoptimized={false} + /> + )} , document.body )} diff --git a/surfsense_web/components/homepage/use-cases-grid.tsx b/surfsense_web/components/homepage/use-cases-grid.tsx index 2f8c2d537..f9d315b49 100644 --- a/surfsense_web/components/homepage/use-cases-grid.tsx +++ b/surfsense_web/components/homepage/use-cases-grid.tsx @@ -1,4 +1,5 @@ "use client"; +import Image from 'next/image'; import { AnimatePresence, motion } from "motion/react"; import { ExpandedGifOverlay, useExpandedGif } from "@/components/ui/expanded-gif-overlay"; @@ -81,6 +82,15 @@ function UseCaseCard({ alt={title} className="w-full rounded-xl object-cover transition-transform duration-500 group-hover:scale-[1.02]" /> +
+ {title} +

{title}

diff --git a/surfsense_web/components/markdown-viewer.tsx b/surfsense_web/components/markdown-viewer.tsx index a568bd698..1c39f03a0 100644 --- a/surfsense_web/components/markdown-viewer.tsx +++ b/surfsense_web/components/markdown-viewer.tsx @@ -3,6 +3,8 @@ import { createMathPlugin } from "@streamdown/math"; import { Streamdown, type StreamdownProps } from "streamdown"; import "katex/dist/katex.min.css"; import { cn } from "@/lib/utils"; +import Image from 'next/image'; +import { is } from "drizzle-orm"; const code = createCodePlugin({ themes: ["nord", "nord"], @@ -127,16 +129,31 @@ export function MarkdownViewer({ content, className, maxLength }: MarkdownViewer
), hr: ({ ...props }) =>
, - img: ({ src, alt, width: _w, height: _h, ...props }) => ( - // eslint-disable-next-line @next/next/no-img-element - {alt - ), + img: ({ src, alt, width: _w, height: _h, ...props }) => { + const isDataOrUnknownUrl = typeof src === "string" && (src.startsWith("data:") || !src.startsWith("http")); + + return isDataOrUnknownUrl ? ( + // eslint-disable-next-line @next/next/no-img-element + {alt + ) : ( + {alt + ); +}, table: ({ ...props }) => (
diff --git a/surfsense_web/components/tool-ui/citation/citation-list.tsx b/surfsense_web/components/tool-ui/citation/citation-list.tsx index 3151917b6..75b02bf3d 100644 --- a/surfsense_web/components/tool-ui/citation/citation-list.tsx +++ b/surfsense_web/components/tool-ui/citation/citation-list.tsx @@ -7,6 +7,8 @@ import { openSafeNavigationHref, resolveSafeNavigationHref } from "../shared/med import { cn, Popover, PopoverContent, PopoverTrigger } from "./_adapter"; import { Citation } from "./citation"; import type { CitationType, CitationVariant, SerializableCitation } from "./schema"; +import NextImage from 'next/image'; + const TYPE_ICONS: Record = { webpage: Globe, @@ -253,18 +255,18 @@ function OverflowItem({ citation, onClick }: OverflowItemProps) { className="group hover:bg-muted focus-visible:bg-muted flex w-full cursor-pointer items-center gap-2.5 rounded-md px-2 py-2 text-left transition-colors focus-visible:outline-none" > {citation.favicon ? ( - // biome-ignore lint/performance/noImgElement: external favicon from arbitrary domain — next/image requires remotePatterns config - - ) : ( -
diff --git a/surfsense_web/components/settings/user-settings-dialog.tsx b/surfsense_web/components/settings/user-settings-dialog.tsx index b74ff973b..0afdfb2b7 100644 --- a/surfsense_web/components/settings/user-settings-dialog.tsx +++ b/surfsense_web/components/settings/user-settings-dialog.tsx @@ -5,10 +5,10 @@ import { Globe, KeyRound, Monitor, Receipt, Sparkles, User } from "lucide-react" import { useTranslations } from "next-intl"; import { ApiKeyContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ApiKeyContent"; import { CommunityPromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent"; +import { DesktopContent } from "@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent"; import { ProfileContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ProfileContent"; import { PromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PromptsContent"; import { PurchaseHistoryContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PurchaseHistoryContent"; -import { DesktopContent } from "@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent"; import { userSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms"; import { SettingsDialog } from "@/components/settings/settings-dialog"; diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index c8ce195aa..36a24e299 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -471,13 +471,13 @@ export function DocumentUploadTab({ )) ) : ( - )} @@ -684,17 +689,17 @@ export function DocumentUploadTab({ -
- {supportedExtensions.map((ext) => ( - - {ext} - - ))} -
+
+ {supportedExtensions.map((ext) => ( + + {ext} + + ))} +
diff --git a/surfsense_web/components/tool-ui/citation/citation-list.tsx b/surfsense_web/components/tool-ui/citation/citation-list.tsx index 75b02bf3d..bbe869a09 100644 --- a/surfsense_web/components/tool-ui/citation/citation-list.tsx +++ b/surfsense_web/components/tool-ui/citation/citation-list.tsx @@ -2,13 +2,12 @@ import type { LucideIcon } from "lucide-react"; import { Code2, Database, ExternalLink, File, FileText, Globe, Newspaper } from "lucide-react"; +import NextImage from "next/image"; import * as React from "react"; import { openSafeNavigationHref, resolveSafeNavigationHref } from "../shared/media"; import { cn, Popover, PopoverContent, PopoverTrigger } from "./_adapter"; import { Citation } from "./citation"; import type { CitationType, CitationVariant, SerializableCitation } from "./schema"; -import NextImage from 'next/image'; - const TYPE_ICONS: Record = { webpage: Globe, @@ -264,9 +263,9 @@ function OverflowItem({ citation, onClick }: OverflowItemProps) { className="size-4.5 rounded-full object-cover" unoptimized={true} /> - ) : ( + ) : (