From 90f9fad95cb0dcc86faa26ae267a170abf90e470 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 4 Feb 2026 12:55:38 +0530 Subject: [PATCH 01/36] feat: enhance document management with user information and connector dialog --- .../app/routes/documents_routes.py | 28 +- surfsense_backend/app/schemas/documents.py | 1 + .../[search_space_id]/client-layout.tsx | 3 + .../(manage)/components/DocumentTypeIcon.tsx | 33 +- .../(manage)/components/DocumentsFilters.tsx | 354 +++++++------ .../components/DocumentsTableShell.tsx | 464 ++++++++++-------- .../components/PaginationControls.tsx | 193 +++----- .../(manage)/components/RowActions.tsx | 38 +- .../documents/(manage)/components/types.ts | 5 +- .../documents/(manage)/page.tsx | 119 ++--- .../connector-dialog.atoms.ts | 5 + .../assistant-ui/connector-popup.tsx | 60 +-- .../hooks/use-connector-dialog.ts | 6 +- 13 files changed, 665 insertions(+), 644 deletions(-) create mode 100644 surfsense_web/atoms/connector-dialog/connector-dialog.atoms.ts diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index be90df459..d25a2db48 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -211,7 +211,11 @@ async def read_documents( Permission.DOCUMENTS_READ.value, "You don't have permission to read documents in this search space", ) - query = select(Document).filter(Document.search_space_id == search_space_id) + query = ( + select(Document) + .options(selectinload(Document.created_by)) + .filter(Document.search_space_id == search_space_id) + ) count_query = ( select(func.count()) .select_from(Document) @@ -221,6 +225,7 @@ async def read_documents( # Get documents from all search spaces user has membership in query = ( select(Document) + .options(selectinload(Document.created_by)) .join(SearchSpace) .join(SearchSpaceMembership) 
.filter(SearchSpaceMembership.user_id == user.id) @@ -261,6 +266,11 @@ async def read_documents( # Convert database objects to API-friendly format api_documents = [] for doc in db_documents: + # Get user name (display_name or email fallback) + created_by_name = None + if doc.created_by: + created_by_name = doc.created_by.display_name or doc.created_by.email + api_documents.append( DocumentRead( id=doc.id, @@ -273,6 +283,8 @@ async def read_documents( created_at=doc.created_at, updated_at=doc.updated_at, search_space_id=doc.search_space_id, + created_by_id=doc.created_by_id, + created_by_name=created_by_name, ) ) @@ -341,7 +353,11 @@ async def search_documents( Permission.DOCUMENTS_READ.value, "You don't have permission to read documents in this search space", ) - query = select(Document).filter(Document.search_space_id == search_space_id) + query = ( + select(Document) + .options(selectinload(Document.created_by)) + .filter(Document.search_space_id == search_space_id) + ) count_query = ( select(func.count()) .select_from(Document) @@ -351,6 +367,7 @@ async def search_documents( # Get documents from all search spaces user has membership in query = ( select(Document) + .options(selectinload(Document.created_by)) .join(SearchSpace) .join(SearchSpaceMembership) .filter(SearchSpaceMembership.user_id == user.id) @@ -395,6 +412,11 @@ async def search_documents( # Convert database objects to API-friendly format api_documents = [] for doc in db_documents: + # Get user name (display_name or email fallback) + created_by_name = None + if doc.created_by: + created_by_name = doc.created_by.display_name or doc.created_by.email + api_documents.append( DocumentRead( id=doc.id, @@ -407,6 +429,8 @@ async def search_documents( created_at=doc.created_at, updated_at=doc.updated_at, search_space_id=doc.search_space_id, + created_by_id=doc.created_by_id, + created_by_name=created_by_name, ) ) diff --git a/surfsense_backend/app/schemas/documents.py 
b/surfsense_backend/app/schemas/documents.py index 1f82ae9ce..ad1907b90 100644 --- a/surfsense_backend/app/schemas/documents.py +++ b/surfsense_backend/app/schemas/documents.py @@ -53,6 +53,7 @@ class DocumentRead(BaseModel): updated_at: datetime | None search_space_id: int created_by_id: UUID | None = None # User who created/uploaded this document + created_by_name: str | None = None # Display name or email of the user who created this document model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx b/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx index b9ddb9b74..83a579970 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/client-layout.tsx @@ -13,6 +13,7 @@ import { llmPreferencesAtom, } from "@/atoms/new-llm-config/new-llm-config-query.atoms"; import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms"; +import { ConnectorIndicator } from "@/components/assistant-ui/connector-popup"; import { DocumentUploadDialogProvider } from "@/components/assistant-ui/document-upload-popup"; import { DashboardBreadcrumb } from "@/components/dashboard-breadcrumb"; import { LayoutDataProvider } from "@/components/layout"; @@ -192,6 +193,8 @@ export function DashboardClientLayout({ }> {children} + {/* Global connector dialog - triggered from documents page */} + ); } diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx index e483dea12..246cff1c0 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx @@ -2,6 +2,7 @@ import type React from "react"; import { getConnectorIcon } from 
"@/contracts/enums/connectorIcons"; +import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; export function getDocumentTypeIcon(type: string): React.ReactNode { return getConnectorIcon(type); @@ -14,17 +15,35 @@ export function getDocumentTypeLabel(type: string): string { .join(" "); } +const MAX_LABEL_LENGTH = 28; + export function DocumentTypeChip({ type, className }: { type: string; className?: string }) { const icon = getDocumentTypeIcon(type); - return ( + const fullLabel = getDocumentTypeLabel(type); + const truncatedLabel = fullLabel.length > MAX_LABEL_LENGTH + ? `${fullLabel.slice(0, MAX_LABEL_LENGTH)}...` + : fullLabel; + const needsTruncation = fullLabel.length > MAX_LABEL_LENGTH; + + const chip = ( - {icon} - {getDocumentTypeLabel(type)} + {icon} + {truncatedLabel} ); + + if (needsTruncation) { + return ( + + {chip} + +

{fullLabel}

+
+
+ ); + } + + return chip; } diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx index 67413d6f0..87d349e38 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx @@ -1,9 +1,20 @@ "use client"; -import { CircleAlert, CircleX, Columns3, Filter, ListFilter, Trash } from "lucide-react"; -import { AnimatePresence, motion, type Variants } from "motion/react"; +import { useSetAtom } from "jotai"; +import { + CircleAlert, + CircleX, + Columns3, + FilePlus2, + FileType, + SlidersHorizontal, + Trash, +} from "lucide-react"; +import { motion } from "motion/react"; import { useTranslations } from "next-intl"; import React, { useMemo, useRef } from "react"; +import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms"; +import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; import { AlertDialog, AlertDialogAction, @@ -17,25 +28,13 @@ import { } from "@/components/ui/alert-dialog"; import { Button } from "@/components/ui/button"; import { Checkbox } from "@/components/ui/checkbox"; -import { - DropdownMenu, - DropdownMenuCheckboxItem, - DropdownMenuContent, - DropdownMenuLabel, - DropdownMenuTrigger, -} from "@/components/ui/dropdown-menu"; import { Input } from "@/components/ui/input"; import { Label } from "@/components/ui/label"; import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover"; import type { DocumentTypeEnum } from "@/contracts/types/document.types"; +import { getDocumentTypeIcon, getDocumentTypeLabel } from "./DocumentTypeIcon"; import type { ColumnVisibility } from "./types"; -const fadeInScale: Variants = { - hidden: { opacity: 0, scale: 0.95 }, - visible: { opacity: 
1, scale: 1, transition: { type: "spring", stiffness: 300, damping: 30 } }, - exit: { opacity: 0, scale: 0.95, transition: { duration: 0.15 } }, -}; - export function DocumentsFilters({ typeCounts: typeCountsRecord, selectedIds, @@ -61,6 +60,10 @@ export function DocumentsFilters({ const id = React.useId(); const inputRef = useRef(null); + // Dialog hooks for action buttons + const { openDialog: openUploadDialog } = useDocumentUploadDialog(); + const setConnectorDialogOpen = useSetAtom(connectorDialogOpenAtom); + const uniqueTypes = useMemo(() => { return Object.keys(typeCountsRecord).sort() as DocumentTypeEnum[]; }, [typeCountsRecord]); @@ -75,14 +78,41 @@ export function DocumentsFilters({ return ( -
+ {/* Main toolbar row */} +
+ {/* Action Buttons - Left Side */} +
+ + +
+ + {/* Spacer */} +
+ + {/* Search Input */} onSearch(e.target.value)} - placeholder={t("filter_placeholder")} + placeholder="Filter by title" type="text" aria-label={t("filter_placeholder")} /> - - {Boolean(searchValue) && ( { onSearch(""); inputRef.current?.focus(); }} - initial={{ opacity: 0, rotate: -90 }} - animate={{ opacity: 1, rotate: 0 }} - exit={{ opacity: 0, rotate: 90 }} + initial={{ opacity: 0, scale: 0.8 }} + animate={{ opacity: 1, scale: 1 }} + exit={{ opacity: 0, scale: 0.8 }} whileHover={{ scale: 1.1 }} whileTap={{ scale: 0.9 }} > - )} - - - - - - - - -
-
Filters
-
- - {uniqueTypes.map((value: DocumentTypeEnum, i) => ( - + +
+
+ Filter by source +
+
+ {uniqueTypes.map((value: DocumentTypeEnum, i) => ( + + ))} +
+ {activeTypes.length > 0 && ( +
+ +
+ )} +
+
+ + + {/* View/Columns Popover */} + + + + + +
+
+ Toggle columns +
+
+ {( + [ + ["document_type", "Source"], + ["created_by", "User"], + ["created_at", "Created"], + ] as Array<[keyof ColumnVisibility, string]> + ).map(([key, label], i) => ( + + ))}
-
- - + + +
- - - - - - - - Toggle columns - {( - [ - ["title", "Title"], - ["document_type", "Type"], - ["content", "Content"], - ["created_at", "Created At"], - ] as Array<[keyof ColumnVisibility, string]> - ).map(([key, label]) => ( - onToggleColumn(key, !!v)} - onSelect={(e) => e.preventDefault()} - > - {label} - - ))} - - -
- -
+ {/* Bulk Delete Button */} {selectedIds.size > 0 && ( - + + + - -
+ +
- - Are you absolutely sure? + + Delete {selectedIds.size} document{selectedIds.size !== 1 ? "s" : ""}? - This action cannot be undone. This will permanently delete {selectedIds.size}{" "} - selected {selectedIds.size === 1 ? "row" : "rows"}. + This action cannot be undone. This will permanently delete the selected {selectedIds.size === 1 ? "document" : "documents"} from your search space.
Cancel - Delete + + Delete +
diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index d9908f46c..faa7605a3 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -4,9 +4,10 @@ import { ChevronDown, ChevronUp, FileX, Plus } from "lucide-react"; import { motion } from "motion/react"; import { useParams } from "next/navigation"; import { useTranslations } from "next-intl"; -import React from "react"; +import React, { useState } from "react"; import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; import { DocumentViewer } from "@/components/document-viewer"; +import { JsonMetadataViewer } from "@/components/json-metadata-viewer"; import { Button } from "@/components/ui/button"; import { Checkbox } from "@/components/ui/checkbox"; import { Spinner } from "@/components/ui/spinner"; @@ -19,7 +20,7 @@ import { TableRow, } from "@/components/ui/table"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; -import { DocumentTypeChip, getDocumentTypeIcon } from "./DocumentTypeIcon"; +import { DocumentTypeChip } from "./DocumentTypeIcon"; import { RowActions } from "./RowActions"; import type { ColumnVisibility, Document } from "./types"; @@ -36,13 +37,45 @@ function sortDocuments(docs: Document[], key: SortKey, desc: boolean): Document[ return desc ? 
sorted.reverse() : sorted; } -function truncate(text: string, len = 150): string { - const plain = text - .replace(/[#*_`>\-[\]()]+/g, " ") - .replace(/\s+/g, " ") - .trim(); - if (plain.length <= len) return plain; - return `${plain.slice(0, len)}...`; +function formatDate(dateStr: string): string { + const date = new Date(dateStr); + return date.toLocaleDateString("en-US", { + year: "numeric", + month: "long", + day: "numeric", + }); +} + +function SortableHeader({ + children, + sortKey, + currentSortKey, + sortDesc, + onSort, +}: { + children: React.ReactNode; + sortKey: SortKey; + currentSortKey: SortKey; + sortDesc: boolean; + onSort: (key: SortKey) => void; +}) { + const isActive = currentSortKey === sortKey; + return ( + + ); } export function DocumentsTableShell({ @@ -75,6 +108,9 @@ export function DocumentsTableShell({ const searchSpaceId = params.search_space_id; const { openDialog } = useDocumentUploadDialog(); + // State for metadata viewer (opened via Ctrl/Cmd+Click) + const [metadataDoc, setMetadataDoc] = useState(null); + const sorted = React.useMemo( () => sortDocuments(documents, sortKey, sortDesc), [documents, sortKey, sortDesc] @@ -107,23 +143,23 @@ export function DocumentsTableShell({ return ( {loading ? (
-
+

{t("loading")}

) : error ? (
-
+

{t("error_loading")}

-
@@ -136,10 +172,10 @@ export function DocumentsTableShell({ transition={{ duration: 0.4 }} className="flex flex-col items-center gap-4 max-w-md px-4 text-center" > -
- +
+
-
+

{t("no_documents")}

Get started by uploading your first document. @@ -153,218 +189,232 @@ export function DocumentsTableShell({

) : ( <> -
- - - - + {/* Desktop Table View */} +
+ {/* Fixed Header */} +
+ + + toggleAll(!!v)} aria-label="Select all" + className="data-[state=checked]:bg-primary data-[state=checked]:border-primary" /> - {columnVisibility.title && ( - - - - )} + + + Document + + {columnVisibility.document_type && ( - - + Source + )} - {columnVisibility.content && ( - {t("content_summary")} + {columnVisibility.created_by && ( + + User + )} {columnVisibility.created_at && ( - - + Created + )} - + Actions - - {sorted.map((doc, index) => { - const icon = getDocumentTypeIcon(doc.document_type); - const title = doc.title; - const truncatedTitle = title.length > 30 ? `${title.slice(0, 30)}...` : title; - return ( - - - toggleOne(doc.id, !!v)} - aria-label="Select row" - /> - - {columnVisibility.title && ( - - - - - - {icon} - {truncatedTitle} - - - -

{title}

-
-
-
-
- )} - {columnVisibility.document_type && ( - -
- -
-
- )} - {columnVisibility.content && ( - -
-
- {truncate(doc.content)} -
- - {t("view_full")} - - } - /> -
-
- )} - {columnVisibility.created_at && ( - - {new Date(doc.created_at).toLocaleDateString()} - - )} - - { - await onRefresh(); - }} - searchSpaceId={searchSpaceId as string} - /> - -
- ); - })} -
+ {/* Scrollable Body */} +
+ + + {sorted.map((doc, index) => { + const title = doc.title; + const truncatedTitle = title.length > 50 ? `${title.slice(0, 50)}...` : title; + const isSelected = selectedIds.has(doc.id); + return ( + + + toggleOne(doc.id, !!v)} + aria-label="Select row" + className="data-[state=checked]:bg-primary data-[state=checked]:border-primary" + /> + + + { + // Ctrl (Win/Linux) or Cmd (Mac) + Click opens metadata + if (e.ctrlKey || e.metaKey) { + e.preventDefault(); + e.stopPropagation(); + setMetadataDoc(doc); + } + }} + onKeyDown={(e) => { + // Ctrl/Cmd + Enter opens metadata + if ((e.ctrlKey || e.metaKey) && e.key === "Enter") { + e.preventDefault(); + setMetadataDoc(doc); + } + }} + > + {title.length > 50 ? ( + + + {truncatedTitle} + + +

{title}

+
+
+ ) : ( + title + )} + + } + /> +
+ {columnVisibility.document_type && ( + + + + )} + {columnVisibility.created_by && ( + + {doc.created_by_name || "—"} + + )} + {columnVisibility.created_at && ( + + {formatDate(doc.created_at)} + + )} + + { + await onRefresh(); + }} + searchSpaceId={searchSpaceId as string} + /> + +
+ ); + })} +
+
+
-
- {sorted.map((doc) => { - const icon = getDocumentTypeIcon(doc.document_type); + + {/* Mobile Card View */} +
+ {sorted.map((doc, index) => { + const isSelected = selectedIds.has(doc.id); return ( -
-
+ +
toggleOne(doc.id, !!v)} aria-label="Select row" + className="mt-0.5 data-[state=checked]:bg-primary data-[state=checked]:border-primary" /> -
-
- {icon} -
{doc.title}
-
-
- - - {new Date(doc.created_at).toLocaleDateString()} - -
- {columnVisibility.content && ( -
- {truncate(doc.content)} -
- - {t("view_full")} - +
+ { + // Ctrl (Win/Linux) or Cmd (Mac) + Click opens metadata + if (e.ctrlKey || e.metaKey) { + e.preventDefault(); + e.stopPropagation(); + setMetadataDoc(doc); } - /> -
-
- )} + }} + onKeyDown={(e) => { + // Ctrl/Cmd + Enter opens metadata + if ((e.ctrlKey || e.metaKey) && e.key === "Enter") { + e.preventDefault(); + setMetadataDoc(doc); + } + }} + > + {doc.title} + + } + /> +
+ + {columnVisibility.created_by && doc.created_by_name && ( + + {doc.created_by_name} + + )} + {columnVisibility.created_at && ( + + {formatDate(doc.created_at)} + + )} +
-
+
); })}
)} + + {/* Metadata Viewer - opened via Ctrl/Cmd+Click on document title */} + { + if (!open) setMetadataDoc(null); + }} + /> ); } diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/PaginationControls.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/PaginationControls.tsx index d87fa2dc9..bd8a9f1cc 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/PaginationControls.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/PaginationControls.tsx @@ -2,164 +2,89 @@ import { ChevronFirst, ChevronLast, ChevronLeft, ChevronRight } from "lucide-react"; import { motion } from "motion/react"; -import { useTranslations } from "next-intl"; import { Button } from "@/components/ui/button"; -import { Label } from "@/components/ui/label"; -import { Pagination, PaginationContent, PaginationItem } from "@/components/ui/pagination"; -import { - Select, - SelectContent, - SelectItem, - SelectTrigger, - SelectValue, -} from "@/components/ui/select"; + +const PAGE_SIZE = 50; export function PaginationControls({ pageIndex, - pageSize, total, - onPageSizeChange, onFirst, onPrev, onNext, onLast, canPrev, canNext, - id, }: { pageIndex: number; - pageSize: number; total: number; - onPageSizeChange: (size: number) => void; onFirst: () => void; onPrev: () => void; onNext: () => void; onLast: () => void; canPrev: boolean; canNext: boolean; - id: string; }) { - const t = useTranslations("documents"); - const start = total === 0 ? 0 : pageIndex * pageSize + 1; - const end = Math.min((pageIndex + 1) * pageSize, total); + const start = pageIndex * PAGE_SIZE + 1; + const end = Math.min((pageIndex + 1) * PAGE_SIZE, total); return ( -
- - - - + + {/* Range indicator */} + + {start}-{end} of {total} + - -

- - {start}-{end} - {" "} - of {total} -

-
- -
- - - - - - - - - - - - - - - - - - - - - - - - + {/* Navigation buttons */} +
+ + + +
-
+
); } + +export { PAGE_SIZE }; diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx index d9a894e5a..cc6ed3fe8 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx @@ -1,11 +1,10 @@ "use client"; -import { FileText, MoreHorizontal, Pencil, Trash2 } from "lucide-react"; +import { MoreHorizontal, Pencil, Trash2 } from "lucide-react"; import { motion } from "motion/react"; import { useRouter } from "next/navigation"; import { useState } from "react"; import { toast } from "sonner"; -import { JsonMetadataViewer } from "@/components/json-metadata-viewer"; import { AlertDialog, AlertDialogAction, @@ -43,7 +42,6 @@ export function RowActions({ searchSpaceId: string; }) { const [isDeleteOpen, setIsDeleteOpen] = useState(false); - const [isMetadataOpen, setIsMetadataOpen] = useState(false); const [isDeleting, setIsDeleting] = useState(false); const router = useRouter(); @@ -104,29 +102,6 @@ export function RowActions({ )} - - - - - - - -

View Metadata

-
-
- {isDeletable && ( @@ -170,10 +145,6 @@ export function RowActions({ Edit )} - setIsMetadataOpen(true)}> - - Metadata - {isDeletable && ( setIsDeleteOpen(true)} @@ -187,13 +158,6 @@ export function RowActions({
- - diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts index 73b68b588..b52054dcd 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts @@ -8,11 +8,12 @@ export type Document = { content: string; created_at: string; search_space_id: number; + created_by_id?: string | null; + created_by_name?: string | null; }; export type ColumnVisibility = { - title: boolean; document_type: boolean; - content: boolean; + created_by: boolean; created_at: boolean; }; diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx index 52eb3546c..269c2ca2f 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx @@ -2,22 +2,19 @@ import { useQuery } from "@tanstack/react-query"; import { useAtomValue } from "jotai"; -import { RefreshCw, SquarePlus, Upload } from "lucide-react"; import { motion } from "motion/react"; -import { useParams, useRouter } from "next/navigation"; +import { useParams } from "next/navigation"; import { useTranslations } from "next-intl"; -import { useCallback, useEffect, useId, useMemo, useState } from "react"; +import { useCallback, useEffect, useMemo, useState } from "react"; import { toast } from "sonner"; import { deleteDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms"; import { documentTypeCountsAtom } from "@/atoms/documents/document-query.atoms"; -import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; -import { Button } from "@/components/ui/button"; import type { DocumentTypeEnum } from "@/contracts/types/document.types"; 
import { documentsApiService } from "@/lib/apis/documents-api.service"; import { cacheKeys } from "@/lib/query-client/cache-keys"; import { DocumentsFilters } from "./components/DocumentsFilters"; import { DocumentsTableShell, type SortKey } from "./components/DocumentsTableShell"; -import { PaginationControls } from "./components/PaginationControls"; +import { PAGE_SIZE, PaginationControls } from "./components/PaginationControls"; import type { ColumnVisibility } from "./components/types"; function useDebounced(value: T, delay = 250) { @@ -31,29 +28,20 @@ function useDebounced(value: T, delay = 250) { export default function DocumentsTable() { const t = useTranslations("documents"); - const id = useId(); const params = useParams(); - const router = useRouter(); const searchSpaceId = Number(params.search_space_id); - const { openDialog: openUploadDialog } = useDocumentUploadDialog(); - - const handleNewNote = useCallback(() => { - router.push(`/dashboard/${searchSpaceId}/editor/new`); - }, [router, searchSpaceId]); const [search, setSearch] = useState(""); const debouncedSearch = useDebounced(search, 250); const [activeTypes, setActiveTypes] = useState([]); const [columnVisibility, setColumnVisibility] = useState({ - title: true, document_type: true, - content: true, + created_by: true, created_at: true, }); const [pageIndex, setPageIndex] = useState(0); - const [pageSize, setPageSize] = useState(50); - const [sortKey, setSortKey] = useState("title"); - const [sortDesc, setSortDesc] = useState(false); + const [sortKey, setSortKey] = useState("created_at"); + const [sortDesc, setSortDesc] = useState(true); const [selectedIds, setSelectedIds] = useState>(new Set()); const { data: rawTypeCounts } = useAtomValue(documentTypeCountsAtom); const { mutateAsync: deleteDocumentMutation } = useAtomValue(deleteDocumentMutationAtom); @@ -63,10 +51,10 @@ export default function DocumentsTable() { () => ({ search_space_id: searchSpaceId, page: pageIndex, - page_size: pageSize, + 
page_size: PAGE_SIZE, ...(activeTypes.length > 0 && { document_types: activeTypes }), }), - [searchSpaceId, pageIndex, pageSize, activeTypes] + [searchSpaceId, pageIndex, activeTypes] ); // Build search query parameters @@ -74,11 +62,11 @@ export default function DocumentsTable() { () => ({ search_space_id: searchSpaceId, page: pageIndex, - page_size: pageSize, + page_size: PAGE_SIZE, title: debouncedSearch.trim(), ...(activeTypes.length > 0 && { document_types: activeTypes }), }), - [searchSpaceId, pageIndex, pageSize, activeTypes, debouncedSearch] + [searchSpaceId, pageIndex, activeTypes, debouncedSearch] ); // Use query for fetching documents @@ -112,17 +100,14 @@ export default function DocumentsTable() { activeTypes.length === 0 || activeTypes.includes("SURFSENSE_DOCS" as DocumentTypeEnum); // Use query for fetching SurfSense docs - const { - data: surfsenseDocsResponse, - isLoading: isSurfsenseDocsLoading, - refetch: refetchSurfsenseDocs, - } = useQuery({ - queryKey: ["surfsense-docs", debouncedSearch, pageIndex, pageSize], + // eslint-disable-next-line @typescript-eslint/no-unused-vars + const { data: surfsenseDocsResponse } = useQuery({ + queryKey: ["surfsense-docs", debouncedSearch, pageIndex, PAGE_SIZE], queryFn: () => documentsApiService.getSurfsenseDocs({ queryParams: { page: pageIndex, - page_size: pageSize, + page_size: PAGE_SIZE, title: debouncedSearch.trim() || undefined, }, }), @@ -131,7 +116,8 @@ export default function DocumentsTable() { }); // Transform SurfSense docs to match the Document type - const surfsenseDocsAsDocuments: Document[] = useMemo(() => { + // eslint-disable-next-line @typescript-eslint/no-unused-vars + const surfsenseDocsAsDocuments = useMemo(() => { if (!surfsenseDocsResponse?.items) return []; return surfsenseDocsResponse.items.map((doc) => ({ id: doc.id, @@ -145,6 +131,7 @@ export default function DocumentsTable() { }, [surfsenseDocsResponse]); // Merge type counts with SURFSENSE_DOCS count + // eslint-disable-next-line 
@typescript-eslint/no-unused-vars const typeCounts = useMemo(() => { const counts = { ...(rawTypeCounts || {}) }; if (surfsenseDocsResponse?.total) { @@ -165,11 +152,17 @@ export default function DocumentsTable() { // Display results directly const displayDocs = documents; const displayTotal = total; - const pageStart = pageIndex * pageSize; - const pageEnd = Math.min(pageStart + pageSize, displayTotal); + const pageEnd = Math.min((pageIndex + 1) * PAGE_SIZE, displayTotal); const onToggleType = (type: DocumentTypeEnum, checked: boolean) => { - setActiveTypes((prev) => (checked ? [...prev, type] : prev.filter((t) => t !== type))); + setActiveTypes((prev) => { + if (checked) { + // Only add if not already in the array + return prev.includes(type) ? prev : [...prev, type]; + } else { + return prev.filter((t) => t !== type); + } + }); setPageIndex(0); }; @@ -238,10 +231,21 @@ export default function DocumentsTable() { } }; + const handleSortChange = useCallback((key: SortKey) => { + setSortKey((currentKey) => { + if (currentKey === key) { + setSortDesc((v) => !v); + return currentKey; + } + setSortDesc(false); + return key; + }); + }, []); + useEffect(() => { const mq = window.matchMedia("(max-width: 768px)"); const apply = (isSmall: boolean) => { - setColumnVisibility((prev) => ({ ...prev, content: !isSmall, created_at: !isSmall })); + setColumnVisibility((prev) => ({ ...prev, created_by: !isSmall, created_at: !isSmall })); }; apply(mq.matches); const onChange = (e: MediaQueryListEvent) => apply(e.matches); @@ -254,34 +258,9 @@ export default function DocumentsTable() { initial={{ opacity: 0, y: 20 }} animate={{ opacity: 1, y: 0 }} transition={{ duration: 0.3 }} - className="w-full px-6 py-4 space-y-6 min-h-[calc(100vh-64px)]" + className="w-full max-w-7xl mx-auto px-6 pt-17 pb-6 space-y-6 min-h-[calc(100vh-64px)]" > - -
-

{t("title")}

-

{t("subtitle")}

-
-
- - - -
-
- + {/* Filters */} + {/* Table */} { - if (sortKey === key) setSortDesc((v) => !v); - else { - setSortKey(key); - setSortDesc(false); - } - }} + onSortChange={handleSortChange} /> + {/* Pagination */} { - setPageSize(s); - setPageIndex(0); - }} onFirst={() => setPageIndex(0)} onPrev={() => setPageIndex((i) => Math.max(0, i - 1))} onNext={() => setPageIndex((i) => (pageEnd < displayTotal ? i + 1 : i))} - onLast={() => setPageIndex(Math.max(0, Math.ceil(displayTotal / pageSize) - 1))} + onLast={() => setPageIndex(Math.max(0, Math.ceil(displayTotal / PAGE_SIZE) - 1))} canPrev={pageIndex > 0} canNext={pageEnd < displayTotal} - id={id} /> ); diff --git a/surfsense_web/atoms/connector-dialog/connector-dialog.atoms.ts b/surfsense_web/atoms/connector-dialog/connector-dialog.atoms.ts new file mode 100644 index 000000000..38205a8d2 --- /dev/null +++ b/surfsense_web/atoms/connector-dialog/connector-dialog.atoms.ts @@ -0,0 +1,5 @@ +import { atom } from "jotai"; + +// Atom to control the connector dialog open state from anywhere in the app +export const connectorDialogOpenAtom = atom(false); + diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index 9b201e96b..abb32dde1 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -37,7 +37,7 @@ import { AllConnectorsTab } from "./connector-popup/tabs/all-connectors-tab"; import { ConnectorAccountsListView } from "./connector-popup/views/connector-accounts-list-view"; import { YouTubeCrawlerView } from "./connector-popup/views/youtube-crawler-view"; -export const ConnectorIndicator: FC = () => { +export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger = false }) => { const searchSpaceId = useAtomValue(activeSearchSpaceIdAtom); const searchParams = useSearchParams(); const { data: currentUser } = useAtomValue(currentUserAtom); @@ -186,34 +186,36 @@ 
export const ConnectorIndicator: FC = () => { return ( - handleOpenChange(true)} - > - {isLoading ? ( - - ) : ( - <> - - {activeConnectorsCount > 0 && ( - - {activeConnectorsCount > 99 ? "99+" : activeConnectorsCount} - - )} - - )} - + {!hideTrigger && ( + handleOpenChange(true)} + > + {isLoading ? ( + + ) : ( + <> + + {activeConnectorsCount > 0 && ( + + {activeConnectorsCount > 99 ? "99+" : activeConnectorsCount} + + )} + + )} + + )} Manage Connectors diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 639d0f7ed..118ca66ce 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -1,8 +1,9 @@ import { format } from "date-fns"; -import { useAtomValue } from "jotai"; +import { useAtom, useAtomValue } from "jotai"; import { useRouter, useSearchParams } from "next/navigation"; import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; +import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms"; import { createConnectorMutationAtom, deleteConnectorMutationAtom, @@ -49,7 +50,8 @@ export const useConnectorDialog = () => { const { mutateAsync: deleteConnector } = useAtomValue(deleteConnectorMutationAtom); const { mutateAsync: createConnector } = useAtomValue(createConnectorMutationAtom); - const [isOpen, setIsOpen] = useState(false); + // Use global atom for dialog open state so it can be controlled from anywhere + const [isOpen, setIsOpen] = useAtom(connectorDialogOpenAtom); const [activeTab, setActiveTab] = useState("all"); const [connectingId, setConnectingId] = useState(null); const [isScrolled, setIsScrolled] = useState(false); From 878e829bdc8da815a075598082202f0a751306b0 Mon Sep 17 00:00:00 2001 From: Anish Sarkar 
<104695310+AnishSarkar22@users.noreply.github.com> Date: Wed, 4 Feb 2026 17:19:29 +0530 Subject: [PATCH 02/36] feat: enhance document filters and table components with search functionality and improved loading states --- .../(manage)/components/DocumentTypeIcon.tsx | 10 +- .../(manage)/components/DocumentsFilters.tsx | 255 ++++++++------ .../components/DocumentsTableShell.tsx | 321 +++++++++++++----- .../(manage)/components/RowActions.tsx | 161 ++++----- 4 files changed, 480 insertions(+), 267 deletions(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx index 246cff1c0..b5d434e92 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx @@ -4,8 +4,8 @@ import type React from "react"; import { getConnectorIcon } from "@/contracts/enums/connectorIcons"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; -export function getDocumentTypeIcon(type: string): React.ReactNode { - return getConnectorIcon(type); +export function getDocumentTypeIcon(type: string, className?: string): React.ReactNode { + return getConnectorIcon(type, className); } export function getDocumentTypeLabel(type: string): string { @@ -18,7 +18,7 @@ export function getDocumentTypeLabel(type: string): string { const MAX_LABEL_LENGTH = 28; export function DocumentTypeChip({ type, className }: { type: string; className?: string }) { - const icon = getDocumentTypeIcon(type); + const icon = getDocumentTypeIcon(type, "h-4 w-4"); const fullLabel = getDocumentTypeLabel(type); const truncatedLabel = fullLabel.length > MAX_LABEL_LENGTH ? 
`${fullLabel.slice(0, MAX_LABEL_LENGTH)}...` @@ -27,9 +27,9 @@ export function DocumentTypeChip({ type, className }: { type: string; className? const chip = ( - {icon} + {icon} {truncatedLabel} ); diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx index 87d349e38..2c3dc7eef 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx @@ -7,12 +7,14 @@ import { Columns3, FilePlus2, FileType, + ListFilter, + Search, SlidersHorizontal, Trash, } from "lucide-react"; import { motion } from "motion/react"; import { useTranslations } from "next-intl"; -import React, { useMemo, useRef } from "react"; +import React, { useMemo, useRef, useState } from "react"; import { connectorDialogOpenAtom } from "@/atoms/connector-dialog/connector-dialog.atoms"; import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; import { @@ -64,10 +66,20 @@ export function DocumentsFilters({ const { openDialog: openUploadDialog } = useDocumentUploadDialog(); const setConnectorDialogOpen = useSetAtom(connectorDialogOpenAtom); + const [typeSearchQuery, setTypeSearchQuery] = useState(""); + const uniqueTypes = useMemo(() => { return Object.keys(typeCountsRecord).sort() as DocumentTypeEnum[]; }, [typeCountsRecord]); + const filteredTypes = useMemo(() => { + if (!typeSearchQuery.trim()) return uniqueTypes; + const query = typeSearchQuery.toLowerCase(); + return uniqueTypes.filter((type) => + getDocumentTypeLabel(type).toLowerCase().includes(query) + ); + }, [uniqueTypes, typeSearchQuery]); + const typeCounts = useMemo(() => { const map = new Map(); for (const [type, count] of Object.entries(typeCountsRecord)) { @@ -117,10 +129,13 @@ export function 
DocumentsFilters({ animate={{ opacity: 1, y: 0 }} transition={{ type: "spring", stiffness: 300, damping: 30 }} > +
+
onSearch(e.target.value)} placeholder="Filter by title" @@ -148,74 +163,94 @@ export function DocumentsFilters({ {/* Filter Buttons Group */}
- {/* Type Filter */} - - - - - -
-
- Filter by source + {/* Type Filter */} + + + + + +
+ {/* Search input */} +
+
+ + setTypeSearchQuery(e.target.value)} + className="h-6 pl-6 text-sm bg-transparent border-0 focus-visible:ring-0" + />
-
- {uniqueTypes.map((value: DocumentTypeEnum, i) => ( +
+ +
+ {filteredTypes.length === 0 ? ( +
+ No types found +
+ ) : ( + filteredTypes.map((value: DocumentTypeEnum, i) => ( - ))} -
- {activeTypes.length > 0 && ( -
- -
+ )) )}
- - + {activeTypes.length > 0 && ( +
+ +
+ )} +
+
+
{/* View/Columns Popover */} @@ -266,57 +301,69 @@ export function DocumentsFilters({
-
- {/* Bulk Delete Button */} - {selectedIds.size > 0 && ( - - - - - - - -
- - - Cancel - - Delete - - - - - )} + + Cancel + + Delete + + + + + )} +
); diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index faa7605a3..f23893fbe 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -1,16 +1,17 @@ "use client"; -import { ChevronDown, ChevronUp, FileX, Plus } from "lucide-react"; +import { formatDistanceToNow } from "date-fns"; +import { Calendar, ChevronDown, ChevronUp, FileText, FileX, Link2, Plus, User } from "lucide-react"; import { motion } from "motion/react"; import { useParams } from "next/navigation"; import { useTranslations } from "next-intl"; -import React, { useState } from "react"; +import React, { useRef, useState, useEffect } from "react"; import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup"; import { DocumentViewer } from "@/components/document-viewer"; import { JsonMetadataViewer } from "@/components/json-metadata-viewer"; import { Button } from "@/components/ui/button"; import { Checkbox } from "@/components/ui/checkbox"; -import { Spinner } from "@/components/ui/spinner"; +import { Skeleton } from "@/components/ui/skeleton"; import { Table, TableBody, @@ -37,35 +38,82 @@ function sortDocuments(docs: Document[], key: SortKey, desc: boolean): Document[ return desc ? 
sorted.reverse() : sorted; } -function formatDate(dateStr: string): string { +function formatRelativeDate(dateStr: string): string { + return formatDistanceToNow(new Date(dateStr), { addSuffix: true }); +} + +function formatAbsoluteDate(dateStr: string): string { const date = new Date(dateStr); - return date.toLocaleDateString("en-US", { + return date.toLocaleString("en-US", { year: "numeric", month: "long", day: "numeric", + hour: "2-digit", + minute: "2-digit", + hour12: false, }); } +function TruncatedText({ text, className }: { text: string; className?: string }) { + const textRef = useRef(null); + const [isTruncated, setIsTruncated] = useState(false); + + useEffect(() => { + const checkTruncation = () => { + if (textRef.current) { + setIsTruncated(textRef.current.scrollWidth > textRef.current.clientWidth); + } + }; + checkTruncation(); + window.addEventListener("resize", checkTruncation); + return () => window.removeEventListener("resize", checkTruncation); + }, []); + + if (isTruncated) { + return ( + + + + {text} + + + +

{text}

+
+
+ ); + } + + return ( + + {text} + + ); +} + function SortableHeader({ children, sortKey, currentSortKey, sortDesc, onSort, + icon, }: { children: React.ReactNode; sortKey: SortKey; currentSortKey: SortKey; sortDesc: boolean; onSort: (key: SortKey) => void; + icon?: React.ReactNode; }) { const isActive = currentSortKey === sortKey; return (
@@ -649,6 +660,7 @@ export function DocumentsTableShell({
{sorted.map((doc, index) => { const isSelected = selectedIds.has(doc.id); + const canSelect = isSelectable(doc); return ( toggleOne(doc.id, !!v)} - aria-label="Select row" - className="border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary" + onCheckedChange={(v) => canSelect && toggleOne(doc.id, !!v)} + disabled={!canSelect} + aria-label={canSelect ? "Select row" : "Cannot select while processing"} + className={`border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary ${!canSelect ? "opacity-40 cursor-not-allowed" : ""}`} />
- Processing... + Syncing ); case "failed": diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx index 867fdc916..4133f2960 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx @@ -119,7 +119,7 @@ export function RowActions({
{activeTypes.length > 0 && ( -
+
- Pending - waiting to be processed + Pending - waiting to be synced ); case "processing": @@ -191,7 +191,6 @@ export function DocumentsTableShell({ documents, loading, error, - onRefresh, selectedIds, setSelectedIds, columnVisibility, @@ -204,7 +203,6 @@ export function DocumentsTableShell({ documents: Document[]; loading: boolean; error: boolean; - onRefresh: () => Promise; selectedIds: Set; setSelectedIds: (update: Set) => void; columnVisibility: ColumnVisibility; @@ -361,10 +359,15 @@ export function DocumentsTableShell({ )} {columnVisibility.created_at && ( - + )} + {columnVisibility.status && ( + + + + )} Actions @@ -401,10 +404,15 @@ export function DocumentsTableShell({ )} {columnVisibility.created_at && ( - + )} + {columnVisibility.status && ( + + + + )} @@ -435,23 +443,26 @@ export function DocumentsTableShell({ )}
- +
+ {columnVisibility.status && ( + + )} + +
))}
) : error ? ( -
+
+

{t("error_loading")}

-
) : sorted.length === 0 ? ( -
+
{ - if (isRefreshing) return; - setIsRefreshing(true); - try { - if (isSearchMode) { - await refetchSearch(); - } - // Real-time view doesn't need manual refresh - Electric handles it - toast.success(t("refresh_success") || "Documents refreshed"); - } finally { - setIsRefreshing(false); - } - }, [isSearchMode, refetchSearch, t, isRefreshing]); - const onBulkDelete = async () => { if (selectedIds.size === 0) { toast.error(t("no_rows_selected")); @@ -293,7 +277,6 @@ export default function DocumentsTable() { documents={displayDocs} loading={!!loading} error={!!error} - onRefresh={refreshCurrentView} selectedIds={selectedIds} setSelectedIds={setSelectedIds} columnVisibility={columnVisibility} diff --git a/surfsense_web/contracts/enums/connectorIcons.tsx b/surfsense_web/contracts/enums/connectorIcons.tsx index aaf476215..18a872d94 100644 --- a/surfsense_web/contracts/enums/connectorIcons.tsx +++ b/surfsense_web/contracts/enums/connectorIcons.tsx @@ -92,7 +92,7 @@ export const getConnectorIcon = (connectorType: EnumConnectorName | string, clas case "FILE": return ; case "GOOGLE_DRIVE_FILE": - return ; + return Google Drive; case "COMPOSIO_GOOGLE_DRIVE_CONNECTOR": return Google Drive; case "COMPOSIO_GMAIL_CONNECTOR": diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index 75b186420..0dcf44776 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -328,7 +328,6 @@ "filter_placeholder": "Filter by title...", "rows_per_page": "Rows per page", "refresh": "Refresh", - "refresh_success": "Documents refreshed", "upload_documents": "Upload Documents", "create_shared_note": "Create Shared Note", "processing_documents": "Processing documents...", diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json index 81121ef3e..bf5961fa7 100644 --- a/surfsense_web/messages/zh.json +++ b/surfsense_web/messages/zh.json @@ -313,7 +313,6 @@ "filter_placeholder": "按标题筛选...", "rows_per_page": "每页行数", "refresh": "刷新", - 
"refresh_success": "文档已刷新", "upload_documents": "上传文档", "create_shared_note": "创建共享笔记", "processing_documents": "正在处理文档...", From 2077344934600197ae21db895fe3b37253d06934 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 02:59:21 +0530 Subject: [PATCH 21/36] feat: implement two-phase document indexing for Linear and Slack connectors with real-time status updates --- .../connector_indexers/linear_indexer.py | 318 ++++++++++-------- .../tasks/connector_indexers/slack_indexer.py | 249 +++++++++----- 2 files changed, 337 insertions(+), 230 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py index c28f151ca..45e1e357a 100644 --- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py @@ -1,5 +1,9 @@ """ Linear connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time @@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.linear_connector import LinearConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -28,6 +32,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -196,6 +201,7 @@ async def index_linear_issues( # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 + 
documents_failed = 0 # Track issues that failed processing skipped_issues = [] # Heartbeat tracking - update notification periodically to prevent appearing stuck @@ -207,16 +213,14 @@ async def index_linear_issues( {"stage": "process_issues", "total_issues": len(issues)}, ) - # Process each issue - for issue in issues: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Analyze all issues, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + issues_to_process = [] # List of dicts with document and issue data + new_documents_created = False + for issue in issues: try: issue_id = issue.get("id", "") issue_identifier = issue.get("identifier", "") @@ -262,78 +266,35 @@ async def index_linear_issues( state = formatted_issue.get("state", "Unknown") description = formatted_issue.get("description", "") comment_count = len(formatted_issue.get("comments", [])) + priority = formatted_issue.get("priority", "Unknown") if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() logger.info( f"Document for Linear issue {issue_identifier} unchanged. Skipping." ) documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Linear issue {issue_identifier}. Updating document." 
- ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "issue_id": issue_identifier, - "issue_title": issue_title, - "state": state, - "priority": formatted_issue.get("priority", "Unknown"), - "comment_count": comment_count, - "document_type": "Linear Issue", - "connector_type": "Linear", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - issue_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - if description and len(description) > 1000: - description = description[:997] + "..." - summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n" - if description: - summary_content += f"Description: {description}\n\n" - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(issue_content) - - # Update existing document - existing_document.title = f"{issue_identifier}: {issue_title}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "issue_id": issue_id, - "issue_identifier": issue_identifier, - "issue_title": issue_title, - "state": state, - "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info( - f"Successfully updated Linear issue {issue_identifier}" - ) - continue + # Queue existing document for update (will be set to processing in Phase 2) + issues_to_process.append({ + 'document': existing_document, + 'is_new': False, + 'issue_content': issue_content, + 'content_hash': content_hash, + 
'issue_id': issue_id, + 'issue_identifier': issue_identifier, + 'issue_title': issue_title, + 'state': state, + 'description': description, + 'comment_count': comment_count, + 'priority': priority, + }) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -351,48 +312,7 @@ async def index_linear_issues( documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "issue_id": issue_identifier, - "issue_title": issue_title, - "state": state, - "priority": formatted_issue.get("priority", "Unknown"), - "comment_count": comment_count, - "document_type": "Linear Issue", - "connector_type": "Linear", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - issue_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - # Truncate description if it's too long for the summary - if description and len(description) > 1000: - description = description[:997] + "..." 
- summary_content = f"Linear Issue {issue_identifier}: {issue_title}\n\nStatus: {state}\n\n" - if description: - summary_content += f"Description: {description}\n\n" - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - using the full issue content with comments - chunks = await create_document_chunks(issue_content) - - # Create and store new document - logger.info( - f"Creating new document for issue {issue_identifier} - {issue_title}" - ) + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, title=f"{issue_identifier}: {issue_title}", @@ -403,25 +323,119 @@ async def index_linear_issues( "issue_title": issue_title, "state": state, "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 - logger.info( - f"Successfully indexed new issue {issue_identifier} - {issue_title}" + new_documents_created = True + + issues_to_process.append({ + 'document': document, + 'is_new': True, + 'issue_content': issue_content, + 'content_hash': content_hash, + 'issue_id': issue_id, + 'issue_identifier': issue_identifier, + 'issue_title': issue_title, + 'state': state, + 'description': description, + 'comment_count': comment_count, + 'priority': priority, + }) + + except Exception as e: + 
logger.error(f"Error in Phase 1 for issue: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(issues_to_process)} documents") + + for item in issues_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item['document'] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id ) - # Batch commit every 10 documents + if user_llm: + document_metadata_for_summary = { + "issue_id": item['issue_identifier'], + "issue_title": item['issue_title'], + "state": item['state'], + "priority": item['priority'], + "comment_count": item['comment_count'], + "document_type": "Linear Issue", + "connector_type": "Linear", + } + summary_content, summary_embedding = await generate_document_summary( + item['issue_content'], user_llm, document_metadata_for_summary + ) + else: + # Fallback to simple summary if no LLM configured + description = item['description'] + if description and len(description) > 1000: + description = description[:997] + "..." 
+ summary_content = f"Linear Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['state']}\n\n" + if description: + summary_content += f"Description: {description}\n\n" + summary_content += f"Comments: {item['comment_count']}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(item['issue_content']) + + # Update document to READY with actual content + document.title = f"{item['issue_identifier']}: {item['issue_title']}" + document.content = summary_content + document.content_hash = item['content_hash'] + document.embedding = summary_embedding + document.document_metadata = { + "issue_id": item['issue_id'], + "issue_identifier": item['issue_identifier'], + "issue_title": item['issue_title'], + "state": item['state'], + "comment_count": item['comment_count'], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Linear issues processed so far" @@ -430,44 +444,68 @@ async def index_linear_issues( except Exception as e: logger.error( - f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}", + f"Error processing issue {item.get('issue_identifier', 'Unknown')}: {e!s}", exc_info=True, ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") skipped_issues.append( - f"{issue.get('identifier', 'Unknown')} (processing error)" + f"{item.get('issue_identifier', 'Unknown')} (processing error)" ) - 
documents_skipped += 1 - continue # Skip this issue and continue with others + documents_failed += 1 + continue - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info(f"Final commit: Total {documents_indexed} Linear issues processed") - await session.commit() - logger.info("Successfully committed all Linear document changes to database") + try: + await session.commit() + logger.info("Successfully committed all Linear document changes to database") + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same issue was indexed by multiple connectors. " + f"Rolling back and continuing. 
Error: {e!s}" + ) + await session.rollback() + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Linear indexing for connector {connector_id}", { - "issues_processed": total_processed, + "issues_processed": documents_indexed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "skipped_issues_count": len(skipped_issues), }, ) logger.info( - f"Linear indexing completed: {documents_indexed} new issues, {documents_skipped} skipped" + f"Linear indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed" ) - return ( - total_processed, - None, - ) # Return None as the error message to indicate success + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py index 010d1eff4..61faa39b3 100644 --- a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py @@ -1,5 +1,9 @@ """ Slack connector indexer. 
+ +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time @@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.slack_history import SlackHistory -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, @@ -28,6 +32,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -168,11 +173,15 @@ async def index_slack_messages( f"No Slack channels found for connector {connector_id}", {"channels_found": 0}, ) - return 0, "No Slack channels found" + # CRITICAL: Update timestamp even when no channels found so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() + return 0, None # Return None (not error) when no channels found # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 # Track messages that failed processing skipped_channels = [] # Heartbeat tracking - update notification periodically to prevent appearing stuck @@ -184,15 +193,14 @@ async def index_slack_messages( {"stage": "process_channels", "total_channels": len(channels)}, ) - # Process each channel + # ======================================================================= + # PHASE 1: Collect all messages from all channels, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + messages_to_process = [] # List of dicts 
with document and message data + new_documents_created = False + for channel_obj in channels: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() channel_id = channel_obj["id"] channel_name = channel_obj["name"] is_private = channel_obj["is_private"] @@ -305,47 +313,29 @@ async def index_slack_messages( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() logger.info( f"Document for Slack message {msg_ts} in channel {channel_name} unchanged. Skipping." ) documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Slack message {msg_ts} in channel {channel_name}. Updating document." 
- ) - # Update chunks and embedding - chunks = await create_document_chunks( - combined_document_string - ) - doc_embedding = config.embedding_model_instance.embed( - combined_document_string - ) - - # Update existing document - existing_document.content = combined_document_string - existing_document.content_hash = content_hash - existing_document.embedding = doc_embedding - existing_document.document_metadata = { - "channel_name": channel_name, - "channel_id": channel_id, - "start_date": start_date_str, - "end_date": end_date_str, - "message_count": len(formatted_messages), - "indexed_at": datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), - } - - # Delete old chunks and add new ones - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info(f"Successfully updated Slack message {msg_ts}") - continue + # Queue existing document for update (will be set to processing in Phase 2) + messages_to_process.append({ + 'document': existing_document, + 'is_new': False, + 'combined_document_string': combined_document_string, + 'content_hash': content_hash, + 'channel_name': channel_name, + 'channel_id': channel_id, + 'msg_ts': msg_ts, + 'start_date': start_date_str, + 'end_date': end_date_str, + 'message_count': len(formatted_messages), + }) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -363,14 +353,7 @@ async def index_slack_messages( documents_skipped += 1 continue - # Document doesn't exist - create new one - # Process chunks - chunks = await create_document_chunks(combined_document_string) - doc_embedding = config.embedding_model_instance.embed( - combined_document_string - ) - - # Create and store new document + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, title=channel_name, @@ -378,33 +361,37 @@ async def 
index_slack_messages( document_metadata={ "channel_name": channel_name, "channel_id": channel_id, - "start_date": start_date_str, - "end_date": end_date_str, - "message_count": len(formatted_messages), - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "msg_ts": msg_ts, + "connector_id": connector_id, }, - content=combined_document_string, - embedding=doc_embedding, - chunks=chunks, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 + new_documents_created = True - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} Slack channels processed so far" - ) - await session.commit() + messages_to_process.append({ + 'document': document, + 'is_new': True, + 'combined_document_string': combined_document_string, + 'content_hash': content_hash, + 'channel_name': channel_name, + 'channel_id': channel_id, + 'msg_ts': msg_ts, + 'start_date': start_date_str, + 'end_date': end_date_str, + 'message_count': len(formatted_messages), + }) logger.info( - f"Successfully indexed new channel {channel_name} with {len(formatted_messages)} messages" + f"Phase 1: Collected {len(formatted_messages)} messages from channel {channel_name}" ) except SlackApiError as slack_error: @@ -420,43 +407,125 @@ async def index_slack_messages( documents_skipped += 1 continue # Skip this channel and continue with others - # Update the last_indexed_at timestamp for the connector only if requested - # and if we successfully indexed at least one channel - total_processed = documents_indexed - if 
total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(messages_to_process)} documents") + + for item in messages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item['document'] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (embeddings, chunks) + chunks = await create_document_chunks(item['combined_document_string']) + doc_embedding = config.embedding_model_instance.embed( + item['combined_document_string'] + ) + + # Update document to READY with actual content + document.title = item['channel_name'] + document.content = item['combined_document_string'] + document.content_hash = item['content_hash'] + document.embedding = doc_embedding + document.document_metadata = { + "channel_name": item['channel_name'], + "channel_id": item['channel_id'], + "start_date": item['start_date'], + "end_date": item['end_date'], + "message_count": item['message_count'], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + 
document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Slack messages processed so far" + ) + await session.commit() + + except Exception as e: + logger.error( + f"Error processing Slack message {item.get('msg_ts', 'Unknown')}: {e!s}", + exc_info=True, + ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") + documents_failed += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches - logger.info(f"Final commit: Total {documents_indexed} Slack channels processed") - await session.commit() + logger.info(f"Final commit: Total {documents_indexed} Slack messages processed") + try: + await session.commit() + logger.info("Successfully committed all Slack document changes to database") + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same message was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise - # Prepare result message - result_message = None - if skipped_channels: - result_message = f"Processed {total_processed} channels. 
Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}" - else: - result_message = f"Processed {total_processed} channels." + # Build warning message if there were issues + warning_parts = [] + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Slack indexing for connector {connector_id}", { - "channels_processed": total_processed, + "channels_processed": len(channels), "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "skipped_channels_count": len(skipped_channels), - "result_message": result_message, }, ) logger.info( - f"Slack indexing completed: {documents_indexed} new channels, {documents_skipped} skipped" + f"Slack indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed" ) - return ( - total_processed, - None, - ) # Return None on success (result_message is for logging only) + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() From 0249ea20a5df4bdfdf890a2de66bac2f8a33a316 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 03:42:03 +0530 Subject: [PATCH 22/36] feat: implement two-phase document indexing for Discord and Teams connectors with real-time status updates --- .../connector_indexers/discord_indexer.py | 384 +++++++++++------- .../tasks/connector_indexers/teams_indexer.py | 291 +++++++------ 2 files changed, 400 insertions(+), 275 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py index f9a6918a7..e5f333531 100644 --- a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py +++ 
b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py @@ -1,5 +1,9 @@ """ Discord connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import asyncio @@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.discord_connector import DiscordConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, @@ -27,6 +31,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -48,7 +53,11 @@ async def index_discord_messages( on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, str | None]: """ - Index Discord messages from all accessible channels. + Index Discord messages from the configured guild's channels. 
+ + Implements 2-phase document status updates for real-time UI feedback: + - Phase 1: Create all documents with 'pending' status (visible in UI immediately) + - Phase 2: Process each document: pending → processing → ready/failed Args: session: Database session @@ -113,6 +122,37 @@ async def index_discord_messages( logger.info(f"Starting Discord indexing for connector {connector_id}") + # ======================================================================= + # GUILD FILTERING: Only index the specific guild configured for this connector + # ======================================================================= + # Extract guild_id from connector config (set during OAuth flow) + configured_guild_id = connector.config.get("guild_id") + configured_guild_name = connector.config.get("guild_name") + + # Legacy connector check - if no guild_id, we need to warn and handle gracefully + is_legacy_connector = configured_guild_id is None + + if is_legacy_connector: + logger.warning( + f"Discord connector {connector_id} has no guild_id configured. " + "This is a legacy connector. Please reconnect the Discord server to fix this. " + "For now, indexing will be skipped to prevent indexing unwanted servers." + ) + await task_logger.log_task_failure( + log_entry, + f"Legacy Discord connector {connector_id} missing guild_id", + "No guild_id configured. Please reconnect this Discord server.", + {"error_type": "MissingGuildId", "is_legacy": True}, + ) + return ( + 0, + "This Discord connector needs to be reconnected. 
Please disconnect and reconnect your Discord server to enable indexing.", + ) + + logger.info( + f"Configured to index guild: {configured_guild_name} ({configured_guild_id})" + ) + # Initialize Discord client with OAuth credentials support await task_logger.log_task_progress( log_entry, @@ -255,77 +295,68 @@ async def index_discord_messages( try: await task_logger.log_task_progress( log_entry, - f"Starting Discord bot and fetching guilds for connector {connector_id}", - {"stage": "fetch_guilds"}, + f"Starting Discord bot for connector {connector_id}", + {"stage": "bot_initialization"}, ) - logger.info("Starting Discord bot to fetch guilds") + logger.info("Starting Discord bot") discord_client._bot_task = asyncio.create_task(discord_client.start_bot()) await discord_client._wait_until_ready() - logger.info("Fetching Discord guilds") - guilds = await discord_client.get_guilds() - logger.info(f"Found {len(guilds)} guilds") + # We only process the configured guild, not all guilds + logger.info( + f"Processing configured guild only: {configured_guild_name} ({configured_guild_id})" + ) + except Exception as e: await task_logger.log_task_failure( log_entry, - f"Failed to get Discord guilds for connector {connector_id}", + f"Failed to start Discord bot for connector {connector_id}", str(e), - {"error_type": "GuildFetchError"}, + {"error_type": "BotStartError"}, ) - logger.error(f"Failed to get Discord guilds: {e!s}", exc_info=True) + logger.error(f"Failed to start Discord bot: {e!s}", exc_info=True) await discord_client.close_bot() - return 0, f"Failed to get Discord guilds: {e!s}" - - if not guilds: - await task_logger.log_task_success( - log_entry, - f"No Discord guilds found for connector {connector_id}", - {"guilds_found": 0}, - ) - logger.info("No Discord guilds found to index") - await discord_client.close_bot() - return 0, "No Discord guilds found" + return 0, f"Failed to start Discord bot: {e!s}" # Track results documents_indexed = 0 documents_skipped = 0 + 
documents_failed = 0 + duplicate_content_count = 0 skipped_channels: list[str] = [] # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() - # Process each guild and channel + # Use the configured guild info + guild_id = configured_guild_id + guild_name = configured_guild_name or "Unknown Guild" + await task_logger.log_task_progress( log_entry, - f"Starting to process {len(guilds)} Discord guilds", - {"stage": "process_guilds", "total_guilds": len(guilds)}, + f"Processing Discord guild: {guild_name}", + {"stage": "process_guild", "guild_id": guild_id, "guild_name": guild_name}, ) + # ======================================================================= + # PHASE 1: Collect all messages and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + messages_to_process = [] # List of dicts with document and message data + new_documents_created = False + try: - for guild in guilds: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) - >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() - guild_id = guild["id"] - guild_name = guild["name"] - logger.info(f"Processing guild: {guild_name} ({guild_id})") - - try: - channels = await discord_client.get_text_channels(guild_id) - if not channels: - logger.info( - f"No channels found in guild {guild_name}. Skipping." - ) - skipped_channels.append(f"{guild_name} (no channels)") - documents_skipped += 1 - continue + logger.info(f"Processing guild: {guild_name} ({guild_id})") + try: + channels = await discord_client.get_text_channels(guild_id) + if not channels: + logger.info( + f"No channels found in guild {guild_name}. Skipping." 
+ ) + skipped_channels.append(f"{guild_name} (no channels)") + else: for channel in channels: channel_id = channel["id"] channel_name = channel["name"] @@ -343,14 +374,12 @@ async def index_discord_messages( skipped_channels.append( f"{guild_name}#{channel_name} (fetch error)" ) - documents_skipped += 1 continue if not messages: logger.info( f"No messages found in channel {channel_name} for the specified date range." ) - documents_skipped += 1 continue # Filter/format messages @@ -365,7 +394,6 @@ async def index_discord_messages( logger.info( f"No valid messages found in channel {channel_name} after filtering." ) - documents_skipped += 1 continue # Process each message as an individual document (like Slack) @@ -427,55 +455,27 @@ async def index_discord_messages( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - f"Document for Discord message {msg_id} in {guild_name}#{channel_name} unchanged. Skipping." - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Discord message {msg_id} in {guild_name}#{channel_name}. Updating document." 
- ) - # Update chunks and embedding - chunks = await create_document_chunks( - combined_document_string - ) - doc_embedding = ( - config.embedding_model_instance.embed( - combined_document_string - ) - ) - - # Update existing document - existing_document.content = combined_document_string - existing_document.content_hash = content_hash - existing_document.embedding = doc_embedding - existing_document.document_metadata = { - "guild_name": guild_name, - "guild_id": guild_id, - "channel_name": channel_name, - "channel_id": channel_id, - "message_id": msg_id, - "message_timestamp": msg_timestamp, - "message_user_name": msg_user_name, - "indexed_at": datetime.now(UTC).strftime( - "%Y-%m-%d %H:%M:%S" - ), - } - - # Delete old chunks and add new ones - existing_document.chunks = chunks - existing_document.updated_at = ( - get_current_timestamp() - ) - - documents_indexed += 1 - logger.info( - f"Successfully updated Discord message {msg_id}" - ) - continue + # Queue existing document for update (will be set to processing in Phase 2) + messages_to_process.append({ + 'document': existing_document, + 'is_new': False, + 'combined_document_string': combined_document_string, + 'content_hash': content_hash, + 'guild_name': guild_name, + 'guild_id': guild_id, + 'channel_name': channel_name, + 'channel_id': channel_id, + 'message_id': msg_id, + 'message_timestamp': msg_timestamp, + 'message_user_name': msg_user_name, + }) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -492,19 +492,11 @@ async def index_discord_messages( f"(existing document ID: {duplicate_by_content.id}, " f"type: {duplicate_by_content.document_type}). Skipping." 
) + duplicate_content_count += 1 documents_skipped += 1 continue - # Document doesn't exist - create new one - # Process chunks - chunks = await create_document_chunks( - combined_document_string - ) - doc_embedding = config.embedding_model_instance.embed( - combined_document_string - ) - - # Create and store new document + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, title=f"{guild_name}#{channel_name}", @@ -515,87 +507,171 @@ async def index_discord_messages( "channel_name": channel_name, "channel_id": channel_id, "message_id": msg_id, - "message_timestamp": msg_timestamp, - "message_user_name": msg_user_name, - "indexed_at": datetime.now(UTC).strftime( - "%Y-%m-%d %H:%M:%S" - ), + "connector_id": connector_id, }, - content=combined_document_string, - embedding=doc_embedding, - chunks=chunks, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 + new_documents_created = True - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} Discord messages processed so far" - ) - await session.commit() + messages_to_process.append({ + 'document': document, + 'is_new': True, + 'combined_document_string': combined_document_string, + 'content_hash': content_hash, + 'guild_name': guild_name, + 'guild_id': guild_id, + 'channel_name': channel_name, + 'channel_id': channel_id, + 'message_id': msg_id, + 'message_timestamp': msg_timestamp, + 'message_user_name': msg_user_name, + }) - logger.info( - 
f"Successfully indexed channel {guild_name}#{channel_name} with {len(formatted_messages)} messages" - ) + except Exception as e: + logger.error( + f"Error processing guild {guild_name}: {e!s}", exc_info=True + ) + skipped_channels.append(f"{guild_name} (processing error)") - except Exception as e: - logger.error( - f"Error processing guild {guild_name}: {e!s}", exc_info=True - ) - skipped_channels.append(f"{guild_name} (processing error)") - documents_skipped += 1 - continue finally: await discord_client.close_bot() - # Update last_indexed_at only if we indexed at least one - if documents_indexed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(messages_to_process)} documents") + + for item in messages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item['document'] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (embeddings, chunks) + chunks = await create_document_chunks(item['combined_document_string']) + doc_embedding = config.embedding_model_instance.embed( + item['combined_document_string'] + ) + + # Update document to READY with actual content + 
document.title = f"{item['guild_name']}#{item['channel_name']}" + document.content = item['combined_document_string'] + document.content_hash = item['content_hash'] + document.embedding = doc_embedding + document.document_metadata = { + "guild_name": item['guild_name'], + "guild_id": item['guild_id'], + "channel_name": item['channel_name'], + "channel_id": item['channel_id'], + "message_id": item['message_id'], + "message_timestamp": item['message_timestamp'], + "message_user_name": item['message_user_name'], + "indexed_at": datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} Discord messages processed so far" + ) + await session.commit() + + except Exception as e: + logger.error(f"Error processing Discord message: {e!s}", exc_info=True) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") + documents_failed += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info( f"Final commit: Total {documents_indexed} Discord messages processed" ) - await session.commit() - - # Prepare result message - result_message = None - if skipped_channels: - result_message = ( - f"Processed {documents_indexed} messages. 
Skipped {len(skipped_channels)} channels: " - + ", ".join(skipped_channels) + try: + await session.commit() + logger.info( + "Successfully committed all Discord document changes to database" ) - else: - result_message = f"Processed {documents_indexed} messages." + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + if skipped_channels: + warning_parts.append(f"{len(skipped_channels)} channels skipped") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Discord indexing for connector {connector_id}", { - "messages_processed": documents_indexed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, "skipped_channels_count": len(skipped_channels), - "guilds_processed": len(guilds), - "result_message": result_message, + "guild_id": guild_id, + "guild_name": guild_name, }, ) logger.info( - f"Discord indexing completed: {documents_indexed} new messages, {documents_skipped} skipped" + f"Discord indexing completed for guild {guild_name}: {documents_indexed} ready, {documents_skipped} skipped, " + f"{documents_failed} failed ({duplicate_content_count} duplicate content)" ) - return ( - documents_indexed, - None, - ) # Return None on success (result_message is for 
logging only) + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py index d42c5b7f1..27259fd6f 100644 --- a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py @@ -1,17 +1,21 @@ """ Microsoft Teams connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time from collections.abc import Awaitable, Callable -from datetime import UTC +from datetime import UTC, datetime from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.teams_history import TeamsHistory -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, @@ -27,6 +31,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -50,6 +55,10 @@ async def index_teams_messages( """ Index Microsoft Teams messages from all accessible teams and channels. 
+ Implements 2-phase document status updates for real-time UI feedback: + - Phase 1: Create all documents with 'pending' status (visible in UI immediately) + - Phase 2: Process each document: pending → processing → ready/failed + Args: session: Database session connector_id: ID of the Teams connector @@ -165,11 +174,16 @@ async def index_teams_messages( f"No Teams found for connector {connector_id}", {"teams_found": 0}, ) - return 0, "No Teams found" + # CRITICAL: Update timestamp even when no teams found so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) + await session.commit() + return 0, None # Return None (not error) when no items found # Track the number of documents indexed documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 + duplicate_content_count = 0 skipped_channels = [] # Heartbeat tracking - update notification periodically to prevent appearing stuck @@ -182,8 +196,6 @@ async def index_teams_messages( ) # Convert date strings to datetime objects for filtering - from datetime import datetime - start_datetime = None end_datetime = None if start_date_str: @@ -197,16 +209,14 @@ async def index_teams_messages( hour=23, minute=59, second=59, tzinfo=UTC ) - # Process each team - for team in teams: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Collect all messages and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + messages_to_process = [] # List of dicts with document and message data + new_documents_created = False + for team in teams: team_id = team.get("id") team_name = 
team.get("displayName", "Unknown Team") @@ -239,7 +249,6 @@ async def index_teams_messages( channel_name, team_name, ) - documents_skipped += 1 continue # Process each message @@ -322,60 +331,27 @@ async def index_teams_messages( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - "Document for Teams message %s in channel %s unchanged. Skipping.", - message_id, - channel_name, - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - "Content changed for Teams message %s in channel %s. Updating document.", - message_id, - channel_name, - ) - # Update chunks and embedding - chunks = await create_document_chunks( - combined_document_string - ) - doc_embedding = ( - config.embedding_model_instance.embed( - combined_document_string - ) - ) - - # Update existing document - existing_document.content = combined_document_string - existing_document.content_hash = content_hash - existing_document.embedding = doc_embedding - existing_document.document_metadata = { - "team_name": team_name, - "team_id": team_id, - "channel_name": channel_name, - "channel_id": channel_id, - "start_date": start_date_str, - "end_date": end_date_str, - "message_count": len(messages), - "indexed_at": datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), - } - - # Delete old chunks and add new ones - existing_document.chunks = chunks - existing_document.updated_at = ( - get_current_timestamp() - ) - - documents_indexed += 1 - logger.info( - "Successfully updated Teams message %s", - message_id, - ) - continue + # Queue existing document for update (will be set to processing in Phase 2) + messages_to_process.append({ + 'document': existing_document, + 
'is_new': False, + 'combined_document_string': combined_document_string, + 'content_hash': content_hash, + 'team_name': team_name, + 'team_id': team_id, + 'channel_name': channel_name, + 'channel_id': channel_id, + 'message_id': message_id, + 'start_date': start_date_str, + 'end_date': end_date_str, + }) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -395,19 +371,11 @@ async def index_teams_messages( duplicate_by_content.id, duplicate_by_content.document_type, ) + duplicate_content_count += 1 documents_skipped += 1 continue - # Document doesn't exist - create new one - # Process chunks - chunks = await create_document_chunks( - combined_document_string - ) - doc_embedding = config.embedding_model_instance.embed( - combined_document_string - ) - - # Create and store new document + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, title=f"{team_name} - {channel_name}", @@ -417,40 +385,34 @@ async def index_teams_messages( "team_id": team_id, "channel_name": channel_name, "channel_id": channel_id, - "start_date": start_date_str, - "end_date": end_date_str, - "message_count": len(messages), - "indexed_at": datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), + "connector_id": connector_id, }, - content=combined_document_string, - embedding=doc_embedding, - chunks=chunks, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 + new_documents_created = True - # Batch commit 
every 10 documents - if documents_indexed % 10 == 0: - logger.info( - "Committing batch: %s Teams messages processed so far", - documents_indexed, - ) - await session.commit() - - logger.info( - "Successfully indexed channel %s in team %s with %s messages", - channel_name, - team_name, - len(messages), - ) + messages_to_process.append({ + 'document': document, + 'is_new': True, + 'combined_document_string': combined_document_string, + 'content_hash': content_hash, + 'team_name': team_name, + 'team_id': team_id, + 'channel_name': channel_name, + 'channel_id': channel_id, + 'message_id': message_id, + 'start_date': start_date_str, + 'end_date': end_date_str, + }) except Exception as e: logger.error( @@ -462,54 +424,141 @@ async def index_teams_messages( skipped_channels.append( f"{team_name}/{channel_name} (processing error)" ) - documents_skipped += 1 continue except Exception as e: logger.error("Error processing team %s: %s", team_name, str(e)) continue - # Update the last_indexed_at timestamp for the connector only if requested - # and if we successfully indexed at least one document - total_processed = documents_indexed - if total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(messages_to_process)} documents") + + for item in messages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= 
HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item['document'] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (embeddings, chunks) + chunks = await create_document_chunks(item['combined_document_string']) + doc_embedding = config.embedding_model_instance.embed( + item['combined_document_string'] + ) + + # Update document to READY with actual content + document.title = f"{item['team_name']} - {item['channel_name']}" + document.content = item['combined_document_string'] + document.content_hash = item['content_hash'] + document.embedding = doc_embedding + document.document_metadata = { + "team_name": item['team_name'], + "team_id": item['team_id'], + "channel_name": item['channel_name'], + "channel_id": item['channel_id'], + "start_date": item['start_date'], + "end_date": item['end_date'], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) + if documents_indexed % 10 == 0: + logger.info( + "Committing batch: %s Teams messages processed so far", + documents_indexed, + ) + await session.commit() + + except Exception as e: + logger.error(f"Error processing Teams message: {e!s}", exc_info=True) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") + documents_failed += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + 
await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info( "Final commit: Total %s Teams messages processed", documents_indexed ) - await session.commit() + try: + await session.commit() + logger.info( + "Successfully committed all Teams document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise - # Prepare result message - result_message = None + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") if skipped_channels: - result_message = f"Processed {total_processed} messages. Skipped {len(skipped_channels)} channels: {', '.join(skipped_channels)}" - else: - result_message = f"Processed {total_processed} messages." 
+ warning_parts.append(f"{len(skipped_channels)} channels skipped") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Teams indexing for connector {connector_id}", { - "messages_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, "skipped_channels_count": len(skipped_channels), - "result_message": result_message, }, ) logger.info( - "Teams indexing completed: %s new messages, %s skipped", + "Teams indexing completed: %s ready, %s skipped, %s failed " + "(%s duplicate content)", documents_indexed, documents_skipped, + documents_failed, + duplicate_content_count, ) - return ( - total_processed, - None, - ) # Return None on success (result_message is for logging only) + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() From 1d870e45a48b12b8577d1bc73f7630fe0ce0d325 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 03:54:24 +0530 Subject: [PATCH 23/36] feat: implement two-phase document indexing for Confluence and Jira connectors with real-time status updates --- .../connector_indexers/confluence_indexer.py | 336 ++++++++++-------- .../tasks/connector_indexers/jira_indexer.py | 328 +++++++++-------- 2 files changed, 369 insertions(+), 295 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py index 74b4cc23d..7fd842996 100644 --- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py @@ -1,5 +1,9 @@ """ Confluence connector indexer. 
+ +Provides real-time document status updates during indexing using a two-phase approach: +- Phase 1: Create all documents with PENDING status (visible in UI immediately) +- Phase 2: Process each document one by one (PENDING → PROCESSING → READY/FAILED) """ import contextlib @@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.confluence_history import ConfluenceHistoryConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -29,6 +33,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -180,22 +185,22 @@ async def index_confluence_pages( await confluence_client.close() return 0, f"Error fetching Confluence pages: {e!s}" - # Process and index each page + # ======================================================================= + # PHASE 1: Analyze all pages, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= documents_indexed = 0 - skipped_pages = [] documents_skipped = 0 + documents_failed = 0 + duplicate_content_count = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + pages_to_process = [] # List of dicts with document and page data + new_documents_created = False + for page in pages: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() try: page_id = page.get("id") 
page_title = page.get("title", "") @@ -205,7 +210,6 @@ async def index_confluence_pages( logger.warning( f"Skipping page with missing ID or title: {page_id or 'Unknown'}" ) - skipped_pages.append(f"{page_title or 'Unknown'} (missing data)") documents_skipped += 1 continue @@ -236,7 +240,6 @@ async def index_confluence_pages( if not full_content.strip(): logger.warning(f"Skipping page with no content: {page_title}") - skipped_pages.append(f"{page_title} (no content)") documents_skipped += 1 continue @@ -258,74 +261,25 @@ async def index_confluence_pages( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - f"Document for Confluence page {page_title} unchanged. Skipping." - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Confluence page {page_title}. Updating document." - ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "page_title": page_title, - "page_id": page_id, - "space_id": space_id, - "comment_count": comment_count, - "document_type": "Confluence Page", - "connector_type": "Confluence", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - full_content, user_llm, document_metadata - ) - else: - summary_content = f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n" - if page_content: - content_preview = page_content[:1000] - if len(page_content) > 1000: - content_preview += "..." 
- summary_content += ( - f"Content Preview: {content_preview}\n\n" - ) - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(full_content) - - # Update existing document - existing_document.title = page_title - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "page_id": page_id, - "page_title": page_title, - "space_id": space_id, - "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info( - f"Successfully updated Confluence page {page_title}" - ) - continue + # Queue existing document for update (will be set to processing in Phase 2) + pages_to_process.append({ + 'document': existing_document, + 'is_new': False, + 'full_content': full_content, + 'page_content': page_content, + 'content_hash': content_hash, + 'page_id': page_id, + 'page_title': page_title, + 'space_id': space_id, + 'comment_count': comment_count, + }) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -340,51 +294,11 @@ async def index_confluence_pages( f"(existing document ID: {duplicate_by_content.id}, " f"type: {duplicate_by_content.document_type}). Skipping." 
) + duplicate_content_count += 1 documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "page_title": page_title, - "page_id": page_id, - "space_id": space_id, - "comment_count": comment_count, - "document_type": "Confluence Page", - "connector_type": "Confluence", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - full_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = ( - f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n" - ) - if page_content: - # Take first 500 characters of content for summary - content_preview = page_content[:1000] - if len(page_content) > 1000: - content_preview += "..." - summary_content += f"Content Preview: {content_preview}\n\n" - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - using the full page content with comments - chunks = await create_document_chunks(full_content) - - # Create and store new document - logger.info(f"Creating new document for page {page_title}") + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, title=page_title, @@ -394,23 +308,122 @@ async def index_confluence_pages( "page_title": page_title, "space_id": space_id, "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + 
embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new page {page_title}") + new_documents_created = True - # Batch commit every 10 documents + pages_to_process.append({ + 'document': document, + 'is_new': True, + 'full_content': full_content, + 'page_content': page_content, + 'content_hash': content_hash, + 'page_id': page_id, + 'page_title': page_title, + 'space_id': space_id, + 'comment_count': comment_count, + }) + + except Exception as e: + logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(pages_to_process)} documents") + + for item in pages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item['document'] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + 
document_metadata = { + "page_title": item['page_title'], + "page_id": item['page_id'], + "space_id": item['space_id'], + "comment_count": item['comment_count'], + "document_type": "Confluence Page", + "connector_type": "Confluence", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item['full_content'], user_llm, document_metadata + ) + else: + # Fallback to simple summary if no LLM configured + summary_content = ( + f"Confluence Page: {item['page_title']}\n\nSpace ID: {item['space_id']}\n\n" + ) + if item['page_content']: + # Take first 1000 characters of content for summary + content_preview = item['page_content'][:1000] + if len(item['page_content']) > 1000: + content_preview += "..." + summary_content += f"Content Preview: {content_preview}\n\n" + summary_content += f"Comments: {item['comment_count']}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + # Process chunks - using the full page content with comments + chunks = await create_document_chunks(item['full_content']) + + # Update document to READY with actual content + document.title = item['page_title'] + document.content = summary_content + document.content_hash = item['content_hash'] + document.embedding = summary_embedding + document.document_metadata = { + "page_id": item['page_id'], + "page_title": item['page_title'], + "space_id": item['space_id'], + "comment_count": item['comment_count'], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Confluence pages processed so far" @@ -419,53 +432,78 @@ async def index_confluence_pages( except Exception as e: logger.error( - f"Error 
processing page {page.get('title', 'Unknown')}: {e!s}", + f"Error processing page {item.get('page_title', 'Unknown')}: {e!s}", exc_info=True, ) - skipped_pages.append( - f"{page.get('title', 'Unknown')} (processing error)" - ) - documents_skipped += 1 + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") + documents_failed += 1 continue # Skip this page and continue with others - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) - # Final commit for any remaining documents not yet committed in batches + # Final commit to ensure all documents are persisted (safety net) logger.info( f"Final commit: Total {documents_indexed} Confluence pages processed" ) - await session.commit() - logger.info( - "Successfully committed all Confluence document changes to database" - ) + try: + await session.commit() + logger.info( + "Successfully committed all Confluence document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same page was indexed by multiple connectors. " + f"Rolling back and continuing. 
Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed Confluence indexing for connector {connector_id}", { - "pages_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, - "skipped_pages_count": len(skipped_pages), + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, }, ) logger.info( - f"Confluence indexing completed: {documents_indexed} new pages, {documents_skipped} skipped" + f"Confluence indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed " + f"({duplicate_content_count} duplicate content)" ) # Close the client connection if confluence_client: await confluence_client.close() - return ( - total_processed, - None, - ) # Return None as the error message to indicate success + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py index 508834b4f..038df0f46 100644 --- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py @@ -1,5 +1,9 @@ """ Jira connector indexer. 
+ +Provides real-time document status updates during indexing using a two-phase approach: +- Phase 1: Create all documents with PENDING status (visible in UI immediately) +- Phase 2: Process each document one by one (PENDING → PROCESSING → READY/FAILED) """ import contextlib @@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.jira_history import JiraHistoryConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -29,6 +33,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -174,22 +179,22 @@ async def index_jira_issues( logger.error(f"Error fetching Jira issues: {e!s}", exc_info=True) return 0, f"Error fetching Jira issues: {e!s}" - # Process and index each issue + # ======================================================================= + # PHASE 1: Analyze all issues, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= documents_indexed = 0 - skipped_issues = [] documents_skipped = 0 + documents_failed = 0 + duplicate_content_count = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + issues_to_process = [] # List of dicts with document and issue data + new_documents_created = False + for issue in issues: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() try: 
issue_id = issue.get("key") issue_identifier = issue.get("key", "") @@ -199,9 +204,6 @@ async def index_jira_issues( logger.warning( f"Skipping issue with missing ID or title: {issue_id or 'Unknown'}" ) - skipped_issues.append( - f"{issue_identifier or 'Unknown'} (missing data)" - ) documents_skipped += 1 continue @@ -215,7 +217,6 @@ async def index_jira_issues( logger.warning( f"Skipping issue with no content: {issue_identifier} - {issue_title}" ) - skipped_issues.append(f"{issue_identifier} (no content)") documents_skipped += 1 continue @@ -237,71 +238,25 @@ async def index_jira_issues( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: - logger.info( - f"Document for Jira issue {issue_identifier} unchanged. Skipping." - ) + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Jira issue {issue_identifier}. Updating document." 
- ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "issue_key": issue_identifier, - "issue_title": issue_title, - "status": formatted_issue.get("status", "Unknown"), - "priority": formatted_issue.get("priority", "Unknown"), - "comment_count": comment_count, - "document_type": "Jira Issue", - "connector_type": "Jira", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - issue_content, user_llm, document_metadata - ) - else: - summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n" - if formatted_issue.get("description"): - summary_content += f"Description: {formatted_issue.get('description')}\n\n" - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(issue_content) - - # Update existing document - existing_document.title = f"{issue_identifier}: {issue_title}" - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "issue_id": issue_id, - "issue_identifier": issue_identifier, - "issue_title": issue_title, - "state": formatted_issue.get("status", "Unknown"), - "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info( - f"Successfully updated Jira issue {issue_identifier}" - ) - continue + # Queue existing document for update (will be set to processing in Phase 2) + issues_to_process.append({ + 'document': existing_document, + 'is_new': False, + 'issue_content': issue_content, + 'content_hash': content_hash, + 
'issue_id': issue_id, + 'issue_identifier': issue_identifier, + 'issue_title': issue_title, + 'formatted_issue': formatted_issue, + 'comment_count': comment_count, + }) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -316,50 +271,11 @@ async def index_jira_issues( f"(existing document ID: {duplicate_by_content.id}, " f"type: {duplicate_by_content.document_type}). Skipping." ) + duplicate_content_count += 1 documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "issue_key": issue_identifier, - "issue_title": issue_title, - "status": formatted_issue.get("status", "Unknown"), - "priority": formatted_issue.get("priority", "Unknown"), - "comment_count": comment_count, - "document_type": "Jira Issue", - "connector_type": "Jira", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - issue_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = f"Jira Issue {issue_identifier}: {issue_title}\n\nStatus: {formatted_issue.get('status', 'Unknown')}\n\n" - if formatted_issue.get("description"): - summary_content += ( - f"Description: {formatted_issue.get('description')}\n\n" - ) - summary_content += f"Comments: {comment_count}" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - using the full issue content with comments - chunks = await create_document_chunks(issue_content) - - # Create and store new document - logger.info( - f"Creating new document for issue {issue_identifier} - {issue_title}" - ) + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, title=f"{issue_identifier}: 
{issue_title}", @@ -370,25 +286,120 @@ async def index_jira_issues( "issue_title": issue_title, "state": formatted_issue.get("status", "Unknown"), "comment_count": comment_count, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 - logger.info( - f"Successfully indexed new issue {issue_identifier} - {issue_title}" + new_documents_created = True + + issues_to_process.append({ + 'document': document, + 'is_new': True, + 'issue_content': issue_content, + 'content_hash': content_hash, + 'issue_id': issue_id, + 'issue_identifier': issue_identifier, + 'issue_title': issue_title, + 'formatted_issue': formatted_issue, + 'comment_count': comment_count, + }) + + except Exception as e: + logger.error(f"Error in Phase 1 for issue: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(issues_to_process)} documents") + + for item in issues_to_process: + 
# Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item['document'] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id ) - # Batch commit every 10 documents + if user_llm: + document_metadata = { + "issue_key": item['issue_identifier'], + "issue_title": item['issue_title'], + "status": item['formatted_issue'].get("status", "Unknown"), + "priority": item['formatted_issue'].get("priority", "Unknown"), + "comment_count": item['comment_count'], + "document_type": "Jira Issue", + "connector_type": "Jira", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item['issue_content'], user_llm, document_metadata + ) + else: + # Fallback to simple summary if no LLM configured + summary_content = f"Jira Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['formatted_issue'].get('status', 'Unknown')}\n\n" + if item['formatted_issue'].get("description"): + summary_content += ( + f"Description: {item['formatted_issue'].get('description')}\n\n" + ) + summary_content += f"Comments: {item['comment_count']}" + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + # Process chunks - using the full issue content with comments + chunks = await create_document_chunks(item['issue_content']) + + # Update document to READY with actual content + document.title = f"{item['issue_identifier']}: {item['issue_title']}" + document.content = summary_content + document.content_hash = item['content_hash'] + document.embedding = summary_embedding + 
document.document_metadata = { + "issue_id": item['issue_id'], + "issue_identifier": item['issue_identifier'], + "issue_title": item['issue_title'], + "state": item['formatted_issue'].get("status", "Unknown"), + "comment_count": item['comment_count'], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Jira issues processed so far" @@ -397,48 +408,73 @@ async def index_jira_issues( except Exception as e: logger.error( - f"Error processing issue {issue.get('identifier', 'Unknown')}: {e!s}", + f"Error processing issue {item.get('issue_identifier', 'Unknown')}: {e!s}", exc_info=True, ) - skipped_issues.append( - f"{issue.get('identifier', 'Unknown')} (processing error)" - ) - documents_skipped += 1 + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") + documents_failed += 1 continue # Skip this issue and continue with others - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) - # Final commit for any remaining documents not yet committed in batches + # Final commit to ensure all documents are persisted 
(safety net) logger.info(f"Final commit: Total {documents_indexed} Jira issues processed") - await session.commit() - logger.info("Successfully committed all JIRA document changes to database") + try: + await session.commit() + logger.info("Successfully committed all JIRA document changes to database") + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same issue was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed JIRA indexing for connector {connector_id}", { - "issues_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, - "skipped_issues_count": len(skipped_issues), + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, }, ) logger.info( - f"JIRA indexing completed: {documents_indexed} new issues, {documents_skipped} skipped" + f"JIRA indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed " + f"({duplicate_content_count} duplicate content)" ) # Clean up the connector await jira_client.close() - return ( - total_processed, - None, - ) # Return None as the error message to indicate 
success + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() From 108e8c960ff68e7954042334de1c57675f467192 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 03:54:38 +0530 Subject: [PATCH 24/36] fix: adjust opacity of clock icon in status indicator for better visibility --- .../documents/(manage)/components/DocumentsTableShell.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index beb808191..fb0d72fae 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -42,7 +42,7 @@ function StatusIndicator({ status }: { status?: DocumentStatus }) {
- +
Pending - waiting to be synced From bfa3be655ef9a9f5ab78595f6258b10a6053bef4 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 04:06:14 +0530 Subject: [PATCH 25/36] feat: implement two-phase document indexing for ClickUp and GitHub connectors with real-time status updates --- .../connector_indexers/clickup_indexer.py | 310 ++++++++----- .../connector_indexers/github_indexer.py | 436 ++++++++++-------- 2 files changed, 440 insertions(+), 306 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py index 2b8789e0c..934e56744 100644 --- a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py @@ -1,5 +1,9 @@ """ ClickUp connector indexer. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import contextlib @@ -12,7 +16,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.clickup_history import ClickUpHistoryConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -28,6 +32,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -141,10 +146,18 @@ async def index_clickup_tasks( documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck 
last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Collect all tasks and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + tasks_to_process = [] # List of dicts with document and task data + new_documents_created = False + # Iterate workspaces and fetch tasks for workspace in workspaces: workspace_id = workspace.get("id") @@ -183,15 +196,6 @@ async def index_clickup_tasks( ) for task in tasks: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) - >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() - try: task_id = task.get("id") task_name = task.get("name", "Untitled Task") @@ -255,74 +259,35 @@ async def index_clickup_tasks( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() logger.info( f"Document for ClickUp task {task_name} unchanged. Skipping." ) documents_skipped += 1 continue else: - # Content has changed - update the existing document + # Queue existing document for update (will be set to processing in Phase 2) logger.info( - f"Content changed for ClickUp task {task_name}. Updating document." 
- ) - - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "task_id": task_id, - "task_name": task_name, - "task_status": task_status, - "task_priority": task_priority, - "task_list": task_list_name, - "task_space": task_space_name, - "assignees": len(task_assignees), - "document_type": "ClickUp Task", - "connector_type": "ClickUp", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - task_content, user_llm, document_metadata - ) - else: - summary_content = task_content - summary_embedding = ( - config.embedding_model_instance.embed(task_content) - ) - - # Process chunks - chunks = await create_document_chunks(task_content) - - # Update existing document - existing_document.title = task_name - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "task_id": task_id, - "task_name": task_name, - "task_status": task_status, - "task_priority": task_priority, - "task_assignees": task_assignees, - "task_due_date": task_due_date, - "task_created": task_created, - "task_updated": task_updated, - "indexed_at": datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info( - f"Successfully updated ClickUp task {task_name}" + f"Content changed for ClickUp task {task_name}. Queuing for update." 
) + tasks_to_process.append({ + 'document': existing_document, + 'is_new': False, + 'task_content': task_content, + 'content_hash': content_hash, + 'task_id': task_id, + 'task_name': task_name, + 'task_status': task_status, + 'task_priority': task_priority, + 'task_list_name': task_list_name, + 'task_space_name': task_space_name, + 'task_assignees': task_assignees, + 'task_due_date': task_due_date, + 'task_created': task_created, + 'task_updated': task_updated, + }) continue # Document doesn't exist by unique_identifier_hash @@ -341,39 +306,7 @@ async def index_clickup_tasks( documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "task_id": task_id, - "task_name": task_name, - "task_status": task_status, - "task_priority": task_priority, - "task_list": task_list_name, - "task_space": task_space_name, - "assignees": len(task_assignees), - "document_type": "ClickUp Task", - "connector_type": "ClickUp", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - task_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = task_content - summary_embedding = config.embedding_model_instance.embed( - task_content - ) - - chunks = await create_document_chunks(task_content) - + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, title=task_name, @@ -387,44 +320,174 @@ async def index_clickup_tasks( "task_due_date": task_due_date, "task_created": task_created, "task_updated": task_updated, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, 
# Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new task {task_name}") + new_documents_created = True - # Batch commit every 10 documents - if documents_indexed % 10 == 0: - logger.info( - f"Committing batch: {documents_indexed} ClickUp tasks processed so far" - ) - await session.commit() + tasks_to_process.append({ + 'document': document, + 'is_new': True, + 'task_content': task_content, + 'content_hash': content_hash, + 'task_id': task_id, + 'task_name': task_name, + 'task_status': task_status, + 'task_priority': task_priority, + 'task_list_name': task_list_name, + 'task_space_name': task_space_name, + 'task_assignees': task_assignees, + 'task_due_date': task_due_date, + 'task_created': task_created, + 'task_updated': task_updated, + }) except Exception as e: logger.error( - f"Error processing task {task.get('name', 'Unknown')}: {e!s}", + f"Error in Phase 1 for task {task.get('name', 'Unknown')}: {e!s}", exc_info=True, ) - documents_skipped += 1 + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([t for t in tasks_to_process if t['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(tasks_to_process)} documents") + + for item in tasks_to_process: + 
# Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item['document'] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "task_id": item['task_id'], + "task_name": item['task_name'], + "task_status": item['task_status'], + "task_priority": item['task_priority'], + "task_list": item['task_list_name'], + "task_space": item['task_space_name'], + "assignees": len(item['task_assignees']), + "document_type": "ClickUp Task", + "connector_type": "ClickUp", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item['task_content'], user_llm, document_metadata_for_summary + ) + else: + summary_content = item['task_content'] + summary_embedding = config.embedding_model_instance.embed( + item['task_content'] + ) + + chunks = await create_document_chunks(item['task_content']) + + # Update document to READY with actual content + document.title = item['task_name'] + document.content = summary_content + document.content_hash = item['content_hash'] + document.embedding = summary_embedding + document.document_metadata = { + "task_id": item['task_id'], + "task_name": item['task_name'], + "task_status": item['task_status'], + "task_priority": item['task_priority'], + "task_assignees": item['task_assignees'], + "task_due_date": item['task_due_date'], + "task_created": item['task_created'], + "task_updated": item['task_updated'], + "connector_id": connector_id, + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + } + safe_set_chunks(document, 
chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) + if documents_indexed % 10 == 0: + logger.info( + f"Committing batch: {documents_indexed} ClickUp tasks processed so far" + ) + await session.commit() + + except Exception as e: + logger.error( + f"Error processing task {item.get('task_name', 'Unknown')}: {e!s}", + exc_info=True, + ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") + documents_failed += 1 + continue total_processed = documents_indexed - if total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info(f"Final commit: Total {documents_indexed} ClickUp tasks processed") - await session.commit() + try: + await session.commit() + logger.info( + "Successfully committed all ClickUp document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same task was indexed by multiple connectors. " + f"Rolling back and continuing. 
Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise await task_logger.log_task_success( log_entry, @@ -433,11 +496,12 @@ async def index_clickup_tasks( "pages_processed": total_processed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, }, ) logger.info( - f"clickup indexing completed: {documents_indexed} new tasks, {documents_skipped} skipped" + f"clickup indexing completed: {documents_indexed} ready, {documents_skipped} skipped, {documents_failed} failed" ) # Close client connection diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py index 848db7623..b37989a84 100644 --- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -3,6 +3,10 @@ GitHub connector indexer using gitingest. This indexer processes entire repository digests in one pass, dramatically reducing LLM API calls compared to the previous file-by-file approach. 
+ +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time @@ -14,7 +18,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.github_connector import GitHubConnector, RepositoryDigest -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -30,6 +34,8 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, + update_connector_last_indexed, ) # Type hint for heartbeat callback @@ -164,7 +170,7 @@ async def index_github_repos( ) return 0, f"Failed to initialize GitHub client: {e!s}" - # 4. Process each repository with gitingest + # 4. 
Process each repository with gitingest using 2-phase approach await task_logger.log_task_progress( log_entry, f"Starting gitingest processing for {len(repo_full_names_to_index)} repositories", @@ -181,24 +187,25 @@ async def index_github_repos( # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() documents_indexed = 0 + documents_skipped = 0 + documents_failed = 0 + + # ======================================================================= + # PHASE 1: Analyze all repos and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + repos_to_process = [] # List of dicts with document and digest data + new_documents_created = False for repo_full_name in repo_full_names_to_index: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() if not repo_full_name or not isinstance(repo_full_name, str): logger.warning(f"Skipping invalid repository entry: {repo_full_name}") continue - logger.info(f"Ingesting repository: {repo_full_name}") - try: + logger.info(f"Phase 1: Analyzing repository: {repo_full_name}") + # Run gitingest via subprocess (isolated from event loop) - # Using to_thread to not block the async database operations import asyncio digest = await asyncio.to_thread( @@ -212,30 +219,248 @@ async def index_github_repos( errors.append(f"No digest for {repo_full_name}") continue - # Process the digest and create documents - docs_created = await _process_repository_digest( - session=session, - digest=digest, - search_space_id=search_space_id, - user_id=user_id, - task_logger=task_logger, - log_entry=log_entry, - connector_id=connector_id, + # Generate unique identifier based on repo name + 
unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id ) - documents_processed += docs_created - logger.info( - f"Created {docs_created} documents from repository: {repo_full_name}" + # Generate content hash from digest + full_content = digest.full_digest + content_hash = generate_content_hash(full_content, search_space_id) + + # Check if document with this unique identifier already exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash ) + if existing_document: + # Document exists - check if content has changed + if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() + logger.info(f"Repository {repo_full_name} unchanged. Skipping.") + documents_skipped += 1 + continue + + # Queue existing document for update (will be set to processing in Phase 2) + logger.info( + f"Content changed for repository {repo_full_name}. Queuing for update." + ) + repos_to_process.append({ + 'document': existing_document, + 'is_new': False, + 'digest': digest, + 'content_hash': content_hash, + 'repo_full_name': repo_full_name, + 'unique_identifier_hash': unique_identifier_hash, + }) + continue + + # Document doesn't exist by unique_identifier_hash + # Check if a document with the same content_hash exists (from another connector) + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + logger.info( + f"Repository {repo_full_name} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}, " + f"type: {duplicate_by_content.document_type}). Skipping." 
+ ) + documents_skipped += 1 + continue + + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=repo_full_name, + document_type=DocumentType.GITHUB_CONNECTOR, + document_metadata={ + "repository_full_name": repo_full_name, + "url": f"https://github.com/{repo_full_name}", + "branch": digest.branch, + "ingestion_method": "gitingest", + "connector_id": connector_id, + }, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + new_documents_created = True + + repos_to_process.append({ + 'document': document, + 'is_new': True, + 'digest': digest, + 'content_hash': content_hash, + 'repo_full_name': repo_full_name, + 'unique_identifier_hash': unique_identifier_hash, + }) + except Exception as repo_err: logger.error( - f"Failed to process repository {repo_full_name}: {repo_err}" + f"Error in Phase 1 for repository {repo_full_name}: {repo_err}", + exc_info=True, ) + errors.append(f"Phase 1 error for {repo_full_name}: {repo_err}") + documents_failed += 1 + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([r for r in repos_to_process if r['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(repos_to_process)} documents") + 
+ for item in repos_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item['document'] + digest = item['digest'] + repo_full_name = item['repo_full_name'] + + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + document_metadata_for_summary = { + "repository": repo_full_name, + "document_type": "GitHub Repository", + "connector_type": "GitHub", + "ingestion_method": "gitingest", + "file_tree": digest.tree[:2000] if len(digest.tree) > 2000 else digest.tree, + "estimated_tokens": digest.estimated_tokens, + } + + if user_llm: + # Prepare content for summarization + summary_content = digest.full_digest + if len(summary_content) > MAX_DIGEST_CHARS: + summary_content = ( + f"# Repository: {repo_full_name}\n\n" + f"## File Structure\n\n{digest.tree}\n\n" + f"## File Contents (truncated)\n\n{digest.content[: MAX_DIGEST_CHARS - len(digest.tree) - 200]}..." 
+ ) + + summary_text, summary_embedding = await generate_document_summary( + summary_content, user_llm, document_metadata_for_summary + ) + else: + # Fallback to simple summary if no LLM configured + summary_text = ( + f"# GitHub Repository: {repo_full_name}\n\n" + f"## Summary\n{digest.summary}\n\n" + f"## File Structure\n{digest.tree[:3000]}" + ) + summary_embedding = config.embedding_model_instance.embed(summary_text) + + # Chunk the full digest content for granular search + try: + chunks_data = await create_document_chunks(digest.content) + except Exception as chunk_err: + logger.error(f"Failed to chunk repository {repo_full_name}: {chunk_err}") + chunks_data = await _simple_chunk_content(digest.content) + + # Update document to READY with actual content + doc_metadata = { + "repository_full_name": repo_full_name, + "url": f"https://github.com/{repo_full_name}", + "branch": digest.branch, + "ingestion_method": "gitingest", + "file_tree": digest.tree, + "gitingest_summary": digest.summary, + "estimated_tokens": digest.estimated_tokens, + "connector_id": connector_id, + "indexed_at": datetime.now(UTC).isoformat(), + } + + document.title = repo_full_name + document.content = summary_text + document.content_hash = item['content_hash'] + document.embedding = summary_embedding + document.document_metadata = doc_metadata + safe_set_chunks(document, chunks_data) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_processed += 1 + documents_indexed += 1 + + logger.info( + f"Created document for repository {repo_full_name} " + f"with {len(chunks_data)} chunks" + ) + + # Batch commit every 5 documents (repositories are large) + if documents_indexed % 5 == 0: + logger.info( + f"Committing batch: {documents_indexed} GitHub repos processed so far" + ) + await session.commit() + + except Exception as repo_err: + logger.error( + f"Error processing repository {repo_full_name}: {repo_err}", + exc_info=True, + ) + # Mark 
document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(repo_err)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") errors.append(f"Failed processing {repo_full_name}: {repo_err}") + documents_failed += 1 + continue + + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit - await session.commit() + logger.info(f"Final commit: Total {documents_processed} GitHub repositories processed") + try: + await session.commit() + logger.info( + "Successfully committed all GitHub document changes to database" + ) + except Exception as e: + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise + logger.info( f"Finished GitHub indexing for connector {connector_id}. " f"Created {documents_processed} documents." @@ -247,6 +472,8 @@ async def index_github_repos( f"Successfully completed GitHub indexing for connector {connector_id}", { "documents_processed": documents_processed, + "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "errors_count": len(errors), "repo_count": len(repo_full_names_to_index), "method": "gitingest", @@ -286,163 +513,6 @@ async def index_github_repos( return documents_processed, error_message -async def _process_repository_digest( - session: AsyncSession, - digest: RepositoryDigest, - search_space_id: int, - user_id: str, - task_logger: TaskLoggingService, - log_entry, - connector_id: int, -) -> int: - """ - Process a repository digest and create documents. - - For each repository, we create: - 1. 
One main document with the repository summary - 2. Chunks from the full digest content for granular search - - Args: - session: Database session - digest: The repository digest from gitingest - search_space_id: ID of the search space - user_id: ID of the user - task_logger: Task logging service - log_entry: Current log entry - - Returns: - Number of documents created - """ - repo_full_name = digest.repo_full_name - documents_created = 0 - - # Generate unique identifier based on repo name and content hash - # This allows updates when repo content changes - full_content = digest.full_digest - content_hash = generate_content_hash(full_content, search_space_id) - - # Use repo name as the unique identifier (one document per repo) - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.GITHUB_CONNECTOR, repo_full_name, search_space_id - ) - - # Check if document with this unique identifier already exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - logger.info(f"Repository {repo_full_name} unchanged. Skipping.") - return 0 - else: - logger.info( - f"Content changed for repository {repo_full_name}. Updating document." - ) - # Delete existing document to replace with new one - await session.delete(existing_document) - await session.flush() - else: - # Document doesn't exist by unique_identifier_hash - # Check if a document with the same content_hash exists (from another connector) - with session.no_autoflush: - duplicate_by_content = await check_duplicate_document_by_hash( - session, content_hash - ) - - if duplicate_by_content: - logger.info( - f"Repository {repo_full_name} already indexed by another connector " - f"(existing document ID: {duplicate_by_content.id}, " - f"type: {duplicate_by_content.document_type}). Skipping." 
- ) - return 0 - - # Generate summary using LLM (ONE call per repository!) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - - document_metadata = { - "repository": repo_full_name, - "document_type": "GitHub Repository", - "connector_type": "GitHub", - "ingestion_method": "gitingest", - "file_tree": digest.tree[:2000] if len(digest.tree) > 2000 else digest.tree, - "estimated_tokens": digest.estimated_tokens, - } - - if user_llm: - # Prepare content for summarization - # Include tree structure and truncated content if too large - summary_content = digest.full_digest - if len(summary_content) > MAX_DIGEST_CHARS: - # Truncate but keep the tree and beginning of content - summary_content = ( - f"# Repository: {repo_full_name}\n\n" - f"## File Structure\n\n{digest.tree}\n\n" - f"## File Contents (truncated)\n\n{digest.content[: MAX_DIGEST_CHARS - len(digest.tree) - 200]}..." - ) - - summary_text, summary_embedding = await generate_document_summary( - summary_content, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_text = ( - f"# GitHub Repository: {repo_full_name}\n\n" - f"## Summary\n{digest.summary}\n\n" - f"## File Structure\n{digest.tree[:3000]}" - ) - summary_embedding = config.embedding_model_instance.embed(summary_text) - - # Chunk the full digest content for granular search - try: - # Use the content (not the summary) for chunking - # This preserves file-level granularity in search - chunks_data = await create_document_chunks(digest.content) - except Exception as chunk_err: - logger.error(f"Failed to chunk repository {repo_full_name}: {chunk_err}") - # Fall back to a simpler chunking approach - chunks_data = await _simple_chunk_content(digest.content) - - # Create the document - doc_metadata = { - "repository_full_name": repo_full_name, - "url": f"https://github.com/{repo_full_name}", - "branch": digest.branch, - "ingestion_method": "gitingest", - "file_tree": digest.tree, - 
"gitingest_summary": digest.summary, - "estimated_tokens": digest.estimated_tokens, - "indexed_at": datetime.now(UTC).isoformat(), - } - - document = Document( - title=repo_full_name, - document_type=DocumentType.GITHUB_CONNECTOR, - document_metadata=doc_metadata, - content=summary_text, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - search_space_id=search_space_id, - chunks=chunks_data, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) - - session.add(document) - documents_created += 1 - - logger.info( - f"Created document for repository {repo_full_name} " - f"with {len(chunks_data)} chunks" - ) - - return documents_created - - async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list: """ Simple fallback chunking when the regular chunker fails. From 0f61a249c0f322d1ebf4e4bcf1c84272bbf8a403 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 04:31:55 +0530 Subject: [PATCH 26/36] feat: implement two-phase document indexing for BookStack, Elasticsearch, and Luma connectors with real-time status updates --- .../connector_indexers/bookstack_indexer.py | 307 +++++++++------ .../elasticsearch_indexer.py | 267 +++++++++---- .../tasks/connector_indexers/luma_indexer.py | 370 ++++++++++-------- 3 files changed, 580 insertions(+), 364 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py index f1338564e..fbf90b345 100644 --- a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py @@ -1,5 +1,9 @@ """ BookStack connector indexer. 
+ +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Collect all pages and create pending documents (visible in UI immediately) +- Phase 2: Process each page: pending → processing → ready/failed """ import time @@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.bookstack_connector import BookStackConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -28,6 +32,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -184,22 +189,22 @@ async def index_bookstack_pages( logger.error(f"Error fetching BookStack pages: {e!s}", exc_info=True) return 0, f"Error fetching BookStack pages: {e!s}" - # Process and index each page + # ======================================================================= + # PHASE 1: Analyze all pages, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= documents_indexed = 0 skipped_pages = [] documents_skipped = 0 + documents_failed = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + pages_to_process = [] # List of dicts with document and page data + new_documents_created = False + for page in pages: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() try: page_id = page.get("id") page_name = page.get("name", "") @@ 
-218,7 +223,7 @@ async def index_bookstack_pages( # Fetch full page content (Markdown preferred) try: - page_detail, page_content = bookstack_client.get_page_with_content( + _, page_content = bookstack_client.get_page_with_content( page_id, use_markdown=True ) except Exception as e: @@ -252,82 +257,34 @@ async def index_bookstack_pages( # Build page URL page_url = f"{bookstack_base_url}/books/{book_slug}/page/{page_slug}" - # Build document metadata - doc_metadata = { - "page_id": page_id, - "page_name": page_name, - "page_slug": page_slug, - "book_id": book_id, - "book_slug": book_slug, - "chapter_id": chapter_id, - "base_url": bookstack_base_url, - "page_url": page_url, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - } - if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() logger.info( f"Document for BookStack page {page_name} unchanged. Skipping." ) documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for BookStack page {page_name}. Updating document." - ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - summary_metadata = { - "page_name": page_name, - "page_id": page_id, - "book_id": book_id, - "document_type": "BookStack Page", - "connector_type": "BookStack", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - full_content, user_llm, summary_metadata - ) - else: - summary_content = ( - f"BookStack Page: {page_name}\n\nBook ID: {book_id}\n\n" - ) - if page_content: - content_preview = page_content[:1000] - if len(page_content) > 1000: - content_preview += "..." 
- summary_content += ( - f"Content Preview: {content_preview}\n\n" - ) - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(full_content) - - # Update existing document - existing_document.title = page_name - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = doc_metadata - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info(f"Successfully updated BookStack page {page_name}") - continue + # Queue existing document for update (will be set to processing in Phase 2) + pages_to_process.append({ + 'document': existing_document, + 'is_new': False, + 'page_id': page_id, + 'page_name': page_name, + 'page_slug': page_slug, + 'book_id': book_id, + 'book_slug': book_slug, + 'chapter_id': chapter_id, + 'page_url': page_url, + 'page_content': page_content, + 'full_content': full_content, + 'content_hash': content_hash, + }) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -345,17 +302,104 @@ async def index_bookstack_pages( documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=page_name, + document_type=DocumentType.BOOKSTACK_CONNECTOR, + document_metadata={ + "page_id": page_id, + "page_name": page_name, + "page_slug": page_slug, + "book_id": book_id, + "book_slug": book_slug, + "chapter_id": chapter_id, + "base_url": bookstack_base_url, + "page_url": page_url, + "connector_id": connector_id, + }, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # 
Temporary unique value - updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + new_documents_created = True + + pages_to_process.append({ + 'document': document, + 'is_new': True, + 'page_id': page_id, + 'page_name': page_name, + 'page_slug': page_slug, + 'book_id': book_id, + 'book_slug': book_slug, + 'chapter_id': chapter_id, + 'page_url': page_url, + 'page_content': page_content, + 'full_content': full_content, + 'content_hash': content_hash, + }) + + except Exception as e: + logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(pages_to_process)} documents") + + for item in pages_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item['document'] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) user_llm = await get_user_long_context_llm( session, 
user_id, search_space_id ) + # Build document metadata + doc_metadata = { + "page_id": item['page_id'], + "page_name": item['page_name'], + "page_slug": item['page_slug'], + "book_id": item['book_id'], + "book_slug": item['book_slug'], + "chapter_id": item['chapter_id'], + "base_url": bookstack_base_url, + "page_url": item['page_url'], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + if user_llm: summary_metadata = { - "page_name": page_name, - "page_id": page_id, - "book_id": book_id, + "page_name": item['page_name'], + "page_id": item['page_id'], + "book_id": item['book_id'], "document_type": "BookStack Page", "connector_type": "BookStack", } @@ -363,17 +407,17 @@ async def index_bookstack_pages( summary_content, summary_embedding, ) = await generate_document_summary( - full_content, user_llm, summary_metadata + item['full_content'], user_llm, summary_metadata ) else: # Fallback to simple summary if no LLM configured summary_content = ( - f"BookStack Page: {page_name}\n\nBook ID: {book_id}\n\n" + f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n" ) - if page_content: + if item['page_content']: # Take first 1000 characters of content for summary - content_preview = page_content[:1000] - if len(page_content) > 1000: + content_preview = item['page_content'][:1000] + if len(item['page_content']) > 1000: content_preview += "..." 
summary_content += f"Content Preview: {content_preview}\n\n" summary_embedding = config.embedding_model_instance.embed( @@ -381,30 +425,21 @@ async def index_bookstack_pages( ) # Process chunks - using the full page content - chunks = await create_document_chunks(full_content) + chunks = await create_document_chunks(item['full_content']) - # Create and store new document - logger.info(f"Creating new document for page {page_name}") - document = Document( - search_space_id=search_space_id, - title=page_name, - document_type=DocumentType.BOOKSTACK_CONNECTOR, - document_metadata=doc_metadata, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) + # Update document to READY with actual content + document.title = item['page_name'] + document.content = summary_content + document.content_hash = item['content_hash'] + document.embedding = summary_embedding + document.document_metadata = doc_metadata + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() - session.add(document) documents_indexed += 1 - logger.info(f"Successfully indexed new page {page_name}") - # Batch commit every 10 documents + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} BookStack pages processed so far" @@ -413,46 +448,72 @@ async def index_bookstack_pages( except Exception as e: logger.error( - f"Error processing page {page.get('name', 'Unknown')}: {e!s}", + f"Error processing page {item.get('page_name', 'Unknown')}: {e!s}", exc_info=True, ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + 
logger.error(f"Failed to update document status to failed: {status_error}") skipped_pages.append( - f"{page.get('name', 'Unknown')} (processing error)" + f"{item.get('page_name', 'Unknown')} (processing error)" ) - documents_skipped += 1 - continue # Skip this page and continue with others + documents_failed += 1 + continue - # Update the last_indexed_at timestamp for the connector only if requested - total_processed = documents_indexed - if update_last_indexed: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info( f"Final commit: Total {documents_indexed} BookStack pages processed" ) - await session.commit() - logger.info("Successfully committed all BookStack document changes to database") + try: + await session.commit() + logger.info("Successfully committed all BookStack document changes to database") + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same page was indexed by multiple connectors. " + f"Rolling back and continuing. 
Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None # Log success await task_logger.log_task_success( log_entry, f"Successfully completed BookStack indexing for connector {connector_id}", { - "pages_processed": total_processed, + "pages_processed": documents_indexed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "skipped_pages_count": len(skipped_pages), }, ) logger.info( - f"BookStack indexing completed: {documents_indexed} new pages, {documents_skipped} skipped" + f"BookStack indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed" ) - return ( - total_processed, - None, - ) # Return None as the error message to indicate success + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() diff --git a/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py b/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py index fb6487474..97cd31a09 100644 --- a/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py @@ -1,5 +1,9 @@ """ Elasticsearch indexer for SurfSense + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Collect all documents and create pending documents (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import json @@ -13,7 +17,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from app.connectors.elasticsearch_connector import 
ElasticsearchConnector -from app.db import Document, DocumentType, SearchSourceConnector +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnector from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( create_document_chunks, @@ -25,6 +29,7 @@ from .base import ( check_document_by_unique_identifier, check_duplicate_document_by_hash, get_current_timestamp, + safe_set_chunks, ) # Type hint for heartbeat callback @@ -164,6 +169,8 @@ async def index_elasticsearch_documents( ) documents_processed = 0 + documents_skipped = 0 + documents_failed = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() @@ -178,23 +185,22 @@ async def index_elasticsearch_documents( "max_documents": max_documents, }, ) - # Use scroll search for large result sets + + # ======================================================================= + # PHASE 1: Collect all documents from Elasticsearch and create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + docs_to_process = [] # List of dicts with document and ES data + new_documents_created = False + hits_collected = 0 + async for hit in es_connector.scroll_search( index=index_name, query=query, size=min(max_documents, 100), # Scroll in batches fields=config.get("ELASTICSEARCH_FIELDS"), ): - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) - >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_processed) - last_heartbeat_time = time.time() - - if documents_processed >= max_documents: + if hits_collected >= max_documents: break try: @@ -220,26 +226,12 @@ async def index_elasticsearch_documents( if not content.strip(): logger.warning(f"Skipping document {doc_id} - no content found") + 
documents_skipped += 1 continue # Create content hash content_hash = generate_content_hash(content, search_space_id) - # Build metadata - metadata = { - "elasticsearch_id": doc_id, - "elasticsearch_index": hit.get("_index", index_name), - "elasticsearch_score": hit.get("_score"), - "indexed_at": datetime.now().isoformat(), - "source": "ELASTICSEARCH_CONNECTOR", - } - - # Add any additional metadata fields specified in config - if "ELASTICSEARCH_METADATA_FIELDS" in config: - for field in config["ELASTICSEARCH_METADATA_FIELDS"]: - if field in source: - metadata[f"es_{field}"] = source[field] - # Build source-unique identifier and hash (prefer source id dedupe) source_identifier = f"{hit.get('_index', index_name)}:{doc_id}" unique_identifier_hash = generate_unique_identifier_hash( @@ -258,98 +250,209 @@ async def index_elasticsearch_documents( ) if existing_doc: - # If content is unchanged, skip. Otherwise update the existing document. + # If content is unchanged, skip. Otherwise queue for update. 
if existing_doc.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_doc.status, DocumentStatus.READY): + existing_doc.status = DocumentStatus.ready() logger.info( f"Skipping ES doc {doc_id} — already indexed (doc id {existing_doc.id})" ) - continue - else: - logger.info( - f"Updating existing document {existing_doc.id} for ES doc {doc_id}" - ) - existing_doc.title = title - existing_doc.content = content - existing_doc.content_hash = content_hash - existing_doc.document_metadata = metadata - existing_doc.unique_identifier_hash = unique_identifier_hash - chunks = await create_document_chunks(content) - existing_doc.chunks = chunks - existing_doc.updated_at = get_current_timestamp() - await session.flush() - documents_processed += 1 - if documents_processed % 10 == 0: - await session.commit() + documents_skipped += 1 continue - # Create document + # Queue existing document for update (will be set to processing in Phase 2) + docs_to_process.append({ + 'document': existing_doc, + 'is_new': False, + 'doc_id': doc_id, + 'title': title, + 'content': content, + 'content_hash': content_hash, + 'unique_identifier_hash': unique_identifier_hash, + 'hit': hit, + 'source': source, + }) + hits_collected += 1 + continue + + # Build metadata for new document + metadata = { + "elasticsearch_id": doc_id, + "elasticsearch_index": hit.get("_index", index_name), + "elasticsearch_score": hit.get("_score"), + "source": "ELASTICSEARCH_CONNECTOR", + "connector_id": connector_id, + } + + # Add any additional metadata fields specified in config + if "ELASTICSEARCH_METADATA_FIELDS" in config: + for field in config["ELASTICSEARCH_METADATA_FIELDS"]: + if field in source: + metadata[f"es_{field}"] = source[field] + + # Create new document with PENDING status (visible in UI immediately) document = Document( title=title, - content=content, - content_hash=content_hash, + content="Pending...", # Placeholder 
until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, document_type=DocumentType.ELASTICSEARCH_CONNECTOR, document_metadata=metadata, search_space_id=search_space_id, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - - # Create chunks and attach to document (persist via relationship) - chunks = await create_document_chunks(content) - document.chunks = chunks session.add(document) - await session.flush() + new_documents_created = True + + docs_to_process.append({ + 'document': document, + 'is_new': True, + 'doc_id': doc_id, + 'title': title, + 'content': content, + 'content_hash': content_hash, + 'unique_identifier_hash': unique_identifier_hash, + 'hit': hit, + 'source': source, + }) + hits_collected += 1 + + except Exception as e: + logger.error(f"Error in Phase 1 for ES doc: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([d for d in docs_to_process if d['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(docs_to_process)} documents") + + for item in docs_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_processed) + last_heartbeat_time = current_time + + document = item['document'] + try: + 
# Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Build metadata + metadata = { + "elasticsearch_id": item['doc_id'], + "elasticsearch_index": item['hit'].get("_index", index_name), + "elasticsearch_score": item['hit'].get("_score"), + "indexed_at": datetime.now().isoformat(), + "source": "ELASTICSEARCH_CONNECTOR", + "connector_id": connector_id, + } + + # Add any additional metadata fields specified in config + if "ELASTICSEARCH_METADATA_FIELDS" in config: + for field in config["ELASTICSEARCH_METADATA_FIELDS"]: + if field in item['source']: + metadata[f"es_{field}"] = item['source'][field] + + # Create chunks + chunks = await create_document_chunks(item['content']) + + # Update document to READY with actual content + document.title = item['title'] + document.content = item['content'] + document.content_hash = item['content_hash'] + document.unique_identifier_hash = item['unique_identifier_hash'] + document.document_metadata = metadata + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() documents_processed += 1 + # Batch commit every 10 documents (for ready status updates) if documents_processed % 10 == 0: logger.info( - f"Processed {documents_processed} Elasticsearch documents" + f"Committing batch: {documents_processed} Elasticsearch documents processed so far" ) await session.commit() except Exception as e: - msg = f"Error processing Elasticsearch document {hit.get('_id', 'unknown')}: {e}" + msg = f"Error processing Elasticsearch document {item.get('doc_id', 'unknown')}: {e}" logger.error(msg) - await task_logger.log_task_failure( - log_entry, - "Document processing error", - msg, - { - "document_id": hit.get("_id", "unknown"), - "error_type": type(e).__name__, - }, - ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) 
+ document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") + documents_failed += 1 continue - # Final commit - await session.commit() + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + if update_last_indexed: + connector.last_indexed_at = ( + datetime.now(UTC).isoformat().replace("+00:00", "Z") + ) + + # Final commit for any remaining documents not yet committed in batches + logger.info(f"Final commit: Total {documents_processed} Elasticsearch documents processed") + try: + await session.commit() + logger.info("Successfully committed all Elasticsearch document changes to database") + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same document was indexed by multiple connectors. " + f"Rolling back and continuing. 
Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None await task_logger.log_task_success( log_entry, f"Successfully indexed {documents_processed} documents from Elasticsearch", - {"documents_indexed": documents_processed, "index": index_name}, + { + "documents_indexed": documents_processed, + "documents_skipped": documents_skipped, + "documents_failed": documents_failed, + "index": index_name, + }, ) logger.info( - f"Successfully indexed {documents_processed} documents from Elasticsearch" + f"Elasticsearch indexing completed: {documents_processed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed" ) - # Update last indexed timestamp if requested - if update_last_indexed and documents_processed > 0: - # connector.last_indexed_at = datetime.now() - connector.last_indexed_at = ( - datetime.now(UTC).isoformat().replace("+00:00", "Z") - ) - await session.commit() - await task_logger.log_task_progress( - log_entry, - "Updated connector.last_indexed_at", - {"last_indexed_at": connector.last_indexed_at}, - ) - - return documents_processed, None + return documents_processed, warning_message finally: # Clean up Elasticsearch connection diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py index f4527843c..80d4ef3cf 100644 --- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py @@ -1,5 +1,9 @@ """ Luma connector indexer. 
+ +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Collect all events and create pending documents (visible in UI immediately) +- Phase 2: Process each event: pending → processing → ready/failed """ import time @@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.luma_connector import LumaConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -27,6 +31,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -227,21 +232,22 @@ async def index_luma_events( logger.error(f"Error fetching Luma events: {e!s}", exc_info=True) return 0, f"Error fetching Luma events: {e!s}" + # ======================================================================= + # PHASE 1: Analyze all events, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= documents_indexed = 0 documents_skipped = 0 + documents_failed = 0 skipped_events = [] # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + events_to_process = [] # List of dicts with document and event data + new_documents_created = False + for event in events: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() try: # Luma event structure fields - events have nested 'event' field event_data = event.get("event", {}) @@ 
-298,91 +304,34 @@ async def index_luma_events( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() logger.info( f"Document for Luma event {event_name} unchanged. Skipping." ) documents_skipped += 1 continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for Luma event {event_name}. Updating document." - ) - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "event_name": event_name, - "event_url": event_url, - "start_at": start_at, - "end_at": end_at, - "timezone": timezone, - "location": location or "No location", - "city": city, - "hosts": host_names, - "document_type": "Luma Event", - "connector_type": "Luma", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - event_markdown, user_llm, document_metadata - ) - else: - summary_content = f"Luma Event: {event_name}\n\n" - if event_url: - summary_content += f"URL: {event_url}\n" - summary_content += f"Start: {start_at}\n" - summary_content += f"End: {end_at}\n" - if timezone: - summary_content += f"Timezone: {timezone}\n" - if location: - summary_content += f"Location: {location}\n" - if city: - summary_content += f"City: {city}\n" - if host_names: - summary_content += f"Hosts: {host_names}\n" - if description: - desc_preview = description[:1000] - if len(description) > 1000: - desc_preview += "..." 
- summary_content += f"Description: {desc_preview}\n" - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(event_markdown) - - # Update existing document - existing_document.title = event_name - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "event_id": event_id, - "event_name": event_name, - "event_url": event_url, - "start_at": start_at, - "end_at": end_at, - "timezone": timezone, - "location": location, - "city": city, - "hosts": host_names, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_indexed += 1 - logger.info(f"Successfully updated Luma event {event_name}") - continue + # Queue existing document for update (will be set to processing in Phase 2) + events_to_process.append({ + 'document': existing_document, + 'is_new': False, + 'event_id': event_id, + 'event_name': event_name, + 'event_url': event_url, + 'event_markdown': event_markdown, + 'content_hash': content_hash, + 'start_at': start_at, + 'end_at': end_at, + 'timezone': timezone, + 'location': location, + 'city': city, + 'host_names': host_names, + 'description': description, + 'cover_url': cover_url, + }) + continue # Document doesn't exist by unique_identifier_hash # Check if a document with the same content_hash exists (from another connector) @@ -400,59 +349,7 @@ async def index_luma_events( documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "event_id": event_id, - "event_name": event_name, - "event_url": event_url, - "start_at": start_at, - "end_at": end_at, - 
"timezone": timezone, - "location": location or "No location", - "city": city, - "hosts": host_names, - "document_type": "Luma Event", - "connector_type": "Luma", - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - event_markdown, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = f"Luma Event: {event_name}\n\n" - if event_url: - summary_content += f"URL: {event_url}\n" - summary_content += f"Start: {start_at}\n" - summary_content += f"End: {end_at}\n" - if timezone: - summary_content += f"Timezone: {timezone}\n" - if location: - summary_content += f"Location: {location}\n" - if city: - summary_content += f"City: {city}\n" - if host_names: - summary_content += f"Hosts: {host_names}\n" - if description: - desc_preview = description[:1000] - if len(description) > 1000: - desc_preview += "..." - summary_content += f"Description: {desc_preview}\n" - - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - chunks = await create_document_chunks(event_markdown) - + # Create new document with PENDING status (visible in UI immediately) document = Document( search_space_id=search_space_id, title=event_name, @@ -468,23 +365,147 @@ async def index_luma_events( "city": city, "hosts": host_names, "cover_url": cover_url, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, }, - content=summary_content, - content_hash=content_hash, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts updated_at=get_current_timestamp(), created_by_id=user_id, connector_id=connector_id, ) - session.add(document) - 
documents_indexed += 1 - logger.info(f"Successfully indexed new event {event_name}") + new_documents_created = True - # Batch commit every 10 documents + events_to_process.append({ + 'document': document, + 'is_new': True, + 'event_id': event_id, + 'event_name': event_name, + 'event_url': event_url, + 'event_markdown': event_markdown, + 'content_hash': content_hash, + 'start_at': start_at, + 'end_at': end_at, + 'timezone': timezone, + 'location': location, + 'city': city, + 'host_names': host_names, + 'description': description, + 'cover_url': cover_url, + }) + + except Exception as e: + logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(events_to_process)} documents") + + for item in events_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed) + last_heartbeat_time = current_time + + document = item['document'] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Heavy processing (LLM, embeddings, chunks) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) + + if user_llm: + document_metadata_for_summary = { + "event_id": item['event_id'], + "event_name": item['event_name'], + 
"event_url": item['event_url'], + "start_at": item['start_at'], + "end_at": item['end_at'], + "timezone": item['timezone'], + "location": item['location'] or "No location", + "city": item['city'], + "hosts": item['host_names'], + "document_type": "Luma Event", + "connector_type": "Luma", + } + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item['event_markdown'], user_llm, document_metadata_for_summary + ) + else: + # Fallback to simple summary if no LLM configured + summary_content = f"Luma Event: {item['event_name']}\n\n" + if item['event_url']: + summary_content += f"URL: {item['event_url']}\n" + summary_content += f"Start: {item['start_at']}\n" + summary_content += f"End: {item['end_at']}\n" + if item['timezone']: + summary_content += f"Timezone: {item['timezone']}\n" + if item['location']: + summary_content += f"Location: {item['location']}\n" + if item['city']: + summary_content += f"City: {item['city']}\n" + if item['host_names']: + summary_content += f"Hosts: {item['host_names']}\n" + if item['description']: + desc_preview = item['description'][:1000] + if len(item['description']) > 1000: + desc_preview += "..." 
+ summary_content += f"Description: {desc_preview}\n" + + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) + + chunks = await create_document_chunks(item['event_markdown']) + + # Update document to READY with actual content + document.title = item['event_name'] + document.content = summary_content + document.content_hash = item['content_hash'] + document.embedding = summary_embedding + document.document_metadata = { + "event_id": item['event_id'], + "event_name": item['event_name'], + "event_url": item['event_url'], + "start_at": item['start_at'], + "end_at": item['end_at'], + "timezone": item['timezone'], + "location": item['location'], + "city": item['city'], + "hosts": item['host_names'], + "cover_url": item['cover_url'], + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + documents_indexed += 1 + + # Batch commit every 10 documents (for ready status updates) if documents_indexed % 10 == 0: logger.info( f"Committing batch: {documents_indexed} Luma events processed so far" @@ -493,38 +514,69 @@ async def index_luma_events( except Exception as e: logger.error( - f"Error processing event {event.get('name', 'Unknown')}: {e!s}", + f"Error processing event {item.get('event_name', 'Unknown')}: {e!s}", exc_info=True, ) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") skipped_events.append( - f"{event.get('name', 'Unknown')} (processing error)" + f"{item.get('event_name', 'Unknown')} (processing error)" ) - documents_skipped += 1 + documents_failed += 1 continue - total_processed = documents_indexed - if total_processed > 0: - await 
update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + # This ensures the UI shows "Last indexed" instead of "Never indexed" + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info(f"Final commit: Total {documents_indexed} Luma events processed") - await session.commit() + try: + await session.commit() + logger.info("Successfully committed all Luma document changes to database") + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same event was indexed by multiple connectors. " + f"Rolling back and continuing. 
Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None await task_logger.log_task_success( log_entry, f"Successfully completed Luma indexing for connector {connector_id}", { - "events_processed": total_processed, + "events_processed": documents_indexed, "documents_indexed": documents_indexed, "documents_skipped": documents_skipped, + "documents_failed": documents_failed, "skipped_events_count": len(skipped_events), }, ) logger.info( - f"Luma indexing completed: {documents_indexed} new events, {documents_skipped} skipped" + f"Luma indexing completed: {documents_indexed} ready, " + f"{documents_skipped} skipped, {documents_failed} failed" ) - return total_processed, None + return documents_indexed, warning_message except SQLAlchemyError as db_error: await session.rollback() From 629f6f9cf5e42b63ec50193b9c694da6056bab40 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 04:35:13 +0530 Subject: [PATCH 27/36] feat: implement two-phase document indexing for Obsidian and Circleback connectors with real-time status updates --- .../connector_indexers/obsidian_indexer.py | 362 ++++++++++++------ .../app/tasks/document_processors/base.py | 28 ++ .../circleback_processor.py | 196 ++++++---- 3 files changed, 394 insertions(+), 192 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py index cfc321df1..0e6934e2c 100644 --- a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py @@ -3,6 +3,10 @@ Obsidian connector 
indexer. Indexes markdown notes from a local Obsidian vault. This connector is only available in self-hosted mode. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import os @@ -17,7 +21,7 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.config import config -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -34,6 +38,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -307,25 +312,22 @@ async def index_obsidian_vault( logger.info(f"Processing {len(files)} files after date filtering") - # Get LLM for summarization - long_context_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - indexed_count = 0 skipped_count = 0 + failed_count = 0 + duplicate_content_count = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Analyze all files, create pending documents + # This makes ALL documents visible in the UI immediately with pending status + # ======================================================================= + files_to_process = [] # List of dicts with document and file data + new_documents_created = False + for file_info in files: - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await 
on_heartbeat_callback(indexed_count) - last_heartbeat_time = time.time() try: file_path = file_info["path"] relative_path = file_info["relative_path"] @@ -368,13 +370,143 @@ async def index_obsidian_vault( search_space_id, ) + # Generate content hash + content_hash = generate_content_hash(content, search_space_id) + # Check for existing document existing_document = await check_document_by_unique_identifier( session, unique_identifier_hash ) - # Generate content hash - content_hash = generate_content_hash(content, search_space_id) + if existing_document: + # Document exists - check if content has changed + if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() + logger.debug(f"Note {title} unchanged, skipping") + skipped_count += 1 + continue + + # Queue existing document for update (will be set to processing in Phase 2) + files_to_process.append({ + 'document': existing_document, + 'is_new': False, + 'file_info': file_info, + 'content': content, + 'body_content': body_content, + 'frontmatter': frontmatter, + 'wiki_links': wiki_links, + 'tags': tags, + 'title': title, + 'relative_path': relative_path, + 'content_hash': content_hash, + 'unique_identifier_hash': unique_identifier_hash, + }) + continue + + # Document doesn't exist by unique_identifier_hash + # Check if a document with the same content_hash exists (from another connector) + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + logger.info( + f"Obsidian note {title} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}, " + f"type: {duplicate_by_content.document_type}). Skipping." 
+ ) + duplicate_content_count += 1 + skipped_count += 1 + continue + + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=title, + document_type=DocumentType.OBSIDIAN_CONNECTOR, + document_metadata={ + "vault_name": vault_name, + "file_path": relative_path, + "connector_id": connector_id, + }, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending until processing starts + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + new_documents_created = True + + files_to_process.append({ + 'document': document, + 'is_new': True, + 'file_info': file_info, + 'content': content, + 'body_content': body_content, + 'frontmatter': frontmatter, + 'wiki_links': wiki_links, + 'tags': tags, + 'title': title, + 'relative_path': relative_path, + 'content_hash': content_hash, + 'unique_identifier_hash': unique_identifier_hash, + }) + + except Exception as e: + logger.exception( + f"Error in Phase 1 for file {file_info.get('path', 'unknown')}: {e}" + ) + failed_count += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each document one by one + # Each document transitions: pending → processing → ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(files_to_process)} documents") + + # Get LLM for summarization + long_context_llm = await 
get_user_long_context_llm( + session, user_id, search_space_id + ) + + for item in files_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(indexed_count) + last_heartbeat_time = current_time + + document = item['document'] + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() + + # Extract data from item + title = item['title'] + relative_path = item['relative_path'] + content = item['content'] + body_content = item['body_content'] + frontmatter = item['frontmatter'] + wiki_links = item['wiki_links'] + tags = item['tags'] + content_hash = item['content_hash'] + file_info = item['file_info'] # Build metadata document_metadata = { @@ -404,134 +536,114 @@ async def index_obsidian_vault( ] document_string = build_document_metadata_string(metadata_sections) - if existing_document: - # Check if content has changed - if existing_document.content_hash == content_hash: - logger.debug(f"Note {title} unchanged, skipping") - skipped_count += 1 - continue - - # Update existing document - logger.info(f"Updating note: {title}") - - # Generate new summary if content changed - if long_context_llm: - new_summary, _ = await generate_document_summary( - document_string, - long_context_llm, - document_metadata, - ) - # Store summary in metadata - document_metadata["summary"] = new_summary - - # Add URL and connector_id to metadata - document_metadata["url"] = ( - f"obsidian://{vault_name}/{relative_path}" - ) - document_metadata["connector_id"] = connector_id - - existing_document.content = document_string - existing_document.content_hash = content_hash - existing_document.document_metadata = document_metadata - existing_document.updated_at = get_current_timestamp() - - # Update embedding - embedding = 
config.embedding_model_instance.embed(document_string) - existing_document.embedding = embedding - - # Update chunks - delete old and create new - existing_document.chunks.clear() - new_chunks = await create_document_chunks(document_string) - existing_document.chunks = new_chunks - - indexed_count += 1 - - else: - # Document doesn't exist by unique_identifier_hash - # Check if a document with the same content_hash exists (from another connector) - with session.no_autoflush: - duplicate_by_content = await check_duplicate_document_by_hash( - session, content_hash - ) - - if duplicate_by_content: - logger.info( - f"Obsidian note {title} already indexed by another connector " - f"(existing document ID: {duplicate_by_content.id}, " - f"type: {duplicate_by_content.document_type}). Skipping." - ) - skipped_count += 1 - continue - - # Create new document - logger.info(f"Indexing new note: {title}") - - # Generate summary - summary_content = "" - if long_context_llm: - summary_content, _ = await generate_document_summary( - document_string, - long_context_llm, - document_metadata, - ) - - # Generate embedding - embedding = config.embedding_model_instance.embed(document_string) - - # Add URL and summary to metadata - document_metadata["url"] = ( - f"obsidian://{vault_name}/{relative_path}" - ) - document_metadata["summary"] = summary_content - document_metadata["connector_id"] = connector_id - - # Create chunks - chunks = await create_document_chunks(document_string) - - # Create document - new_document = Document( - search_space_id=search_space_id, - title=title, - document_type=DocumentType.OBSIDIAN_CONNECTOR, - content=document_string, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - document_metadata=document_metadata, - embedding=embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, + # Generate summary + summary_content = "" + if long_context_llm: + summary_content, _ = 
await generate_document_summary( + document_string, + long_context_llm, + document_metadata, ) - session.add(new_document) + # Generate embedding + embedding = config.embedding_model_instance.embed(document_string) - indexed_count += 1 + # Add URL and summary to metadata + document_metadata["url"] = f"obsidian://{vault_name}/{relative_path}" + document_metadata["summary"] = summary_content + document_metadata["connector_id"] = connector_id + + # Create chunks + chunks = await create_document_chunks(document_string) + + # Update document to READY with actual content + document.title = title + document.content = document_string + document.content_hash = content_hash + document.embedding = embedding + document.document_metadata = document_metadata + safe_set_chunks(document, chunks) + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + + indexed_count += 1 + + # Batch commit every 10 documents (for ready status updates) + if indexed_count % 10 == 0: + logger.info( + f"Committing batch: {indexed_count} Obsidian notes processed so far" + ) + await session.commit() except Exception as e: logger.exception( - f"Error processing file {file_info.get('path', 'unknown')}: {e}" + f"Error processing file {item.get('file_info', {}).get('path', 'unknown')}: {e}" ) - skipped_count += 1 + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") + failed_count += 1 continue - # Update connector's last indexed timestamp + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs await update_connector_last_indexed(session, connector, update_last_indexed) - # Commit all changes - await session.commit() + # Final commit for any remaining documents not yet committed in batches + logger.info( + f"Final 
commit: Total {indexed_count} Obsidian notes processed" + ) + try: + await session.commit() + logger.info( + "Successfully committed all Obsidian document changes to database" + ) + except Exception as e: + # Handle any remaining integrity errors gracefully (race conditions, etc.) + if ( + "duplicate key value violates unique constraint" in str(e).lower() + or "uniqueviolationerror" in str(e).lower() + ): + logger.warning( + f"Duplicate content_hash detected during final commit. " + f"This may occur if the same note was indexed by multiple connectors. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + # Don't fail the entire task - some documents may have been successfully indexed + else: + raise + + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if failed_count > 0: + warning_parts.append(f"{failed_count} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None + + total_processed = indexed_count await task_logger.log_task_success( log_entry, - f"Successfully indexed {indexed_count} Obsidian notes (skipped {skipped_count})", + f"Successfully completed Obsidian vault indexing for connector {connector_id}", { - "indexed_count": indexed_count, - "skipped_count": skipped_count, - "total_files": len(files), + "notes_processed": total_processed, + "documents_indexed": indexed_count, + "documents_skipped": skipped_count, + "documents_failed": failed_count, + "duplicate_content_count": duplicate_content_count, }, ) - return indexed_count, None + logger.info( + f"Obsidian vault indexing completed: {indexed_count} ready, " + f"{skipped_count} skipped, {failed_count} failed " + f"({duplicate_content_count} duplicate content)" + ) + return total_processed, warning_message except SQLAlchemyError as e: logger.exception(f"Database error during Obsidian indexing: {e}") diff --git 
a/surfsense_backend/app/tasks/document_processors/base.py b/surfsense_backend/app/tasks/document_processors/base.py index f29207448..c8046868c 100644 --- a/surfsense_backend/app/tasks/document_processors/base.py +++ b/surfsense_backend/app/tasks/document_processors/base.py @@ -14,6 +14,34 @@ from app.db import Document md = MarkdownifyTransformer() +def safe_set_chunks(document: Document, chunks: list) -> None: + """ + Safely assign chunks to a document without triggering lazy loading. + + ALWAYS use this instead of `document.chunks = chunks` to avoid + SQLAlchemy async errors (MissingGreenlet / greenlet_spawn). + + Why this is needed: + - Direct assignment `document.chunks = chunks` triggers SQLAlchemy to + load the OLD chunks first (for comparison/orphan detection) + - This lazy loading fails in async context with asyncpg driver + - set_committed_value bypasses this by setting the value directly + + This function is safe regardless of how the document was loaded + (with or without selectinload). + + Args: + document: The Document object to update + chunks: List of Chunk objects to assign + + Example: + # Instead of: document.chunks = chunks (DANGEROUS!) + safe_set_chunks(document, chunks) # Always safe + """ + from sqlalchemy.orm.attributes import set_committed_value + set_committed_value(document, 'chunks', chunks) + + def get_current_timestamp() -> datetime: """ Get the current timestamp with timezone for updated_at field. diff --git a/surfsense_backend/app/tasks/document_processors/circleback_processor.py b/surfsense_backend/app/tasks/document_processors/circleback_processor.py index f412b51dd..e9c395c83 100644 --- a/surfsense_backend/app/tasks/document_processors/circleback_processor.py +++ b/surfsense_backend/app/tasks/document_processors/circleback_processor.py @@ -3,6 +3,11 @@ Circleback meeting document processor. This module processes meeting data received from Circleback webhooks and stores it as searchable documents in the database. 
+ +Implements real-time document status updates for UI feedback: +- Create document with 'pending' status (visible in UI immediately) +- Set to 'processing' while processing content +- Set to 'ready' or 'failed' when complete """ import logging @@ -14,6 +19,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.db import ( Document, + DocumentStatus, DocumentType, SearchSourceConnector, SearchSourceConnectorType, @@ -30,6 +36,7 @@ from app.utils.document_converters import ( from .base import ( check_document_by_unique_identifier, get_current_timestamp, + safe_set_chunks, ) logger = logging.getLogger(__name__) @@ -47,6 +54,11 @@ async def add_circleback_meeting_document( """ Process and store a Circleback meeting document. + Implements real-time document status updates: + - Phase 1: Create document with 'pending' status (visible in UI immediately) + - Phase 2: Set to 'processing' while processing content + - Phase 3: Set to 'ready' or 'failed' when complete + Args: session: Database session meeting_id: Circleback meeting ID @@ -59,6 +71,7 @@ async def add_circleback_meeting_document( Returns: Document object if successful, None if failed or duplicate """ + document = None try: # Generate unique identifier hash using Circleback meeting ID unique_identifier = f"circleback_{meeting_id}" @@ -77,6 +90,10 @@ async def add_circleback_meeting_document( if existing_document: # Document exists - check if content has changed if existing_document.content_hash == content_hash: + # Ensure status is ready (might have been stuck in processing/pending) + if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + existing_document.status = DocumentStatus.ready() + await session.commit() logger.info(f"Circleback meeting {meeting_id} unchanged. Skipping.") return existing_document else: @@ -84,7 +101,79 @@ async def add_circleback_meeting_document( logger.info( f"Content changed for Circleback meeting {meeting_id}. Updating document." 
) + document = existing_document + # Set to PROCESSING status and commit - shows "processing" in UI + document.status = DocumentStatus.processing() + await session.commit() + else: + # ======================================================================= + # PHASE 1: Create document with PENDING status + # This makes the document visible in the UI immediately + # ======================================================================= + + # Fetch the user who set up the Circleback connector (preferred) + # or fall back to search space owner if no connector found + created_by_user_id = None + # Try to find the Circleback connector for this search space + connector_result = await session.execute( + select(SearchSourceConnector.user_id).where( + SearchSourceConnector.search_space_id == search_space_id, + SearchSourceConnector.connector_type + == SearchSourceConnectorType.CIRCLEBACK_CONNECTOR, + ) + ) + connector_user = connector_result.scalar_one_or_none() + + if connector_user: + # Use the user who set up the Circleback connector + created_by_user_id = connector_user + else: + # Fallback: use search space owner if no connector found + search_space_result = await session.execute( + select(SearchSpace.user_id).where(SearchSpace.id == search_space_id) + ) + created_by_user_id = search_space_result.scalar_one_or_none() + + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=meeting_name, + document_type=DocumentType.CIRCLEBACK, + document_metadata={ + "CIRCLEBACK_MEETING_ID": meeting_id, + "MEETING_NAME": meeting_name, + "SOURCE": "CIRCLEBACK_WEBHOOK", + "connector_id": connector_id, + }, + content="Pending...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary unique value - updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # Pending 
until processing starts + content_needs_reindexing=False, + updated_at=get_current_timestamp(), + created_by_id=created_by_user_id, + connector_id=connector_id, + ) + session.add(document) + # Commit immediately so document appears in UI with pending status + await session.commit() + logger.info( + f"Created pending Circleback meeting document {meeting_id} in search space {search_space_id}" + ) + + # ======================================================================= + # PHASE 2: Set to PROCESSING status + # ======================================================================= + document.status = DocumentStatus.processing() + await session.commit() + + # ======================================================================= + # PHASE 3: Process the document content + # ======================================================================= + # Get LLM for generating summary llm = await get_document_summary_llm(session, search_space_id) if not llm: @@ -100,7 +189,7 @@ async def add_circleback_meeting_document( summary_embedding = None else: # Generate summary with metadata - document_metadata = { + summary_metadata = { "meeting_name": meeting_name, "meeting_id": meeting_id, "document_type": "Circleback Meeting", @@ -111,7 +200,7 @@ async def add_circleback_meeting_document( }, } summary_content, summary_embedding = await generate_document_summary( - markdown_content, llm, document_metadata + markdown_content, llm, summary_metadata ) # Process chunks @@ -126,7 +215,7 @@ async def add_circleback_meeting_document( f"Failed to convert Circleback meeting {meeting_id} to BlockNote JSON, document will not be editable" ) - # Prepare document metadata + # Prepare final document metadata document_metadata = { "CIRCLEBACK_MEETING_ID": meeting_id, "MEETING_NAME": meeting_name, @@ -134,77 +223,34 @@ async def add_circleback_meeting_document( **metadata, } - # Fetch the user who set up the Circleback connector (preferred) - # or fall back to search space owner if no 
connector found - created_by_user_id = None + # ======================================================================= + # PHASE 4: Update document to READY status with actual content + # ======================================================================= + document.title = meeting_name + document.content = summary_content + document.content_hash = content_hash + if summary_embedding is not None: + document.embedding = summary_embedding + document.document_metadata = document_metadata + safe_set_chunks(document, chunks) + document.blocknote_document = blocknote_json + document.content_needs_reindexing = False + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() + # Ensure connector_id is set (backfill for documents created before this field) + if connector_id is not None: + document.connector_id = connector_id - # Try to find the Circleback connector for this search space - connector_result = await session.execute( - select(SearchSourceConnector.user_id).where( - SearchSourceConnector.search_space_id == search_space_id, - SearchSourceConnector.connector_type - == SearchSourceConnectorType.CIRCLEBACK_CONNECTOR, - ) - ) - connector_user = connector_result.scalar_one_or_none() - - if connector_user: - # Use the user who set up the Circleback connector - created_by_user_id = connector_user - else: - # Fallback: use search space owner if no connector found - search_space_result = await session.execute( - select(SearchSpace.user_id).where(SearchSpace.id == search_space_id) - ) - created_by_user_id = search_space_result.scalar_one_or_none() - - # Update or create document + await session.commit() + await session.refresh(document) + if existing_document: - # Update existing document - existing_document.title = meeting_name - existing_document.content = summary_content - existing_document.content_hash = content_hash - if summary_embedding is not None: - existing_document.embedding = summary_embedding - 
existing_document.document_metadata = document_metadata - existing_document.chunks = chunks - existing_document.blocknote_document = blocknote_json - existing_document.content_needs_reindexing = False - existing_document.updated_at = get_current_timestamp() - # Ensure connector_id is set (backfill for documents created before this field) - if connector_id is not None: - existing_document.connector_id = connector_id - - await session.commit() - await session.refresh(existing_document) - document = existing_document logger.info( f"Updated Circleback meeting document {meeting_id} in search space {search_space_id}" ) else: - # Create new document - document = Document( - search_space_id=search_space_id, - title=meeting_name, - document_type=DocumentType.CIRCLEBACK, - document_metadata=document_metadata, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - blocknote_document=blocknote_json, - content_needs_reindexing=False, - updated_at=get_current_timestamp(), - created_by_id=created_by_user_id, - connector_id=connector_id, - ) - - session.add(document) - await session.commit() - await session.refresh(document) logger.info( - f"Created new Circleback meeting document {meeting_id} in search space {search_space_id}" + f"Processed Circleback meeting document {meeting_id} in search space {search_space_id} - now ready" ) return document @@ -214,8 +260,24 @@ async def add_circleback_meeting_document( logger.error( f"Database error processing Circleback meeting {meeting_id}: {db_error}" ) + # Mark document as failed if it was created + if document is not None: + try: + document.status = DocumentStatus.failed(str(db_error)) + document.updated_at = get_current_timestamp() + await session.commit() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") raise db_error except Exception as e: await session.rollback() 
logger.error(f"Failed to process Circleback meeting {meeting_id}: {e!s}") + # Mark document as failed if it was created + if document is not None: + try: + document.status = DocumentStatus.failed(str(e)) + document.updated_at = get_current_timestamp() + await session.commit() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") raise RuntimeError(f"Failed to process Circleback meeting: {e!s}") from e From 5d2da0847eaf48ecb1374702699382f80877d069 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 04:54:29 +0530 Subject: [PATCH 28/36] refactor: update connector mutation atoms to handle optional searchSpaceId and improve query invalidation logic --- .../connectors/connector-mutation.atoms.ts | 20 ++++--- .../hooks/use-connector-dialog.ts | 59 ++++++++----------- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/surfsense_web/atoms/connectors/connector-mutation.atoms.ts b/surfsense_web/atoms/connectors/connector-mutation.atoms.ts index 70b5b0322..b928f8631 100644 --- a/surfsense_web/atoms/connectors/connector-mutation.atoms.ts +++ b/surfsense_web/atoms/connectors/connector-mutation.atoms.ts @@ -1,5 +1,4 @@ import { atomWithMutation } from "jotai-tanstack-query"; -import { toast } from "sonner"; import type { CreateConnectorRequest, DeleteConnectorRequest, @@ -17,15 +16,16 @@ export const createConnectorMutationAtom = atomWithMutation((get) => { const searchSpaceId = get(activeSearchSpaceIdAtom); return { - mutationKey: cacheKeys.connectors.all(searchSpaceId!), + mutationKey: cacheKeys.connectors.all(searchSpaceId ?? 
""), enabled: !!searchSpaceId, mutationFn: async (request: CreateConnectorRequest) => { return connectorsApiService.createConnector(request); }, onSuccess: () => { + if (!searchSpaceId) return; queryClient.invalidateQueries({ - queryKey: cacheKeys.connectors.all(searchSpaceId!), + queryKey: cacheKeys.connectors.all(searchSpaceId), }); }, }; @@ -35,15 +35,16 @@ export const updateConnectorMutationAtom = atomWithMutation((get) => { const searchSpaceId = get(activeSearchSpaceIdAtom); return { - mutationKey: cacheKeys.connectors.all(searchSpaceId!), + mutationKey: cacheKeys.connectors.all(searchSpaceId ?? ""), enabled: !!searchSpaceId, mutationFn: async (request: UpdateConnectorRequest) => { return connectorsApiService.updateConnector(request); }, onSuccess: (_, request: UpdateConnectorRequest) => { + if (!searchSpaceId) return; queryClient.invalidateQueries({ - queryKey: cacheKeys.connectors.all(searchSpaceId!), + queryKey: cacheKeys.connectors.all(searchSpaceId), }); queryClient.invalidateQueries({ queryKey: cacheKeys.connectors.byId(String(request.id)), @@ -56,15 +57,16 @@ export const deleteConnectorMutationAtom = atomWithMutation((get) => { const searchSpaceId = get(activeSearchSpaceIdAtom); return { - mutationKey: cacheKeys.connectors.all(searchSpaceId!), + mutationKey: cacheKeys.connectors.all(searchSpaceId ?? 
""), enabled: !!searchSpaceId, mutationFn: async (request: DeleteConnectorRequest) => { return connectorsApiService.deleteConnector(request); }, onSuccess: (_, request: DeleteConnectorRequest) => { + if (!searchSpaceId) return; queryClient.setQueryData( - cacheKeys.connectors.all(searchSpaceId!), + cacheKeys.connectors.all(searchSpaceId), (oldData: GetConnectorsResponse | undefined) => { if (!oldData) return oldData; return oldData.filter((connector) => connector.id !== request.id); @@ -88,9 +90,9 @@ export const indexConnectorMutationAtom = atomWithMutation((get) => { }, onSuccess: (response: IndexConnectorResponse) => { - toast.success(response.message); + if (!searchSpaceId) return; queryClient.invalidateQueries({ - queryKey: cacheKeys.connectors.all(searchSpaceId!), + queryKey: cacheKeys.connectors.all(searchSpaceId), }); queryClient.invalidateQueries({ queryKey: cacheKeys.connectors.byId(String(response.connector_id)), diff --git a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts index 118ca66ce..0ab333457 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts +++ b/surfsense_web/components/assistant-ui/connector-popup/hooks/use-connector-dialog.ts @@ -295,6 +295,7 @@ export const useConnectorDialog = () => { connectingConnectorType, viewingAccountsType, viewingMCPList, + setIsOpen, ]); // Detect OAuth success / Failure and transition to config view @@ -345,12 +346,13 @@ export const useConnectorDialog = () => { const connectorId = parseInt(params.connectorId, 10); newConnector = result.data.find((c: SearchSourceConnector) => c.id === connectorId); - // If we found the connector, find the matching OAuth/Composio connector by type - if (newConnector) { - oauthConnector = - OAUTH_CONNECTORS.find((c) => c.connectorType === newConnector!.connector_type) || - COMPOSIO_CONNECTORS.find((c) => 
c.connectorType === newConnector!.connector_type); - } + // If we found the connector, find the matching OAuth/Composio connector by type + if (newConnector) { + const connectorType = newConnector.connector_type; + oauthConnector = + OAUTH_CONNECTORS.find((c) => c.connectorType === connectorType) || + COMPOSIO_CONNECTORS.find((c) => c.connectorType === connectorType); + } } // If we don't have a connector yet, try to find by connector param @@ -359,11 +361,12 @@ export const useConnectorDialog = () => { OAUTH_CONNECTORS.find((c) => c.id === params.connector) || COMPOSIO_CONNECTORS.find((c) => c.id === params.connector); - if (oauthConnector) { - newConnector = result.data.find( - (c: SearchSourceConnector) => c.connector_type === oauthConnector!.connectorType - ); - } + if (oauthConnector) { + const oauthConnectorType = oauthConnector.connectorType; + newConnector = result.data.find( + (c: SearchSourceConnector) => c.connector_type === oauthConnectorType + ); + } } if (newConnector && oauthConnector) { @@ -401,7 +404,7 @@ export const useConnectorDialog = () => { // Invalid query params - log but don't crash console.warn("Invalid connector popup query params in OAuth success handler:", error); } - }, [searchParams, searchSpaceId, refetchAllConnectors]); + }, [searchParams, searchSpaceId, refetchAllConnectors, setIsOpen]); // Handle OAuth connection const handleConnectOAuth = useCallback( @@ -516,7 +519,7 @@ export const useConnectorDialog = () => { } finally { setConnectingId(null); } - }, [searchSpaceId, createConnector, refetchAllConnectors]); + }, [searchSpaceId, createConnector, refetchAllConnectors, setIsOpen]); // Handle connecting non-OAuth connectors (like Tavily API) const handleConnectNonOAuth = useCallback( @@ -676,15 +679,11 @@ export const useConnectorDialog = () => { }, }); - const successMessage = - currentConnectorType === "MCP_CONNECTOR" - ? 
`${connector.name} added successfully` - : `${connectorTitle} connected and indexing started!`; - toast.success(successMessage, { - description: periodicEnabledForIndexing - ? `Periodic sync enabled every ${getFrequencyLabel(frequencyMinutesForIndexing)}.` - : "You can continue working while we sync your data.", - }); + const successMessage = + currentConnectorType === "MCP_CONNECTOR" + ? `${connector.name} added successfully` + : `${connectorTitle} connected and syncing started!`; + toast.success(successMessage); const url = new URL(window.location.href); url.searchParams.delete("modal"); @@ -784,7 +783,6 @@ export const useConnectorDialog = () => { updateConnector, indexConnector, router, - getFrequencyLabel, ] ); @@ -1012,11 +1010,7 @@ export const useConnectorDialog = () => { ); } - toast.success(`${indexingConfig.connectorTitle} indexing started`, { - description: periodicEnabled - ? `Periodic sync enabled every ${getFrequencyLabel(frequencyMinutes)}.` - : "You can continue working while we sync your data.", - }); + toast.success(`${indexingConfig.connectorTitle} indexing started`); // Update URL - the effect will handle closing the modal and clearing state const url = new URL(window.location.href); @@ -1047,7 +1041,6 @@ export const useConnectorDialog = () => { updateConnector, periodicEnabled, frequencyMinutes, - getFrequencyLabel, router, indexingConnectorConfig, ] @@ -1428,9 +1421,7 @@ export const useConnectorDialog = () => { end_date: endDateStr, }, }); - toast.success("Indexing started", { - description: "You can continue working while we sync your data.", - }); + toast.success("Indexing started"); // Invalidate queries to refresh data queryClient.invalidateQueries({ @@ -1447,7 +1438,7 @@ export const useConnectorDialog = () => { } } }, - [searchSpaceId, indexConnector, queryClient] + [searchSpaceId, indexConnector] ); // Handle going back from edit view @@ -1529,7 +1520,7 @@ export const useConnectorDialog = () => { } } }, - [activeTab, 
isStartingIndexing, isDisconnecting, isSaving, isCreatingConnector] + [activeTab, isStartingIndexing, isDisconnecting, isSaving, isCreatingConnector, setIsOpen] ); // Handle tab change From cc1e796c1295b1650803c855fa603db1442fab14 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 04:54:50 +0530 Subject: [PATCH 29/36] feat: implement two-phase document indexing for webcrawler and YouTube video processors with real-time status updates --- .../connector_indexers/webcrawler_indexer.py | 396 ++++++++++-------- .../document_processors/youtube_processor.py | 265 +++++++----- 2 files changed, 375 insertions(+), 286 deletions(-) diff --git a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py index cb11a6ec2..5d25b4623 100644 --- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py @@ -1,5 +1,9 @@ """ Webcrawler connector indexer. 
+ +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create all documents with 'pending' status (visible in UI immediately) +- Phase 2: Process each document: pending → processing → ready/failed """ import time @@ -11,7 +15,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.config import config from app.connectors.webcrawler_connector import WebCrawlerConnector -from app.db import Document, DocumentType, SearchSourceConnectorType +from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -28,6 +32,7 @@ from .base import ( get_connector_by_id, get_current_timestamp, logger, + safe_set_chunks, update_connector_last_indexed, ) @@ -49,7 +54,11 @@ async def index_crawled_urls( on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, str | None]: """ - Index web page URLs. + Index web page URLs with real-time document status updates. 
+ + Implements 2-phase approach for real-time UI feedback: + - Phase 1: Create all documents with 'pending' status (visible in UI immediately) + - Phase 2: Process each document: pending → processing → ready/failed Args: session: Database session @@ -138,9 +147,9 @@ async def index_crawled_urls( await task_logger.log_task_progress( log_entry, - f"Starting to crawl {len(urls)} URLs", + f"Starting to process {len(urls)} URLs", { - "stage": "crawling", + "stage": "processing", "total_urls": len(urls), }, ) @@ -148,28 +157,118 @@ async def index_crawled_urls( documents_indexed = 0 documents_updated = 0 documents_skipped = 0 - failed_urls = [] + documents_failed = 0 + duplicate_content_count = 0 # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() - for idx, url in enumerate(urls, 1): - # Check if it's time for a heartbeat update - if ( - on_heartbeat_callback - and (time.time() - last_heartbeat_time) >= HEARTBEAT_INTERVAL_SECONDS - ): - await on_heartbeat_callback(documents_indexed) - last_heartbeat_time = time.time() + # ======================================================================= + # PHASE 1: Analyze all URLs, create pending documents for new ones + # This makes ALL new documents visible in the UI immediately with pending status + # ======================================================================= + urls_to_process = [] # List of dicts with document and URL data + new_documents_created = False + + for url in urls: try: - logger.info(f"Processing URL {idx}/{len(urls)}: {url}") + # Generate unique identifier hash for this URL + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.CRAWLED_URL, url, search_space_id + ) + + # Check if document with this unique identifier already exists + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + if existing_document: + # Document exists - check if it's already being processed + 
if DocumentStatus.is_state(existing_document.status, DocumentStatus.PENDING): + logger.info(f"URL {url} already pending. Skipping.") + documents_skipped += 1 + continue + if DocumentStatus.is_state(existing_document.status, DocumentStatus.PROCESSING): + logger.info(f"URL {url} already processing. Skipping.") + documents_skipped += 1 + continue + + # Queue existing document for potential update check + urls_to_process.append({ + 'document': existing_document, + 'is_new': False, + 'url': url, + 'unique_identifier_hash': unique_identifier_hash, + }) + continue + + # Create new document with PENDING status (visible in UI immediately) + document = Document( + search_space_id=search_space_id, + title=url[:100], # Placeholder - URL as title (truncated) + document_type=DocumentType.CRAWLED_URL, + document_metadata={ + "url": url, + "connector_id": connector_id, + }, + content="Pending crawl...", # Placeholder content + content_hash=unique_identifier_hash, # Temporary unique value + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation - safe for async + status=DocumentStatus.pending(), # PENDING status - visible in UI + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector_id, + ) + session.add(document) + new_documents_created = True + + urls_to_process.append({ + 'document': document, + 'is_new': True, + 'url': url, + 'unique_identifier_hash': unique_identifier_hash, + }) + + except Exception as e: + logger.error(f"Error in Phase 1 for URL {url}: {e!s}", exc_info=True) + documents_failed += 1 + continue + + # Commit all pending documents - they all appear in UI now + if new_documents_created: + logger.info(f"Phase 1: Committing {len([u for u in urls_to_process if u['is_new']])} pending documents") + await session.commit() + + # ======================================================================= + # PHASE 2: Process each URL one by one + # Each document transitions: pending → processing → 
ready/failed + # ======================================================================= + logger.info(f"Phase 2: Processing {len(urls_to_process)} URLs") + + for item in urls_to_process: + # Send heartbeat periodically + if on_heartbeat_callback: + current_time = time.time() + if current_time - last_heartbeat_time >= HEARTBEAT_INTERVAL_SECONDS: + await on_heartbeat_callback(documents_indexed + documents_updated) + last_heartbeat_time = current_time + + document = item['document'] + url = item['url'] + is_new = item['is_new'] + + try: + # Set to PROCESSING and commit - shows "processing" in UI for THIS document only + document.status = DocumentStatus.processing() + await session.commit() await task_logger.log_task_progress( log_entry, - f"Crawling URL {idx}/{len(urls)}: {url}", + f"Crawling URL: {url}", { "stage": "crawling_url", - "url_index": idx, "url": url, }, ) @@ -179,7 +278,10 @@ async def index_crawled_urls( if error or not crawl_result: logger.warning(f"Failed to crawl URL {url}: {error}") - failed_urls.append((url, error or "Unknown error")) + document.status = DocumentStatus.failed(error or "Crawl failed") + document.updated_at = get_current_timestamp() + await session.commit() + documents_failed += 1 continue # Extract content and metadata @@ -189,23 +291,16 @@ async def index_crawled_urls( if not content.strip(): logger.warning(f"Skipping URL with no content: {url}") - failed_urls.append((url, "No content extracted")) - documents_skipped += 1 + document.status = DocumentStatus.failed("No content extracted") + document.updated_at = get_current_timestamp() + await session.commit() + documents_failed += 1 continue - # Format content as structured document for summary generation (includes all metadata) - structured_document = crawler.format_to_structured_document( - crawl_result - ) - - # Generate unique identifier hash for this URL - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.CRAWLED_URL, url, search_space_id - ) + # Format 
content as structured document for summary generation + structured_document = crawler.format_to_structured_document(crawl_result) # Generate content hash using a version WITHOUT metadata - # This ensures the hash only changes when actual content changes, - # not when metadata (which contains dynamic fields like timestamps, IDs, etc.) changes structured_document_for_hash = crawler.format_to_structured_document( crawl_result, exclude_metadata=True ) @@ -213,114 +308,51 @@ async def index_crawled_urls( structured_document_for_hash, search_space_id ) - # Check if document with this unique identifier already exists - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - # Extract useful metadata title = metadata.get("title", url) description = metadata.get("description", "") language = metadata.get("language", "") - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - logger.info(f"Document for URL {url} unchanged. Skipping.") - documents_skipped += 1 - continue - else: - # Content has changed - update the existing document - logger.info( - f"Content changed for URL {url}. Updating document." 
- ) + # Update title immediately for better UX + document.title = title + await session.commit() - # Generate summary with metadata - user_llm = await get_user_long_context_llm( - session, user_id, search_space_id - ) - - if user_llm: - document_metadata = { - "url": url, - "title": title, - "description": description, - "language": language, - "document_type": "Crawled URL", - "crawler_type": crawler_type, - } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - structured_document, user_llm, document_metadata - ) - else: - # Fallback to simple summary if no LLM configured - summary_content = f"Crawled URL: {title}\n\n" - summary_content += f"URL: {url}\n" - if description: - summary_content += f"Description: {description}\n" - if language: - summary_content += f"Language: {language}\n" - summary_content += f"Crawler: {crawler_type}\n\n" - - # Add content preview - content_preview = content[:1000] - if len(content) > 1000: - content_preview += "..." - summary_content += f"Content Preview:\n{content_preview}\n" - - summary_embedding = config.embedding_model_instance.embed( - summary_content - ) - - # Process chunks - chunks = await create_document_chunks(content) - - # Update existing document - existing_document.title = title - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - **metadata, - "crawler_type": crawler_type, - "last_crawled_at": datetime.now().strftime( - "%Y-%m-%d %H:%M:%S" - ), - } - existing_document.chunks = chunks - existing_document.updated_at = get_current_timestamp() - - documents_updated += 1 - logger.info(f"Successfully updated URL {url}") - continue - - # Document doesn't exist by unique_identifier_hash - # Check if a document with the same content_hash exists (from another connector) - with session.no_autoflush: - duplicate_by_content = await check_duplicate_document_by_hash( 
- session, content_hash - ) - - if duplicate_by_content: - logger.info( - f"URL {url} already indexed by another connector " - f"(existing document ID: {duplicate_by_content.id}, " - f"type: {duplicate_by_content.document_type}). Skipping." - ) + # For existing documents, check if content has changed + if not is_new and document.content_hash == content_hash: + logger.info(f"Document for URL {url} unchanged. Marking as ready.") + # Ensure status is ready (might have been stuck) + document.status = DocumentStatus.ready() + await session.commit() documents_skipped += 1 continue - # Document doesn't exist - create new one - # Generate summary with metadata + # For new documents, check if duplicate content exists elsewhere + if is_new: + with session.no_autoflush: + duplicate_by_content = await check_duplicate_document_by_hash( + session, content_hash + ) + + if duplicate_by_content: + logger.info( + f"URL {url} already indexed by another connector " + f"(existing document ID: {duplicate_by_content.id}). " + f"Marking as failed." 
+ ) + document.status = DocumentStatus.failed("Duplicate content exists") + document.updated_at = get_current_timestamp() + await session.commit() + duplicate_content_count += 1 + documents_skipped += 1 + continue + + # Generate summary with LLM user_llm = await get_user_long_context_llm( session, user_id, search_space_id ) if user_llm: - document_metadata = { + document_metadata_for_summary = { "url": url, "title": title, "description": description, @@ -328,11 +360,8 @@ async def index_crawled_urls( "document_type": "Crawled URL", "crawler_type": crawler_type, } - ( - summary_content, - summary_embedding, - ) = await generate_document_summary( - structured_document, user_llm, document_metadata + summary_content, summary_embedding = await generate_document_summary( + structured_document, user_llm, document_metadata_for_summary ) else: # Fallback to simple summary if no LLM configured @@ -354,32 +383,32 @@ async def index_crawled_urls( summary_content ) + # Process chunks chunks = await create_document_chunks(content) - document = Document( - search_space_id=search_space_id, - title=title, - document_type=DocumentType.CRAWLED_URL, - document_metadata={ - **metadata, - "crawler_type": crawler_type, - "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - }, - content=summary_content, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - embedding=summary_embedding, - chunks=chunks, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector_id, - ) + # Update document to READY with actual content + document.title = title + document.content = summary_content + document.content_hash = content_hash + document.embedding = summary_embedding + document.document_metadata = { + **metadata, + "crawler_type": crawler_type, + "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "connector_id": connector_id, + } + safe_set_chunks(document, chunks) + document.status = DocumentStatus.ready() # READY status + 
document.updated_at = get_current_timestamp() - session.add(document) - documents_indexed += 1 - logger.info(f"Successfully indexed new URL {url}") + if is_new: + documents_indexed += 1 + else: + documents_updated += 1 - # Batch commit every 10 documents + logger.info(f"Successfully processed URL {url}") + + # Batch commit every 10 documents (for ready status updates) if (documents_indexed + documents_updated) % 10 == 0: logger.info( f"Committing batch: {documents_indexed + documents_updated} URLs processed so far" @@ -387,32 +416,47 @@ async def index_crawled_urls( await session.commit() except Exception as e: - logger.error( - f"Error processing URL {url}: {e!s}", - exc_info=True, - ) - failed_urls.append((url, str(e))) + logger.error(f"Error processing URL {url}: {e!s}", exc_info=True) + # Mark document as failed with reason (visible in UI) + try: + document.status = DocumentStatus.failed(str(e)[:200]) + document.updated_at = get_current_timestamp() + await session.commit() + except Exception as status_error: + logger.error(f"Failed to update document status to failed: {status_error}") + documents_failed += 1 continue total_processed = documents_indexed + documents_updated - if total_processed > 0: - await update_connector_last_indexed(session, connector, update_last_indexed) + # CRITICAL: Always update timestamp (even if 0 documents indexed) so Electric SQL syncs + await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches logger.info( f"Final commit: Total {documents_indexed} new, {documents_updated} updated URLs processed" ) - await session.commit() + try: + await session.commit() + logger.info("Successfully committed all webcrawler document changes to database") + except Exception as e: + # Handle any remaining integrity errors gracefully + if "duplicate key value violates unique constraint" in str(e).lower(): + logger.warning( + f"Duplicate content_hash detected during 
final commit. " + f"Rolling back and continuing. Error: {e!s}" + ) + await session.rollback() + else: + raise - # Log failed URLs if any (for debugging purposes) - if failed_urls: - failed_summary = "; ".join( - [f"{url}: {error}" for url, error in failed_urls[:5]] - ) - if len(failed_urls) > 5: - failed_summary += f" (and {len(failed_urls) - 5} more)" - logger.warning(f"Some URLs failed to index: {failed_summary}") + # Build warning message if there were issues + warning_parts = [] + if duplicate_content_count > 0: + warning_parts.append(f"{duplicate_content_count} duplicate") + if documents_failed > 0: + warning_parts.append(f"{documents_failed} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None await task_logger.log_task_success( log_entry, @@ -422,19 +466,21 @@ async def index_crawled_urls( "documents_indexed": documents_indexed, "documents_updated": documents_updated, "documents_skipped": documents_skipped, - "failed_urls_count": len(failed_urls), + "documents_failed": documents_failed, + "duplicate_content_count": duplicate_content_count, }, ) logger.info( f"Web page indexing completed: {documents_indexed} new, " f"{documents_updated} updated, {documents_skipped} skipped, " - f"{len(failed_urls)} failed" + f"{documents_failed} failed" ) - return ( - total_processed, - None, - ) # Return None on success (result_message is for logging only) + + if warning_message: + return total_processed, f"Completed with issues: {warning_message}" + + return total_processed, None except SQLAlchemyError as db_error: await session.rollback() @@ -482,9 +528,7 @@ async def get_crawled_url_documents( ) if connector_id: - # Filter by connector if needed - you might need to add a connector_id field to Document - # or filter by some other means depending on your schema - pass + query = query.filter(Document.connector_id == connector_id) result = await session.execute(query) documents = result.scalars().all() diff --git 
a/surfsense_backend/app/tasks/document_processors/youtube_processor.py b/surfsense_backend/app/tasks/document_processors/youtube_processor.py index 7251fb22f..19092b592 100644 --- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py +++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py @@ -1,5 +1,9 @@ """ YouTube video document processor. + +Implements 2-phase document status updates for real-time UI feedback: +- Phase 1: Create document with 'pending' status (visible in UI immediately) +- Phase 2: Process document: pending → processing → ready/failed """ import logging @@ -10,7 +14,7 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from youtube_transcript_api import YouTubeTranscriptApi -from app.db import Document, DocumentType +from app.db import Document, DocumentStatus, DocumentType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService from app.utils.document_converters import ( @@ -23,6 +27,7 @@ from app.utils.document_converters import ( from .base import ( check_document_by_unique_identifier, get_current_timestamp, + safe_set_chunks, ) @@ -58,6 +63,10 @@ async def add_youtube_video_document( """ Process a YouTube video URL, extract transcripts, and store as a document. 
+ Implements 2-phase document status updates for real-time UI feedback: + - Phase 1: Create document with 'pending' status (visible in UI immediately) + - Phase 2: Process document: pending → processing → ready/failed + Args: session: Database session for storing the document url: YouTube video URL (supports standard, shortened, and embed formats) @@ -82,15 +91,18 @@ async def add_youtube_video_document( metadata={"url": url, "user_id": str(user_id)}, ) + document = None + video_id = None + is_new_document = False + try: - # Extract video ID from URL + # Extract video ID from URL (lightweight operation) await task_logger.log_task_progress( log_entry, f"Extracting video ID from URL: {url}", {"stage": "video_id_extraction"}, ) - # Get video ID video_id = get_youtube_video_id(url) if not video_id: raise ValueError(f"Could not extract video ID from URL: {url}") @@ -101,13 +113,79 @@ async def add_youtube_video_document( {"stage": "video_id_extracted", "video_id": video_id}, ) - # Get video metadata + # Generate unique identifier hash for this YouTube video + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.YOUTUBE_VIDEO, video_id, search_space_id + ) + + # Check if document with this unique identifier already exists + await task_logger.log_task_progress( + log_entry, + f"Checking for existing video: {video_id}", + {"stage": "duplicate_check", "video_id": video_id}, + ) + + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + # ======================================================================= + # PHASE 1: Create pending document or prepare existing for update + # ======================================================================= + if existing_document: + document = existing_document + is_new_document = False + # Check if already being processed + if DocumentStatus.is_state(existing_document.status, DocumentStatus.PENDING): + logging.info(f"YouTube video {video_id} already pending. 
Returning existing.") + return existing_document + if DocumentStatus.is_state(existing_document.status, DocumentStatus.PROCESSING): + logging.info(f"YouTube video {video_id} already processing. Returning existing.") + return existing_document + else: + # Create new document with PENDING status (visible in UI immediately) + await task_logger.log_task_progress( + log_entry, + f"Creating pending document for video: {video_id}", + {"stage": "pending_document_creation"}, + ) + + document = Document( + title=f"YouTube Video: {video_id}", # Placeholder title + document_type=DocumentType.YOUTUBE_VIDEO, + document_metadata={ + "url": url, + "video_id": video_id, + }, + content="Processing video...", # Placeholder content + content_hash=unique_identifier_hash, # Temporary unique value + unique_identifier_hash=unique_identifier_hash, + embedding=None, + chunks=[], # Empty at creation + status=DocumentStatus.pending(), # PENDING status - visible in UI + search_space_id=search_space_id, + updated_at=get_current_timestamp(), + created_by_id=user_id, + ) + session.add(document) + await session.commit() # Document visible in UI now with pending status! 
+ is_new_document = True + + logging.info(f"Created pending document for YouTube video {video_id}") + + # ======================================================================= + # PHASE 2: Set to PROCESSING and do heavy work + # ======================================================================= + document.status = DocumentStatus.processing() + await session.commit() # UI shows "processing" status + await task_logger.log_task_progress( log_entry, f"Fetching video metadata for: {video_id}", {"stage": "metadata_fetch"}, ) + # Fetch video metadata params = { "format": "json", "url": f"https://www.youtube.com/watch?v={video_id}", @@ -120,6 +198,10 @@ async def add_youtube_video_document( ): video_data = await response.json() + # Update title immediately for better UX (user sees actual title sooner) + document.title = video_data.get("title", f"YouTube Video: {video_id}") + await session.commit() + await task_logger.log_task_progress( log_entry, f"Video metadata fetched: {video_data.get('title', 'Unknown')}", @@ -204,53 +286,26 @@ async def add_youtube_video_document( document_parts.append("") combined_document_string = "\n".join(document_parts) - # Generate unique identifier hash for this YouTube video - unique_identifier_hash = generate_unique_identifier_hash( - DocumentType.YOUTUBE_VIDEO, video_id, search_space_id - ) - # Generate content hash content_hash = generate_content_hash(combined_document_string, search_space_id) - # Check if document with this unique identifier already exists - await task_logger.log_task_progress( - log_entry, - f"Checking for existing video: {video_id}", - {"stage": "duplicate_check", "video_id": video_id}, - ) + # For existing documents, check if content has changed + if not is_new_document and existing_document.content_hash == content_hash: + await task_logger.log_task_success( + log_entry, + f"YouTube video document unchanged: {video_data.get('title', 'YouTube Video')}", + { + "duplicate_detected": True, + "existing_document_id": 
existing_document.id, + "video_id": video_id, + }, + ) + logging.info(f"Document for YouTube video {video_id} unchanged. Marking as ready.") + document.status = DocumentStatus.ready() + await session.commit() + return document - existing_document = await check_document_by_unique_identifier( - session, unique_identifier_hash - ) - - if existing_document: - # Document exists - check if content has changed - if existing_document.content_hash == content_hash: - await task_logger.log_task_success( - log_entry, - f"YouTube video document unchanged: {video_data.get('title', 'YouTube Video')}", - { - "duplicate_detected": True, - "existing_document_id": existing_document.id, - "video_id": video_id, - }, - ) - logging.info( - f"Document for YouTube video {video_id} unchanged. Skipping." - ) - return existing_document - else: - # Content has changed - update the existing document - logging.info( - f"Content changed for YouTube video {video_id}. Updating document." - ) - await task_logger.log_task_progress( - log_entry, - f"Updating YouTube video document: {video_data.get('title', 'YouTube Video')}", - {"stage": "document_update", "video_id": video_id}, - ) - - # Get LLM for summary generation (needed for both create and update) + # Get LLM for summary generation await task_logger.log_task_progress( log_entry, f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}", @@ -272,7 +327,7 @@ async def add_youtube_video_document( ) # Generate summary with metadata - document_metadata = { + document_metadata_for_summary = { "url": url, "video_id": video_id, "title": video_data.get("title", "YouTube Video"), @@ -282,7 +337,7 @@ async def add_youtube_video_document( "has_transcript": "No captions available" not in transcript_text, } summary_content, summary_embedding = await generate_document_summary( - combined_document_string, user_llm, document_metadata + combined_document_string, user_llm, document_metadata_for_summary ) # Process chunks @@ -304,65 +359,33 
@@ async def add_youtube_video_document( chunks = await create_document_chunks(combined_document_string) - # Update or create document - if existing_document: - # Update existing document - await task_logger.log_task_progress( - log_entry, - f"Updating YouTube video document in database: {video_data.get('title', 'YouTube Video')}", - {"stage": "document_update", "chunks_count": len(chunks)}, - ) + # ======================================================================= + # PHASE 3: Update document to READY with all content + # ======================================================================= + await task_logger.log_task_progress( + log_entry, + f"Finalizing document: {video_data.get('title', 'YouTube Video')}", + {"stage": "document_finalization", "chunks_count": len(chunks)}, + ) - existing_document.title = video_data.get("title", "YouTube Video") - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "url": url, - "video_id": video_id, - "video_title": video_data.get("title", "YouTube Video"), - "author": video_data.get("author_name", "Unknown"), - "thumbnail": video_data.get("thumbnail_url", ""), - } - existing_document.chunks = chunks - existing_document.blocknote_document = blocknote_json - existing_document.updated_at = get_current_timestamp() + document.title = video_data.get("title", "YouTube Video") + document.content = summary_content + document.content_hash = content_hash + document.embedding = summary_embedding + document.document_metadata = { + "url": url, + "video_id": video_id, + "video_title": video_data.get("title", "YouTube Video"), + "author": video_data.get("author_name", "Unknown"), + "thumbnail": video_data.get("thumbnail_url", ""), + } + safe_set_chunks(document, chunks) + document.blocknote_document = blocknote_json + document.status = DocumentStatus.ready() # READY status - fully processed + 
document.updated_at = get_current_timestamp() - await session.commit() - await session.refresh(existing_document) - document = existing_document - else: - # Create new document - await task_logger.log_task_progress( - log_entry, - f"Creating YouTube video document in database: {video_data.get('title', 'YouTube Video')}", - {"stage": "document_creation", "chunks_count": len(chunks)}, - ) - - document = Document( - title=video_data.get("title", "YouTube Video"), - document_type=DocumentType.YOUTUBE_VIDEO, - document_metadata={ - "url": url, - "video_id": video_id, - "video_title": video_data.get("title", "YouTube Video"), - "author": video_data.get("author_name", "Unknown"), - "thumbnail": video_data.get("thumbnail_url", ""), - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - search_space_id=search_space_id, - content_hash=content_hash, - unique_identifier_hash=unique_identifier_hash, - blocknote_document=blocknote_json, - updated_at=get_current_timestamp(), - created_by_id=user_id, - ) - - session.add(document) - await session.commit() - await session.refresh(document) + await session.commit() + await session.refresh(document) # Log success await task_logger.log_task_success( @@ -380,27 +403,49 @@ async def add_youtube_video_document( ) return document + except SQLAlchemyError as db_error: - await session.rollback() + # Mark document as failed if it exists + if document: + try: + document.status = DocumentStatus.failed(f"Database error: {str(db_error)[:150]}") + document.updated_at = get_current_timestamp() + await session.commit() + except Exception: + await session.rollback() + else: + await session.rollback() + await task_logger.log_task_failure( log_entry, f"Database error while processing YouTube video: {url}", str(db_error), { "error_type": "SQLAlchemyError", - "video_id": video_id if "video_id" in locals() else None, + "video_id": video_id, }, ) raise db_error + except Exception as e: - await session.rollback() + # Mark document 
as failed if it exists + if document: + try: + document.status = DocumentStatus.failed(str(e)[:200]) + document.updated_at = get_current_timestamp() + await session.commit() + except Exception: + await session.rollback() + else: + await session.rollback() + await task_logger.log_task_failure( log_entry, f"Failed to process YouTube video: {url}", str(e), { "error_type": type(e).__name__, - "video_id": video_id if "video_id" in locals() else None, + "video_id": video_id, }, ) logging.error(f"Failed to process YouTube video: {e!s}") From f56f5a281e86d4dd1730e3f6b7f26b0937231c16 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 05:15:35 +0530 Subject: [PATCH 30/36] fix: disable Edit and Delete actions while processing in RowActions component --- .../documents/(manage)/components/RowActions.tsx | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx index 4133f2960..4f23693ad 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx @@ -53,7 +53,8 @@ export function RowActions({ document.document_type as (typeof NON_DELETABLE_DOCUMENT_TYPES)[number] ); - // Delete is disabled while processing + // Edit and Delete are disabled while processing + const isEditDisabled = isBeingProcessed; const isDeleteDisabled = isBeingProcessed; const handleDelete = async () => { @@ -97,7 +98,11 @@ export function RowActions({ - + !isEditDisabled && handleEdit()} + disabled={isEditDisabled} + className={isEditDisabled ? 
"text-muted-foreground cursor-not-allowed opacity-50" : ""} + > Edit @@ -142,7 +147,11 @@ export function RowActions({ - + !isEditDisabled && handleEdit()} + disabled={isEditDisabled} + className={isEditDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : ""} + > Edit From ed2fc5c6365608ff1a003bc39357fe3a7c40c4bf Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 05:15:47 +0530 Subject: [PATCH 31/36] feat: enhance document upload process with two-phase indexing and real-time status updates --- .../app/routes/documents_routes.py | 99 +++++- .../app/tasks/celery_tasks/document_tasks.py | 292 ++++++++++++++++ .../document_processors/file_processors.py | 314 ++++++++++++++++++ 3 files changed, 694 insertions(+), 11 deletions(-) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index b905ebf91..00c80dcb5 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -113,9 +113,23 @@ async def create_documents_file_upload( user: User = Depends(current_active_user), ): """ - Upload files as documents. + Upload files as documents with real-time status tracking. + + Implements 2-phase document status updates for real-time UI feedback: + - Phase 1: Create all documents with 'pending' status (visible in UI immediately via ElectricSQL) + - Phase 2: Celery processes each file: pending → processing → ready/failed + Requires DOCUMENTS_CREATE permission. 
""" + from datetime import datetime + + from app.db import DocumentStatus + from app.tasks.document_processors.base import ( + check_document_by_unique_identifier, + get_current_timestamp, + ) + from app.utils.document_converters import generate_unique_identifier_hash + try: # Check permission await check_permission( @@ -129,38 +143,101 @@ async def create_documents_file_upload( if not files: raise HTTPException(status_code=400, detail="No files provided") + created_documents: list[Document] = [] + files_to_process: list[tuple[Document, str, str]] = [] # (document, temp_path, filename) + skipped_duplicates = 0 + + # ===== PHASE 1: Create pending documents for all files ===== + # This makes ALL documents visible in the UI immediately with pending status for file in files: try: - # Save file to a temporary location to avoid stream issues import os import tempfile - # Create temp file + # Save file to temp location with tempfile.NamedTemporaryFile( - delete=False, suffix=os.path.splitext(file.filename)[1] + delete=False, suffix=os.path.splitext(file.filename or "")[1] ) as temp_file: temp_path = temp_file.name - # Write uploaded file to temp file content = await file.read() with open(temp_path, "wb") as f: f.write(content) - from app.tasks.celery_tasks.document_tasks import ( - process_file_upload_task, + file_size = len(content) + + # Generate unique identifier for deduplication check + unique_identifier_hash = generate_unique_identifier_hash( + DocumentType.FILE, file.filename or "unknown", search_space_id ) - process_file_upload_task.delay( - temp_path, file.filename, search_space_id, str(user.id) + # Check if document already exists (by unique identifier) + existing = await check_document_by_unique_identifier( + session, unique_identifier_hash ) + if existing: + # Clean up temp file for duplicates + os.unlink(temp_path) + skipped_duplicates += 1 + continue + + # Create pending document (visible immediately in UI via ElectricSQL) + document = Document( + 
search_space_id=search_space_id, + title=file.filename or "Uploaded File", + document_type=DocumentType.FILE, + document_metadata={ + "FILE_NAME": file.filename, + "file_size": file_size, + "upload_time": datetime.now().isoformat(), + }, + content="Processing...", # Placeholder until processed + content_hash=unique_identifier_hash, # Temporary, updated when ready + unique_identifier_hash=unique_identifier_hash, + embedding=None, + status=DocumentStatus.pending(), # Shows "pending" in UI + updated_at=get_current_timestamp(), + created_by_id=str(user.id), + ) + session.add(document) + created_documents.append(document) + files_to_process.append((document, temp_path, file.filename or "unknown")) + except Exception as e: raise HTTPException( status_code=422, detail=f"Failed to process file {file.filename}: {e!s}", ) from e - await session.commit() - return {"message": "Files uploaded for processing"} + # Commit all pending documents - they appear in UI immediately via ElectricSQL + if created_documents: + await session.commit() + # Refresh to get generated IDs + for doc in created_documents: + await session.refresh(doc) + + # ===== PHASE 2: Dispatch Celery tasks for each file ===== + # Each task will update document status: pending → processing → ready/failed + from app.tasks.celery_tasks.document_tasks import ( + process_file_upload_with_document_task, + ) + + for document, temp_path, filename in files_to_process: + process_file_upload_with_document_task.delay( + document_id=document.id, + temp_path=temp_path, + filename=filename, + search_space_id=search_space_id, + user_id=str(user.id), + ) + + return { + "message": "Files uploaded for processing", + "document_ids": [doc.id for doc in created_documents], + "total_files": len(files), + "pending_files": len(files_to_process), + "skipped_duplicates": skipped_duplicates, + } except HTTPException: raise except Exception as e: diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py 
b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index f310bb03e..cd5537927 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -537,6 +537,298 @@ async def _process_file_upload( raise +@celery_app.task(name="process_file_upload_with_document", bind=True) +def process_file_upload_with_document_task( + self, + document_id: int, + temp_path: str, + filename: str, + search_space_id: int, + user_id: str, +): + """ + Celery task to process uploaded file with existing pending document. + + This task is used by the 2-phase document upload flow: + - Phase 1 (API): Creates pending document (visible in UI immediately) + - Phase 2 (this task): Updates document status: pending → processing → ready/failed + + Args: + document_id: ID of the pending document created in Phase 1 + temp_path: Path to the uploaded file + filename: Original filename + search_space_id: ID of the search space + user_id: ID of the user + """ + import asyncio + import os + import traceback + + logger.info( + f"[process_file_upload_with_document] Task started - document_id: {document_id}, " + f"file: {filename}, search_space_id: {search_space_id}" + ) + + # Check if file exists and is accessible + if not os.path.exists(temp_path): + logger.error( + f"[process_file_upload_with_document] File does not exist: {temp_path}. " + "The temp file may have been cleaned up before the task ran." 
+ ) + # Mark document as failed since file is missing + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete( + _mark_document_failed( + document_id, + "File not found - temp file may have been cleaned up", + ) + ) + finally: + loop.close() + return + + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + loop.run_until_complete( + _process_file_with_document( + document_id, temp_path, filename, search_space_id, user_id + ) + ) + logger.info( + f"[process_file_upload_with_document] Task completed successfully for: {filename}" + ) + except Exception as e: + logger.error( + f"[process_file_upload_with_document] Task failed for {filename}: {e}\n" + f"Traceback:\n{traceback.format_exc()}" + ) + raise + finally: + loop.close() + + +async def _mark_document_failed(document_id: int, reason: str): + """Mark a document as failed when task cannot proceed.""" + from app.db import Document, DocumentStatus + from app.tasks.document_processors.base import get_current_timestamp + + async with get_celery_session_maker()() as session: + document = await session.get(Document, document_id) + if document: + document.status = DocumentStatus.failed(reason) + document.updated_at = get_current_timestamp() + await session.commit() + logger.info(f"Marked document {document_id} as failed: {reason}") + + +async def _process_file_with_document( + document_id: int, + temp_path: str, + filename: str, + search_space_id: int, + user_id: str, +): + """ + Process file and update existing pending document status. 
+ + This function implements Phase 2 of the 2-phase document upload: + - Sets document status to 'processing' (shows spinner in UI) + - Processes the file (parsing, embedding, chunking) + - Updates document to 'ready' on success or 'failed' on error + """ + import os + + from app.db import Document, DocumentStatus + from app.tasks.document_processors.base import get_current_timestamp + from app.tasks.document_processors.file_processors import ( + process_file_in_background_with_document, + ) + + logger.info( + f"[_process_file_with_document] Starting async processing for: {filename}" + ) + + async with get_celery_session_maker()() as session: + logger.info( + f"[_process_file_with_document] Database session created for: {filename}" + ) + task_logger = TaskLoggingService(session, search_space_id) + + # Get the document + document = await session.get(Document, document_id) + if not document: + logger.error(f"Document {document_id} not found") + return + + # Get file size for notification metadata + try: + file_size = os.path.getsize(temp_path) + logger.info(f"[_process_file_with_document] File size: {file_size} bytes") + except Exception as e: + logger.warning(f"[_process_file_with_document] Could not get file size: {e}") + file_size = None + + # Create notification for document processing + logger.info(f"[_process_file_with_document] Creating notification for: {filename}") + notification = ( + await NotificationService.document_processing.notify_processing_started( + session=session, + user_id=UUID(user_id), + document_type="FILE", + document_name=filename, + search_space_id=search_space_id, + file_size=file_size, + ) + ) + + log_entry = await task_logger.log_task_start( + task_name="process_file_upload_with_document", + source="document_processor", + message=f"Starting file processing for: {filename} (document_id: {document_id})", + metadata={ + "document_type": "FILE", + "document_id": document_id, + "filename": filename, + "file_path": temp_path, + "user_id": 
user_id, + }, + ) + + try: + # Set status to PROCESSING (shows spinner in UI via ElectricSQL) + document.status = DocumentStatus.processing() + await session.commit() + logger.info( + f"[_process_file_with_document] Document {document_id} status set to 'processing'" + ) + + # Process the file and update document + result = await process_file_in_background_with_document( + document=document, + file_path=temp_path, + filename=filename, + search_space_id=search_space_id, + user_id=user_id, + session=session, + task_logger=task_logger, + log_entry=log_entry, + notification=notification, + ) + + # Update notification on success + if result: + await ( + NotificationService.document_processing.notify_processing_completed( + session=session, + notification=notification, + document_id=result.id, + chunks_count=None, + ) + ) + logger.info( + f"[_process_file_with_document] Successfully processed document {document_id}" + ) + else: + # Duplicate detected - mark as failed + document.status = DocumentStatus.failed("Duplicate content detected") + document.updated_at = get_current_timestamp() + await session.commit() + await ( + NotificationService.document_processing.notify_processing_completed( + session=session, + notification=notification, + error_message="Document already exists (duplicate)", + ) + ) + + except Exception as e: + # Import here to avoid circular dependencies + from fastapi import HTTPException + + from app.services.page_limit_service import PageLimitExceededError + + # Check if this is a page limit error + page_limit_error: PageLimitExceededError | None = None + if isinstance(e, PageLimitExceededError): + page_limit_error = e + elif ( + isinstance(e, HTTPException) + and e.__cause__ + and isinstance(e.__cause__, PageLimitExceededError) + ): + page_limit_error = e.__cause__ + + # Mark document as failed (shows error in UI via ElectricSQL) + error_message = str(e)[:500] + document.status = DocumentStatus.failed(error_message) + document.updated_at = 
get_current_timestamp() + await session.commit() + logger.info( + f"[_process_file_with_document] Document {document_id} marked as failed: {error_message[:100]}" + ) + + # Handle page limit errors with dedicated notification + if page_limit_error is not None: + try: + await session.refresh(notification) + await NotificationService.document_processing.notify_processing_completed( + session=session, + notification=notification, + error_message="Page limit exceeded", + ) + await NotificationService.page_limit.notify_page_limit_exceeded( + session=session, + user_id=UUID(user_id), + document_name=filename, + document_type="FILE", + search_space_id=search_space_id, + pages_used=page_limit_error.pages_used, + pages_limit=page_limit_error.pages_limit, + pages_to_add=page_limit_error.pages_to_add, + ) + except Exception as notif_error: + logger.error( + f"Failed to create page limit notification: {notif_error!s}" + ) + else: + # Update notification on failure + try: + await session.refresh(notification) + await NotificationService.document_processing.notify_processing_completed( + session=session, + notification=notification, + error_message=str(e)[:100], + ) + except Exception as notif_error: + logger.error( + f"Failed to update notification on failure: {notif_error!s}" + ) + + await task_logger.log_task_failure( + log_entry, + error_message[:100], + str(e), + {"error_type": type(e).__name__, "document_id": document_id}, + ) + logger.error(f"Error processing file {filename}: {e!s}") + raise + + finally: + # Clean up temp file + if os.path.exists(temp_path): + try: + os.unlink(temp_path) + logger.info(f"[_process_file_with_document] Cleaned up temp file: {temp_path}") + except Exception as cleanup_error: + logger.warning( + f"[_process_file_with_document] Failed to clean up temp file: {cleanup_error}" + ) + + @celery_app.task(name="process_circleback_meeting", bind=True) def process_circleback_meeting_task( self, diff --git 
a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index 4433cb11e..e14dc3f42 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -33,6 +33,7 @@ from .base import ( check_document_by_unique_identifier, check_duplicate_document, get_current_timestamp, + safe_set_chunks, ) from .markdown_processor import add_received_markdown_file_document @@ -1612,3 +1613,316 @@ async def process_file_in_background( logging.error(f"Error processing file in background: {error_message}") raise # Re-raise so the wrapper can also handle it + + +async def process_file_in_background_with_document( + document: Document, + file_path: str, + filename: str, + search_space_id: int, + user_id: str, + session: AsyncSession, + task_logger: TaskLoggingService, + log_entry: Log, + connector: dict | None = None, + notification: Notification | None = None, +) -> Document | None: + """ + Process file and update existing pending document (2-phase pattern). + + This function is Phase 2 of the real-time document status updates: + - Phase 1 (API): Created document with pending status + - Phase 2 (this): Process file and update document to ready/failed + + The document already exists with pending status. This function: + 1. Parses the file content (markdown, audio, or ETL services) + 2. Updates the document with content, embeddings, and chunks + 3. 
Sets status to 'ready' on success + + Args: + document: Existing document with pending status + file_path: Path to the uploaded file + filename: Original filename + search_space_id: ID of the search space + user_id: ID of the user + session: Database session + task_logger: Task logging service + log_entry: Log entry for this task + connector: Optional connector info for Google Drive files + notification: Optional notification for progress updates + + Returns: + Updated Document object if successful, None if duplicate content detected + """ + import os + + from app.config import config as app_config + from app.services.llm_service import get_user_long_context_llm + from app.utils.blocknote_converter import convert_markdown_to_blocknote + + try: + markdown_content = None + etl_service = None + + # ===== STEP 1: Parse file content based on type ===== + + # Check if the file is a markdown or text file + if filename.lower().endswith((".md", ".markdown", ".txt")): + # Update notification: parsing stage + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, notification, stage="parsing", stage_message="Reading file" + ) + + await task_logger.log_task_progress( + log_entry, + f"Processing markdown/text file: {filename}", + {"file_type": "markdown", "processing_stage": "reading_file"}, + ) + + # Read markdown content directly + with open(file_path, encoding="utf-8") as f: + markdown_content = f.read() + etl_service = "MARKDOWN" + + # Clean up temp file + with contextlib.suppress(Exception): + os.unlink(file_path) + + # Check if the file is an audio file + elif filename.lower().endswith( + (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm") + ): + # Update notification: parsing stage (transcription) + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, notification, stage="parsing", stage_message="Transcribing audio" + ) + + await task_logger.log_task_progress( + 
log_entry, + f"Processing audio file for transcription: {filename}", + {"file_type": "audio", "processing_stage": "starting_transcription"}, + ) + + # Transcribe audio + stt_service_type = ( + "local" + if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") + else "external" + ) + + if stt_service_type == "local": + from app.services.stt_service import stt_service + + result = stt_service.transcribe_file(file_path) + transcribed_text = result.get("text", "") + if not transcribed_text: + raise ValueError("Transcription returned empty text") + markdown_content = f"# Transcription of {filename}\n\n{transcribed_text}" + else: + with open(file_path, "rb") as audio_file: + transcription_kwargs = { + "model": app_config.STT_SERVICE, + "file": audio_file, + "api_key": app_config.STT_SERVICE_API_KEY, + } + if app_config.STT_SERVICE_API_BASE: + transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE + transcription_response = await atranscription(**transcription_kwargs) + transcribed_text = transcription_response.get("text", "") + if not transcribed_text: + raise ValueError("Transcription returned empty text") + markdown_content = f"# Transcription of {filename}\n\n{transcribed_text}" + + etl_service = "AUDIO_TRANSCRIPTION" + # Clean up temp file + with contextlib.suppress(Exception): + os.unlink(file_path) + + else: + # Document files - use ETL service + from app.services.page_limit_service import PageLimitExceededError, PageLimitService + + page_limit_service = PageLimitService(session) + + # Estimate page count + try: + estimated_pages = page_limit_service.estimate_pages_before_processing(file_path) + except Exception: + file_size = os.path.getsize(file_path) + estimated_pages = max(1, file_size // (80 * 1024)) + + # Check page limit + await page_limit_service.check_page_limit(user_id, estimated_pages) + + if app_config.ETL_SERVICE == "UNSTRUCTURED": + if notification: + await 
NotificationService.document_processing.notify_processing_progress( + session, notification, stage="parsing", stage_message="Extracting content" + ) + + from langchain_unstructured import UnstructuredLoader + + loader = UnstructuredLoader( + file_path, mode="elements", post_processors=[], languages=["eng"], + include_orig_elements=False, include_metadata=False, strategy="auto" + ) + docs = await loader.aload() + markdown_content = await convert_document_to_markdown(docs) + actual_pages = page_limit_service.estimate_pages_from_elements(docs) + final_page_count = max(estimated_pages, actual_pages) + etl_service = "UNSTRUCTURED" + + # Update page usage + await page_limit_service.update_page_usage(user_id, final_page_count, allow_exceed=True) + + elif app_config.ETL_SERVICE == "LLAMACLOUD": + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, notification, stage="parsing", stage_message="Extracting content" + ) + + result = await parse_with_llamacloud_retry( + file_path=file_path, estimated_pages=estimated_pages, + task_logger=task_logger, log_entry=log_entry + ) + markdown_documents = await result.aget_markdown_documents(split_by_page=False) + if not markdown_documents: + raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}") + markdown_content = markdown_documents[0].text + etl_service = "LLAMACLOUD" + + # Update page usage + await page_limit_service.update_page_usage(user_id, estimated_pages, allow_exceed=True) + + elif app_config.ETL_SERVICE == "DOCLING": + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, notification, stage="parsing", stage_message="Extracting content" + ) + + # Suppress logging during Docling import + getLogger("docling.pipeline.base_pipeline").setLevel(ERROR) + getLogger("docling.document_converter").setLevel(ERROR) + getLogger("docling_core.transforms.chunker.hierarchical_chunker").setLevel(ERROR) + + from 
docling.document_converter import DocumentConverter + + converter = DocumentConverter() + result = converter.convert(file_path) + markdown_content = result.document.export_to_markdown() + etl_service = "DOCLING" + + # Update page usage + await page_limit_service.update_page_usage(user_id, estimated_pages, allow_exceed=True) + + else: + raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") + + # Clean up temp file + with contextlib.suppress(Exception): + os.unlink(file_path) + + if not markdown_content: + raise RuntimeError(f"Failed to extract content from file: {filename}") + + # ===== STEP 2: Check for duplicate content ===== + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_by_content = await check_duplicate_document(session, content_hash) + if existing_by_content and existing_by_content.id != document.id: + # Duplicate content found - mark this document as failed + logging.info( + f"Duplicate content detected for {filename}, " + f"matches document {existing_by_content.id}" + ) + return None + + # ===== STEP 3: Generate embeddings and chunks ===== + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, notification, stage="chunking" + ) + + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + + if user_llm: + document_metadata = { + "file_name": filename, + "etl_service": etl_service, + "document_type": "File Document", + } + summary_content, summary_embedding = await generate_document_summary( + markdown_content, user_llm, document_metadata + ) + else: + # Fallback: use truncated content as summary + summary_content = markdown_content[:4000] + from app.config import config + + summary_embedding = config.embedding_model_instance.embed(summary_content) + + chunks = await create_document_chunks(markdown_content) + + # Convert to BlockNote for editing + blocknote_json = await convert_markdown_to_blocknote(markdown_content) + + # ===== STEP 
4: Update document to READY ===== + from sqlalchemy.orm.attributes import flag_modified + + document.title = filename + document.content = summary_content + document.content_hash = content_hash + document.embedding = summary_embedding + document.document_metadata = { + "FILE_NAME": filename, + "ETL_SERVICE": etl_service or "UNKNOWN", + **(document.document_metadata or {}), + } + flag_modified(document, "document_metadata") + + # Use safe_set_chunks to avoid async issues + safe_set_chunks(document, chunks) + + document.blocknote_document = blocknote_json + document.content_needs_reindexing = False + document.updated_at = get_current_timestamp() + document.status = DocumentStatus.ready() # Shows checkmark in UI + + await session.commit() + await session.refresh(document) + + await task_logger.log_task_success( + log_entry, + f"Successfully processed file: {filename}", + { + "document_id": document.id, + "content_hash": content_hash, + "file_type": etl_service, + "chunks_count": len(chunks), + }, + ) + + return document + + except Exception as e: + await session.rollback() + + from app.services.page_limit_service import PageLimitExceededError + + if isinstance(e, PageLimitExceededError): + error_message = str(e) + elif isinstance(e, HTTPException) and "page limit" in str(e.detail).lower(): + error_message = str(e.detail) + else: + error_message = f"Failed to process file: {filename}" + + await task_logger.log_task_failure( + log_entry, + error_message, + str(e), + {"error_type": type(e).__name__, "filename": filename, "document_id": document.id}, + ) + logging.error(f"Error processing file with document: {error_message}") + raise From 00a617ef179cba71a9dad23f8e2d1fdb4493aea6 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 05:31:45 +0530 Subject: [PATCH 32/36] feat: enhance stale notification cleanup task to mark associated documents as failed --- .../stale_notification_cleanup_task.py | 142 
+++++++++++++++--- 1 file changed, 124 insertions(+), 18 deletions(-) diff --git a/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py b/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py index 9041655b0..ff7a11645 100644 --- a/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py +++ b/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py @@ -4,33 +4,41 @@ This task runs periodically (every 5 minutes by default) to find notifications that are stuck in "in_progress" status but don't have an active Redis heartbeat key. These are marked as "failed" to prevent the frontend from showing a perpetual "syncing" state. +Additionally, it cleans up documents stuck in pending/processing state that belong +to connectors with stale notifications. + Detection mechanism: - Active indexing tasks set a Redis key with TTL (2 minutes) as a heartbeat - If the task crashes, the Redis key expires automatically - This cleanup task checks for in-progress notifications without a Redis heartbeat key - Such notifications are marked as failed with O(1) batch UPDATE +- Documents with pending/processing status for those connectors are also marked as failed """ +import contextlib import json import logging import os from datetime import UTC, datetime import redis -from sqlalchemy import and_, text +from sqlalchemy import and_, or_, text from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine from sqlalchemy.future import select from sqlalchemy.pool import NullPool from app.celery_app import celery_app from app.config import config -from app.db import Notification +from app.db import Document, DocumentStatus, Notification logger = logging.getLogger(__name__) # Redis client for checking heartbeats _redis_client: redis.Redis | None = None +# Error message shown to users when sync is interrupted +STALE_SYNC_ERROR_MESSAGE = "Sync was interrupted unexpectedly. Please retry." 
+ def get_redis_client() -> redis.Redis: """Get or create Redis client for heartbeat checking.""" @@ -70,6 +78,7 @@ def cleanup_stale_indexing_notifications_task(): - Do NOT have a corresponding Redis heartbeat key (meaning task crashed) And marks them as failed with O(1) batch UPDATE. + Also marks associated pending/processing documents as failed. """ import asyncio @@ -86,15 +95,20 @@ async def _cleanup_stale_notifications(): """Find and mark stale connector indexing notifications as failed. Uses Redis TTL-based detection: - 1. Find all in-progress notifications + 1. Find all in-progress notifications with their connector_id 2. Check which ones are missing their Redis heartbeat key 3. Mark those as failed with O(1) batch UPDATE using JSONB || operator + 4. Mark associated documents (pending/processing) as failed """ async with get_celery_session_maker()() as session: try: # Find all in-progress connector indexing notifications + # Fetch full metadata to properly extract connector_id result = await session.execute( - select(Notification.id).where( + select( + Notification.id, + Notification.notification_metadata, + ).where( and_( Notification.type == "connector_indexing", Notification.notification_metadata["status"].astext @@ -102,24 +116,37 @@ async def _cleanup_stale_notifications(): ) ) ) - in_progress_ids = [row[0] for row in result.fetchall()] + in_progress_rows = result.fetchall() - if not in_progress_ids: + if not in_progress_rows: logger.debug("No in-progress connector indexing notifications found") return # Check which ones are missing heartbeat keys in Redis redis_client = get_redis_client() stale_notification_ids = [] + stale_connector_ids = [] - for notification_id in in_progress_ids: + for row in in_progress_rows: + notification_id = row[0] + metadata = row[1] # Full metadata dict heartbeat_key = _get_heartbeat_key(notification_id) if not redis_client.exists(heartbeat_key): stale_notification_ids.append(notification_id) + # Extract connector_id from 
metadata dict for document cleanup + if metadata and isinstance(metadata, dict): + connector_id = metadata.get("connector_id") + logger.debug( + f"Notification {notification_id} metadata: {metadata}, " + f"connector_id: {connector_id}" + ) + if connector_id is not None: + with contextlib.suppress(ValueError, TypeError): + stale_connector_ids.append(int(connector_id)) if not stale_notification_ids: logger.debug( - f"All {len(in_progress_ids)} in-progress notifications have active Redis heartbeats" + f"All {len(in_progress_rows)} in-progress notifications have active Redis heartbeats" ) return @@ -127,18 +154,17 @@ async def _cleanup_stale_notifications(): f"Found {len(stale_notification_ids)} stale connector indexing notifications " f"(no Redis heartbeat key): {stale_notification_ids}" ) - - # O(1) Batch UPDATE using JSONB || operator - # This merges the update data into existing notification_metadata - # Also updates title and message for proper UI display - error_message = ( - "Something went wrong while syncing your content. Please retry." 
+ logger.info( + f"Connector IDs for document cleanup: {stale_connector_ids}" ) + # O(1) Batch UPDATE notifications using JSONB || operator + # This merges the update data into existing notification_metadata + # Also updates title and message for proper UI display update_data = { "status": "failed", "completed_at": datetime.now(UTC).isoformat(), - "error_message": error_message, + "error_message": STALE_SYNC_ERROR_MESSAGE, "sync_stage": "failed", } @@ -152,16 +178,96 @@ async def _cleanup_stale_notifications(): """), { "update_json": json.dumps(update_data), - "display_message": f"{error_message}", + "display_message": STALE_SYNC_ERROR_MESSAGE, "ids": stale_notification_ids, }, ) - await session.commit() logger.info( - f"Successfully marked {len(stale_notification_ids)} stale notifications as failed (batch UPDATE)" + f"Successfully marked {len(stale_notification_ids)} stale notifications as failed" ) + # ===== Clean up stuck documents for stale connectors ===== + if stale_connector_ids: + await _cleanup_stuck_documents(session, stale_connector_ids) + + await session.commit() + except Exception as e: logger.error(f"Error cleaning up stale notifications: {e!s}", exc_info=True) await session.rollback() + + +async def _cleanup_stuck_documents(session, connector_ids: list[int]): + """ + Mark documents stuck in pending/processing state as failed for given connectors. + + This ensures that when a connector sync is interrupted, all partially-processed + documents are marked with a clear error state instead of being stuck indefinitely. 
+ + Args: + session: Database session + connector_ids: List of connector IDs whose documents should be cleaned up + """ + if not connector_ids: + return + + try: + # Count documents that will be affected (for logging) + count_result = await session.execute( + select(Document.id).where( + and_( + Document.connector_id.in_(connector_ids), + or_( + Document.status["state"].astext == DocumentStatus.PENDING, + Document.status["state"].astext == DocumentStatus.PROCESSING, + ), + ) + ) + ) + stuck_doc_ids = [row[0] for row in count_result.fetchall()] + + if not stuck_doc_ids: + logger.debug(f"No stuck documents found for connector IDs: {connector_ids}") + return + + logger.warning( + f"Found {len(stuck_doc_ids)} stuck documents (pending/processing) " + f"for connector IDs {connector_ids}: {stuck_doc_ids[:20]}..." # Log first 20 + ) + + # O(1) Batch UPDATE: Mark all stuck documents as failed using JSONB + # The error message matches what we show in notifications + failed_status = DocumentStatus.failed(STALE_SYNC_ERROR_MESSAGE) + + await session.execute( + text(""" + UPDATE documents + SET status = CAST(:failed_status AS jsonb), + updated_at = :now + WHERE connector_id = ANY(:connector_ids) + AND ( + status->>'state' = :pending_state + OR status->>'state' = :processing_state + ) + """), + { + "failed_status": json.dumps(failed_status), + "now": datetime.now(UTC), + "connector_ids": connector_ids, + "pending_state": DocumentStatus.PENDING, + "processing_state": DocumentStatus.PROCESSING, + }, + ) + + logger.info( + f"Successfully marked {len(stuck_doc_ids)} stuck documents as failed " + f"for connector IDs: {connector_ids}" + ) + + except Exception as e: + logger.error( + f"Error cleaning up stuck documents for connectors {connector_ids}: {e!s}", + exc_info=True, + ) + # Don't raise - let the notification cleanup continue even if document cleanup fails From aa66928154aacb1e2f8a0fdc4cdc4679d9d2d0b0 Mon Sep 17 00:00:00 2001 From: Anish Sarkar 
<104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 05:35:15 +0530 Subject: [PATCH 33/36] chore: ran linting --- .../versions/92_add_document_status_column.py | 3 - .../connectors/composio_gmail_connector.py | 104 +-- .../composio_google_calendar_connector.py | 99 ++- .../composio_google_drive_connector.py | 164 ++-- surfsense_backend/app/db.py | 29 +- .../app/routes/documents_routes.py | 24 +- surfsense_backend/app/schemas/documents.py | 9 +- .../app/services/connector_service.py | 12 +- .../app/tasks/celery_tasks/document_tasks.py | 18 +- .../stale_notification_cleanup_task.py | 4 +- .../connector_indexers/airtable_indexer.py | 105 ++- .../app/tasks/connector_indexers/base.py | 13 +- .../connector_indexers/bookstack_indexer.py | 116 +-- .../connector_indexers/clickup_indexer.py | 124 +-- .../connector_indexers/confluence_indexer.py | 96 ++- .../connector_indexers/discord_indexer.py | 100 ++- .../elasticsearch_indexer.py | 90 +- .../connector_indexers/github_indexer.py | 74 +- .../google_calendar_indexer.py | 119 +-- .../google_drive_indexer.py | 67 +- .../google_gmail_indexer.py | 101 ++- .../tasks/connector_indexers/jira_indexer.py | 94 ++- .../connector_indexers/linear_indexer.py | 107 ++- .../tasks/connector_indexers/luma_indexer.py | 148 ++-- .../connector_indexers/notion_indexer.py | 73 +- .../connector_indexers/obsidian_indexer.py | 96 ++- .../tasks/connector_indexers/slack_indexer.py | 86 +- .../tasks/connector_indexers/teams_indexer.py | 98 ++- .../connector_indexers/webcrawler_indexer.py | 67 +- .../app/tasks/document_processors/base.py | 13 +- .../circleback_processor.py | 18 +- .../document_processors/file_processors.py | 124 ++- .../document_processors/youtube_processor.py | 24 +- .../(manage)/components/DocumentTypeIcon.tsx | 4 +- .../(manage)/components/DocumentsFilters.tsx | 192 +++-- .../components/DocumentsTableShell.tsx | 82 +- .../(manage)/components/RowActions.tsx | 34 +- .../documents/(manage)/page.tsx | 68 +- 
.../connector-dialog.atoms.ts | 1 - .../assistant-ui/connector-popup.tsx | 4 +- .../hooks/use-connector-dialog.ts | 36 +- .../components/theme/theme-toggle.tsx | 787 +++++++++--------- surfsense_web/hooks/use-documents.ts | 31 +- surfsense_web/lib/electric/client.ts | 25 +- 44 files changed, 2025 insertions(+), 1658 deletions(-) diff --git a/surfsense_backend/alembic/versions/92_add_document_status_column.py b/surfsense_backend/alembic/versions/92_add_document_status_column.py index 550faa3c3..8204096aa 100644 --- a/surfsense_backend/alembic/versions/92_add_document_status_column.py +++ b/surfsense_backend/alembic/versions/92_add_document_status_column.py @@ -13,8 +13,6 @@ Changes: from collections.abc import Sequence -import sqlalchemy as sa - from alembic import op # revision identifiers, used by Alembic. @@ -77,4 +75,3 @@ def downgrade() -> None: END$$; """ ) - diff --git a/surfsense_backend/app/connectors/composio_gmail_connector.py b/surfsense_backend/app/connectors/composio_gmail_connector.py index 870053c7f..4764a0a41 100644 --- a/surfsense_backend/app/connectors/composio_gmail_connector.py +++ b/surfsense_backend/app/connectors/composio_gmail_connector.py @@ -285,24 +285,28 @@ async def _analyze_gmail_messages_phase1( if existing_document: if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue # Queue existing document for update (will be set to processing in Phase 2) - messages_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'markdown_content': markdown_content, - 'content_hash': content_hash, - 'message_id': message_id, - 'thread_id': thread_id, - 'subject': subject, - 'sender': sender, - 'date_str': date_str, - 
'label_ids': label_ids, - }) + messages_to_process.append( + { + "document": existing_document, + "is_new": False, + "markdown_content": markdown_content, + "content_hash": content_hash, + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date_str": date_str, + "label_ids": label_ids, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -350,18 +354,20 @@ async def _analyze_gmail_messages_phase1( ) session.add(document) - messages_to_process.append({ - 'document': document, - 'is_new': True, - 'markdown_content': markdown_content, - 'content_hash': content_hash, - 'message_id': message_id, - 'thread_id': thread_id, - 'subject': subject, - 'sender': sender, - 'date_str': date_str, - 'label_ids': label_ids, - }) + messages_to_process.append( + { + "document": document, + "is_new": True, + "markdown_content": markdown_content, + "content_hash": content_hash, + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date_str": date_str, + "label_ids": label_ids, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True) @@ -398,7 +404,7 @@ async def _process_gmail_messages_phase2( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -411,37 +417,35 @@ async def _process_gmail_messages_phase2( if user_llm: document_metadata_for_summary = { - "message_id": item['message_id'], - "thread_id": item['thread_id'], - "subject": item['subject'], - "sender": item['sender'], + "message_id": item["message_id"], + "thread_id": item["thread_id"], + "subject": item["subject"], + "sender": item["sender"], "document_type": "Gmail Message (Composio)", } summary_content, summary_embedding = await generate_document_summary( - 
item['markdown_content'], user_llm, document_metadata_for_summary + item["markdown_content"], user_llm, document_metadata_for_summary ) else: - summary_content = ( - f"Gmail: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}" - ) + summary_content = f"Gmail: {item['subject']}\n\nFrom: {item['sender']}\nDate: {item['date_str']}" summary_embedding = config.embedding_model_instance.embed( summary_content ) - chunks = await create_document_chunks(item['markdown_content']) + chunks = await create_document_chunks(item["markdown_content"]) # Update document to READY with actual content - document.title = item['subject'] + document.title = item["subject"] document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = { - "message_id": item['message_id'], - "thread_id": item['thread_id'], - "subject": item['subject'], - "sender": item['sender'], - "date": item['date_str'], - "labels": item['label_ids'], + "message_id": item["message_id"], + "thread_id": item["thread_id"], + "subject": item["subject"], + "sender": item["sender"], + "date": item["date_str"], + "labels": item["label_ids"], "connector_id": connector_id, "source": "composio", } @@ -465,7 +469,9 @@ async def _process_gmail_messages_phase2( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue @@ -571,7 +577,9 @@ async def index_composio_gmail( ) all_messages.extend(messages) - logger.info(f"Fetched {len(messages)} messages (total: {len(all_messages)})") + logger.info( + f"Fetched {len(messages)} messages (total: {len(all_messages)})" + ) if not next_token or len(messages) < current_batch_size: break 
@@ -616,7 +624,7 @@ async def index_composio_gmail( ) # Commit all pending documents - they all appear in UI now - new_documents_count = len([m for m in messages_to_process if m['is_new']]) + new_documents_count = len([m for m in messages_to_process if m["is_new"]]) if new_documents_count > 0: logger.info(f"Phase 1: Committing {new_documents_count} pending documents") await session.commit() @@ -645,9 +653,7 @@ async def index_composio_gmail( await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit to ensure all documents are persisted - logger.info( - f"Final commit: Total {documents_indexed} Gmail messages processed" - ) + logger.info(f"Final commit: Total {documents_indexed} Gmail messages processed") try: await session.commit() logger.info( diff --git a/surfsense_backend/app/connectors/composio_google_calendar_connector.py b/surfsense_backend/app/connectors/composio_google_calendar_connector.py index dc9c18c99..6593721a1 100644 --- a/surfsense_backend/app/connectors/composio_google_calendar_connector.py +++ b/surfsense_backend/app/connectors/composio_google_calendar_connector.py @@ -268,7 +268,9 @@ async def index_composio_google_calendar( documents_indexed = 0 documents_skipped = 0 documents_failed = 0 # Track events that failed processing - duplicate_content_count = 0 # Track events skipped due to duplicate content_hash + duplicate_content_count = ( + 0 # Track events skipped due to duplicate content_hash + ) last_heartbeat_time = time.time() # ======================================================================= @@ -317,23 +319,27 @@ async def index_composio_google_calendar( if existing_document: if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = 
DocumentStatus.ready() documents_skipped += 1 continue # Queue existing document for update (will be set to processing in Phase 2) - events_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'markdown_content': markdown_content, - 'content_hash': content_hash, - 'event_id': event_id, - 'summary': summary, - 'start_time': start_time, - 'end_time': end_time, - 'location': location, - }) + events_to_process.append( + { + "document": existing_document, + "is_new": False, + "markdown_content": markdown_content, + "content_hash": content_hash, + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "end_time": end_time, + "location": location, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -383,17 +389,19 @@ async def index_composio_google_calendar( session.add(document) new_documents_created = True - events_to_process.append({ - 'document': document, - 'is_new': True, - 'markdown_content': markdown_content, - 'content_hash': content_hash, - 'event_id': event_id, - 'summary': summary, - 'start_time': start_time, - 'end_time': end_time, - 'location': location, - }) + events_to_process.append( + { + "document": document, + "is_new": True, + "markdown_content": markdown_content, + "content_hash": content_hash, + "event_id": event_id, + "summary": summary, + "start_time": start_time, + "end_time": end_time, + "location": location, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True) @@ -402,7 +410,9 @@ async def index_composio_google_calendar( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -419,7 +429,7 @@ 
async def index_composio_google_calendar( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -432,35 +442,40 @@ async def index_composio_google_calendar( if user_llm: document_metadata_for_summary = { - "event_id": item['event_id'], - "summary": item['summary'], - "start_time": item['start_time'], + "event_id": item["event_id"], + "summary": item["summary"], + "start_time": item["start_time"], "document_type": "Google Calendar Event (Composio)", } - summary_content, summary_embedding = await generate_document_summary( - item['markdown_content'], user_llm, document_metadata_for_summary + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["markdown_content"], + user_llm, + document_metadata_for_summary, ) else: summary_content = f"Calendar: {item['summary']}\n\nStart: {item['start_time']}\nEnd: {item['end_time']}" - if item['location']: + if item["location"]: summary_content += f"\nLocation: {item['location']}" summary_embedding = config.embedding_model_instance.embed( summary_content ) - chunks = await create_document_chunks(item['markdown_content']) + chunks = await create_document_chunks(item["markdown_content"]) # Update document to READY with actual content - document.title = item['summary'] + document.title = item["summary"] document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = { - "event_id": item['event_id'], - "summary": item['summary'], - "start_time": item['start_time'], - "end_time": item['end_time'], - "location": item['location'], + "event_id": item["event_id"], + "summary": item["summary"], + "start_time": item["start_time"], + "end_time": 
item["end_time"], + "location": item["location"], "connector_id": connector_id, "source": "composio", } @@ -484,7 +499,9 @@ async def index_composio_google_calendar( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue diff --git a/surfsense_backend/app/connectors/composio_google_drive_connector.py b/surfsense_backend/app/connectors/composio_google_drive_connector.py index 26cfd3020..4ccd195e6 100644 --- a/surfsense_backend/app/connectors/composio_google_drive_connector.py +++ b/surfsense_backend/app/connectors/composio_google_drive_connector.py @@ -938,13 +938,15 @@ async def _index_composio_drive_delta_sync( if existing_document: # Queue existing document for update - files_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'file_id': file_id, - 'file_name': file_name, - 'mime_type': mime_type, - }) + files_to_process.append( + { + "document": existing_document, + "is_new": False, + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + } + ) continue # Create new document with PENDING status @@ -974,13 +976,15 @@ async def _index_composio_drive_delta_sync( session.add(document) new_documents_created = True - files_to_process.append({ - 'document': document, - 'is_new': True, - 'file_id': file_id, - 'file_name': file_name, - 'mime_type': mime_type, - }) + files_to_process.append( + { + "document": document, + "is_new": True, + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for change: {e!s}", exc_info=True) @@ -989,7 +993,9 @@ async def _index_composio_drive_delta_sync( # Commit all pending documents - they all appear in UI now if new_documents_created: - 
logger.info(f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -1005,7 +1011,7 @@ async def _index_composio_drive_delta_sync( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit document.status = DocumentStatus.processing() @@ -1013,11 +1019,13 @@ async def _index_composio_drive_delta_sync( # Get file content content, content_error = await composio_connector.get_drive_file_content( - item['file_id'], original_mime_type=item['mime_type'] + item["file_id"], original_mime_type=item["mime_type"] ) if content_error or not content: - logger.warning(f"Could not get content for file {item['file_name']}: {content_error}") + logger.warning( + f"Could not get content for file {item['file_name']}: {content_error}" + ) markdown_content = f"# {item['file_name']}\n\n" markdown_content += f"**File ID:** {item['file_id']}\n" markdown_content += f"**Type:** {item['mime_type']}\n" @@ -1031,9 +1039,9 @@ async def _index_composio_drive_delta_sync( else: markdown_content = await _process_file_content( content=content, - file_name=item['file_name'], - file_id=item['file_id'], - mime_type=item['mime_type'], + file_name=item["file_name"], + file_id=item["file_id"], + mime_type=item["mime_type"], search_space_id=search_space_id, user_id=user_id, session=session, @@ -1045,14 +1053,14 @@ async def _index_composio_drive_delta_sync( content_hash = generate_content_hash(markdown_content, search_space_id) # For existing documents, check if content changed - if not item['is_new'] and document.content_hash == content_hash: + if not item["is_new"] and document.content_hash == content_hash: if not 
DocumentStatus.is_state(document.status, DocumentStatus.READY): document.status = DocumentStatus.ready() documents_skipped += 1 continue # Check for duplicate content hash (for new documents) - if item['is_new']: + if item["is_new"]: with session.no_autoflush: duplicate_by_content = await check_duplicate_document_by_hash( session, content_hash @@ -1067,13 +1075,15 @@ async def _index_composio_drive_delta_sync( continue # Heavy processing (LLM, embeddings, chunks) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) if user_llm: document_metadata_for_summary = { - "file_id": item['file_id'], - "file_name": item['file_name'], - "mime_type": item['mime_type'], + "file_id": item["file_id"], + "file_name": item["file_name"], + "mime_type": item["mime_type"], "document_type": "Google Drive File (Composio)", } summary_content, summary_embedding = await generate_document_summary( @@ -1081,20 +1091,22 @@ async def _index_composio_drive_delta_sync( ) else: summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}" - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) chunks = await create_document_chunks(markdown_content) # Update document to READY - document.title = item['file_name'] + document.title = item["file_name"] document.content = summary_content document.content_hash = content_hash document.embedding = summary_embedding document.document_metadata = { - "file_id": item['file_id'], - "file_name": item['file_name'], - "FILE_NAME": item['file_name'], - "mime_type": item['mime_type'], + "file_id": item["file_id"], + "file_name": item["file_name"], + "FILE_NAME": item["file_name"], + "mime_type": item["mime_type"], "connector_id": connector_id, "source": "composio", } @@ -1117,7 +1129,9 @@ async def 
_index_composio_drive_delta_sync( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue @@ -1329,13 +1343,15 @@ async def _index_composio_drive_full_scan( if existing_document: # Queue existing document for update (will be set to processing in Phase 2) - files_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'file_id': file_id, - 'file_name': file_name, - 'mime_type': mime_type, - }) + files_to_process.append( + { + "document": existing_document, + "is_new": False, + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + } + ) continue # Create new document with PENDING status (visible in UI immediately) @@ -1365,13 +1381,15 @@ async def _index_composio_drive_full_scan( session.add(document) new_documents_created = True - files_to_process.append({ - 'document': document, - 'is_new': True, - 'file_id': file_id, - 'file_name': file_name, - 'mime_type': mime_type, - }) + files_to_process.append( + { + "document": document, + "is_new": True, + "file_id": file_id, + "file_name": file_name, + "mime_type": mime_type, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for file: {e!s}", exc_info=True) @@ -1380,7 +1398,9 @@ async def _index_composio_drive_full_scan( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -1397,7 +1417,7 @@ async def _index_composio_drive_full_scan( await 
on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -1405,11 +1425,13 @@ async def _index_composio_drive_full_scan( # Get file content (pass mime_type for Google Workspace export handling) content, content_error = await composio_connector.get_drive_file_content( - item['file_id'], original_mime_type=item['mime_type'] + item["file_id"], original_mime_type=item["mime_type"] ) if content_error or not content: - logger.warning(f"Could not get content for file {item['file_name']}: {content_error}") + logger.warning( + f"Could not get content for file {item['file_name']}: {content_error}" + ) markdown_content = f"# {item['file_name']}\n\n" markdown_content += f"**File ID:** {item['file_id']}\n" markdown_content += f"**Type:** {item['mime_type']}\n" @@ -1424,9 +1446,9 @@ async def _index_composio_drive_full_scan( # Process content based on file type markdown_content = await _process_file_content( content=content, - file_name=item['file_name'], - file_id=item['file_id'], - mime_type=item['mime_type'], + file_name=item["file_name"], + file_id=item["file_id"], + mime_type=item["mime_type"], search_space_id=search_space_id, user_id=user_id, session=session, @@ -1438,7 +1460,7 @@ async def _index_composio_drive_full_scan( content_hash = generate_content_hash(markdown_content, search_space_id) # For existing documents, check if content changed - if not item['is_new'] and document.content_hash == content_hash: + if not item["is_new"] and document.content_hash == content_hash: # Ensure status is ready if not DocumentStatus.is_state(document.status, DocumentStatus.READY): document.status = DocumentStatus.ready() @@ -1446,7 +1468,7 @@ async def _index_composio_drive_full_scan( continue # Check for duplicate content hash (for new documents) - if item['is_new']: + if 
item["is_new"]: with session.no_autoflush: duplicate_by_content = await check_duplicate_document_by_hash( session, content_hash @@ -1462,13 +1484,15 @@ async def _index_composio_drive_full_scan( continue # Heavy processing (LLM, embeddings, chunks) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + user_llm = await get_user_long_context_llm( + session, user_id, search_space_id + ) if user_llm: document_metadata_for_summary = { - "file_id": item['file_id'], - "file_name": item['file_name'], - "mime_type": item['mime_type'], + "file_id": item["file_id"], + "file_name": item["file_name"], + "mime_type": item["mime_type"], "document_type": "Google Drive File (Composio)", } summary_content, summary_embedding = await generate_document_summary( @@ -1476,20 +1500,22 @@ async def _index_composio_drive_full_scan( ) else: summary_content = f"Google Drive File: {item['file_name']}\n\nType: {item['mime_type']}" - summary_embedding = config.embedding_model_instance.embed(summary_content) + summary_embedding = config.embedding_model_instance.embed( + summary_content + ) chunks = await create_document_chunks(markdown_content) # Update document to READY with actual content - document.title = item['file_name'] + document.title = item["file_name"] document.content = summary_content document.content_hash = content_hash document.embedding = summary_embedding document.document_metadata = { - "file_id": item['file_id'], - "file_name": item['file_name'], - "FILE_NAME": item['file_name'], - "mime_type": item['mime_type'], + "file_id": item["file_id"], + "file_name": item["file_name"], + "FILE_NAME": item["file_name"], + "mime_type": item["mime_type"], "connector_id": connector_id, "source": "composio", } @@ -1515,7 +1541,9 @@ async def _index_composio_drive_full_scan( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: 
{status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index fb5c711ed..344d83f13 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -103,67 +103,70 @@ class PodcastStatus(str, Enum): class DocumentStatus: """ Helper class for document processing status (stored as JSONB). - + Status values: - {"state": "ready"} - Document is fully processed and searchable - {"state": "pending"} - Document is queued, waiting to be processed - {"state": "processing"} - Document is currently being processed (only 1 at a time) - {"state": "failed", "reason": "..."} - Processing failed with reason - + Usage: document.status = DocumentStatus.pending() document.status = DocumentStatus.processing() document.status = DocumentStatus.ready() document.status = DocumentStatus.failed("LLM rate limit exceeded") """ - + # State constants READY = "ready" PENDING = "pending" PROCESSING = "processing" FAILED = "failed" - + @staticmethod def ready() -> dict: """Return status dict for a ready/searchable document.""" return {"state": DocumentStatus.READY} - + @staticmethod def pending() -> dict: """Return status dict for a document waiting to be processed.""" return {"state": DocumentStatus.PENDING} - + @staticmethod def processing() -> dict: """Return status dict for a document being processed.""" return {"state": DocumentStatus.PROCESSING} - + @staticmethod def failed(reason: str, **extra_details) -> dict: """ Return status dict for a failed document. - + Args: reason: Human-readable failure reason **extra_details: Optional additional details (duplicate_of, error_code, etc.) 
""" - status = {"state": DocumentStatus.FAILED, "reason": reason[:500]} # Truncate long reasons + status = { + "state": DocumentStatus.FAILED, + "reason": reason[:500], + } # Truncate long reasons if extra_details: status.update(extra_details) return status - + @staticmethod def get_state(status: dict | None) -> str | None: """Extract state from status dict, returns None if invalid.""" if status is None: return None return status.get("state") if isinstance(status, dict) else None - + @staticmethod def is_state(status: dict | None, state: str) -> bool: """Check if status matches a given state.""" return DocumentStatus.get_state(status) == state - + @staticmethod def get_failure_reason(status: dict | None) -> str | None: """Extract failure reason from status dict.""" @@ -866,7 +869,7 @@ class Document(BaseModel, TimestampMixin): JSONB, nullable=False, default=DocumentStatus.ready, - server_default=text("'{\"state\": \"ready\"}'::jsonb"), + server_default=text('\'{"state": "ready"}\'::jsonb'), index=True, ) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 00c80dcb5..b20f8cd9c 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -114,11 +114,11 @@ async def create_documents_file_upload( ): """ Upload files as documents with real-time status tracking. - + Implements 2-phase document status updates for real-time UI feedback: - Phase 1: Create all documents with 'pending' status (visible in UI immediately via ElectricSQL) - Phase 2: Celery processes each file: pending → processing → ready/failed - + Requires DOCUMENTS_CREATE permission. 
""" from datetime import datetime @@ -144,7 +144,9 @@ async def create_documents_file_upload( raise HTTPException(status_code=400, detail="No files provided") created_documents: list[Document] = [] - files_to_process: list[tuple[Document, str, str]] = [] # (document, temp_path, filename) + files_to_process: list[ + tuple[Document, str, str] + ] = [] # (document, temp_path, filename) skipped_duplicates = 0 # ===== PHASE 1: Create pending documents for all files ===== @@ -201,7 +203,9 @@ async def create_documents_file_upload( ) session.add(document) created_documents.append(document) - files_to_process.append((document, temp_path, file.filename or "unknown")) + files_to_process.append( + (document, temp_path, file.filename or "unknown") + ) except Exception as e: raise HTTPException( @@ -348,15 +352,15 @@ async def read_documents( created_by_name = None if doc.created_by: created_by_name = doc.created_by.display_name or doc.created_by.email - + # Parse status from JSONB status_data = None - if hasattr(doc, 'status') and doc.status: + if hasattr(doc, "status") and doc.status: status_data = DocumentStatusSchema( state=doc.status.get("state", "ready"), reason=doc.status.get("reason"), ) - + api_documents.append( DocumentRead( id=doc.id, @@ -503,15 +507,15 @@ async def search_documents( created_by_name = None if doc.created_by: created_by_name = doc.created_by.display_name or doc.created_by.email - + # Parse status from JSONB status_data = None - if hasattr(doc, 'status') and doc.status: + if hasattr(doc, "status") and doc.status: status_data = DocumentStatusSchema( state=doc.status.get("state", "ready"), reason=doc.status.get("reason"), ) - + api_documents.append( DocumentRead( id=doc.id, diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py index 7d85d0229..4cedc7d93 100644 --- a/surfsense_backend/app/schemas/documents.py +++ b/surfsense_backend/app/schemas/documents.py @@ -43,6 +43,7 @@ class 
DocumentUpdate(DocumentBase): class DocumentStatusSchema(BaseModel): """Document processing status.""" + state: str # "ready", "processing", "failed" reason: str | None = None @@ -59,8 +60,12 @@ class DocumentRead(BaseModel): updated_at: datetime | None search_space_id: int created_by_id: UUID | None = None # User who created/uploaded this document - created_by_name: str | None = None # Display name or email of the user who created this document - status: DocumentStatusSchema | None = None # Processing status (ready, processing, failed) + created_by_name: str | None = ( + None # Display name or email of the user who created this document + ) + status: DocumentStatusSchema | None = ( + None # Processing status (ready, processing, failed) + ) model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py index 6967902d1..251241e96 100644 --- a/surfsense_backend/app/services/connector_service.py +++ b/surfsense_backend/app/services/connector_service.py @@ -1465,11 +1465,7 @@ class ConnectorService: issue_key = metadata.get("issue_key", "") issue_title = metadata.get("issue_title", "Untitled Issue") status = metadata.get("status", "") - title = ( - f"{issue_key} - {issue_title}" - if issue_key - else issue_title - ) + title = f"{issue_key} - {issue_title}" if issue_key else issue_title if status: title += f" ({status})" return title @@ -2387,11 +2383,7 @@ class ConnectorService: def _title_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: event_name = metadata.get("event_name", "Untitled Event") start_time = metadata.get("start_time", "") - return ( - f"{event_name} ({start_time})" - if start_time - else event_name - ) + return f"{event_name} ({start_time})" if start_time else event_name def _url_fn(_doc_info: dict[str, Any], metadata: dict[str, Any]) -> str: return metadata.get("event_url", "") or "" diff --git 
a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index cd5537927..6dfcbff46 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -548,11 +548,11 @@ def process_file_upload_with_document_task( ): """ Celery task to process uploaded file with existing pending document. - + This task is used by the 2-phase document upload flow: - Phase 1 (API): Creates pending document (visible in UI immediately) - Phase 2 (this task): Updates document status: pending → processing → ready/failed - + Args: document_id: ID of the pending document created in Phase 1 temp_path: Path to the uploaded file @@ -634,7 +634,7 @@ async def _process_file_with_document( ): """ Process file and update existing pending document status. - + This function implements Phase 2 of the 2-phase document upload: - Sets document status to 'processing' (shows spinner in UI) - Processes the file (parsing, embedding, chunking) @@ -669,11 +669,15 @@ async def _process_file_with_document( file_size = os.path.getsize(temp_path) logger.info(f"[_process_file_with_document] File size: {file_size} bytes") except Exception as e: - logger.warning(f"[_process_file_with_document] Could not get file size: {e}") + logger.warning( + f"[_process_file_with_document] Could not get file size: {e}" + ) file_size = None # Create notification for document processing - logger.info(f"[_process_file_with_document] Creating notification for: {filename}") + logger.info( + f"[_process_file_with_document] Creating notification for: {filename}" + ) notification = ( await NotificationService.document_processing.notify_processing_started( session=session, @@ -822,7 +826,9 @@ async def _process_file_with_document( if os.path.exists(temp_path): try: os.unlink(temp_path) - logger.info(f"[_process_file_with_document] Cleaned up temp file: {temp_path}") + logger.info( + 
f"[_process_file_with_document] Cleaned up temp file: {temp_path}" + ) except Exception as cleanup_error: logger.warning( f"[_process_file_with_document] Failed to clean up temp file: {cleanup_error}" diff --git a/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py b/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py index ff7a11645..ef3a30e43 100644 --- a/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py +++ b/surfsense_backend/app/tasks/celery_tasks/stale_notification_cleanup_task.py @@ -154,9 +154,7 @@ async def _cleanup_stale_notifications(): f"Found {len(stale_notification_ids)} stale connector indexing notifications " f"(no Redis heartbeat key): {stale_notification_ids}" ) - logger.info( - f"Connector IDs for document cleanup: {stale_connector_ids}" - ) + logger.info(f"Connector IDs for document cleanup: {stale_connector_ids}") # O(1) Batch UPDATE notifications using JSONB || operator # This merges the update data into existing notification_metadata diff --git a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py index 05a4007ae..46cd069c9 100644 --- a/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/airtable_indexer.py @@ -140,7 +140,9 @@ async def index_airtable_records( log_entry, success_msg, {"bases_count": 0} ) # CRITICAL: Update timestamp even when no bases found so Electric SQL syncs - await update_connector_last_indexed(session, connector, update_last_indexed) + await update_connector_last_indexed( + session, connector, update_last_indexed + ) await session.commit() return 0, None # Return None (not error) when no items found @@ -277,22 +279,28 @@ async def index_airtable_records( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been 
stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): - existing_document.status = DocumentStatus.ready() + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = ( + DocumentStatus.ready() + ) documents_skipped += 1 continue # Queue existing document for update (will be set to processing in Phase 2) - records_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'markdown_content': markdown_content, - 'content_hash': content_hash, - 'record_id': record_id, - 'record': record, - 'base_name': base_name, - 'table_name': table_name, - }) + records_to_process.append( + { + "document": existing_document, + "is_new": False, + "markdown_content": markdown_content, + "content_hash": content_hash, + "record_id": record_id, + "record": record, + "base_name": base_name, + "table_name": table_name, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -339,25 +347,31 @@ async def index_airtable_records( session.add(document) new_documents_created = True - records_to_process.append({ - 'document': document, - 'is_new': True, - 'markdown_content': markdown_content, - 'content_hash': content_hash, - 'record_id': record_id, - 'record': record, - 'base_name': base_name, - 'table_name': table_name, - }) + records_to_process.append( + { + "document": document, + "is_new": True, + "markdown_content": markdown_content, + "content_hash": content_hash, + "record_id": record_id, + "record": record, + "base_name": base_name, + "table_name": table_name, + } + ) except Exception as e: - logger.error(f"Error in Phase 1 for record: {e!s}", exc_info=True) + logger.error( + f"Error in Phase 1 for record: {e!s}", exc_info=True + ) documents_failed += 1 continue # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([r for r in records_to_process if r['is_new']])} pending 
documents") + logger.info( + f"Phase 1: Committing {len([r for r in records_to_process if r['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -374,7 +388,7 @@ async def index_airtable_records( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -387,13 +401,18 @@ async def index_airtable_records( if user_llm: document_metadata_for_summary = { - "record_id": item['record_id'], - "created_time": item['record'].get("CREATED_TIME()", ""), + "record_id": item["record_id"], + "created_time": item["record"].get("CREATED_TIME()", ""), "document_type": "Airtable Record", "connector_type": "Airtable", } - summary_content, summary_embedding = await generate_document_summary( - item['markdown_content'], user_llm, document_metadata_for_summary + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["markdown_content"], + user_llm, + document_metadata_for_summary, ) else: # Fallback to simple summary if no LLM configured @@ -402,18 +421,18 @@ async def index_airtable_records( summary_content ) - chunks = await create_document_chunks(item['markdown_content']) + chunks = await create_document_chunks(item["markdown_content"]) # Update document to READY with actual content - document.title = item['record_id'] + document.title = item["record_id"] document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = { - "record_id": item['record_id'], - "created_time": item['record'].get("CREATED_TIME()", ""), - "base_name": item['base_name'], - "table_name": item['table_name'], + "record_id": item["record_id"], + 
"created_time": item["record"].get("CREATED_TIME()", ""), + "base_name": item["base_name"], + "table_name": item["table_name"], "connector_id": connector_id, } safe_set_chunks(document, chunks) @@ -430,13 +449,17 @@ async def index_airtable_records( await session.commit() except Exception as e: - logger.error(f"Error processing Airtable record: {e!s}", exc_info=True) + logger.error( + f"Error processing Airtable record: {e!s}", exc_info=True + ) # Mark document as failed with reason (visible in UI) try: document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue @@ -446,7 +469,9 @@ async def index_airtable_records( total_processed = documents_indexed # Final commit to ensure all documents are persisted (safety net) - logger.info(f"Final commit: Total {documents_indexed} Airtable records processed") + logger.info( + f"Final commit: Total {documents_indexed} Airtable records processed" + ) try: await session.commit() logger.info( diff --git a/surfsense_backend/app/tasks/connector_indexers/base.py b/surfsense_backend/app/tasks/connector_indexers/base.py index b5b4e5559..da32e84a6 100644 --- a/surfsense_backend/app/tasks/connector_indexers/base.py +++ b/surfsense_backend/app/tasks/connector_indexers/base.py @@ -31,29 +31,30 @@ def get_current_timestamp() -> datetime: def safe_set_chunks(document: Document, chunks: list) -> None: """ Safely assign chunks to a document without triggering lazy loading. - + ALWAYS use this instead of `document.chunks = chunks` to avoid SQLAlchemy async errors (MissingGreenlet / greenlet_spawn). 
- + Why this is needed: - Direct assignment `document.chunks = chunks` triggers SQLAlchemy to load the OLD chunks first (for comparison/orphan detection) - This lazy loading fails in async context with asyncpg driver - set_committed_value bypasses this by setting the value directly - + This function is safe regardless of how the document was loaded (with or without selectinload). - + Args: document: The Document object to update chunks: List of Chunk objects to assign - + Example: # Instead of: document.chunks = chunks (DANGEROUS!) safe_set_chunks(document, chunks) # Always safe """ from sqlalchemy.orm.attributes import set_committed_value - set_committed_value(document, 'chunks', chunks) + + set_committed_value(document, "chunks", chunks) async def check_duplicate_document_by_hash( diff --git a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py index fbf90b345..d60884539 100644 --- a/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/bookstack_indexer.py @@ -261,7 +261,9 @@ async def index_bookstack_pages( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() logger.info( f"Document for BookStack page {page_name} unchanged. Skipping." 
@@ -270,20 +272,22 @@ async def index_bookstack_pages( continue # Queue existing document for update (will be set to processing in Phase 2) - pages_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'page_id': page_id, - 'page_name': page_name, - 'page_slug': page_slug, - 'book_id': book_id, - 'book_slug': book_slug, - 'chapter_id': chapter_id, - 'page_url': page_url, - 'page_content': page_content, - 'full_content': full_content, - 'content_hash': content_hash, - }) + pages_to_process.append( + { + "document": existing_document, + "is_new": False, + "page_id": page_id, + "page_name": page_name, + "page_slug": page_slug, + "book_id": book_id, + "book_slug": book_slug, + "chapter_id": chapter_id, + "page_url": page_url, + "page_content": page_content, + "full_content": full_content, + "content_hash": content_hash, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -331,20 +335,22 @@ async def index_bookstack_pages( session.add(document) new_documents_created = True - pages_to_process.append({ - 'document': document, - 'is_new': True, - 'page_id': page_id, - 'page_name': page_name, - 'page_slug': page_slug, - 'book_id': book_id, - 'book_slug': book_slug, - 'chapter_id': chapter_id, - 'page_url': page_url, - 'page_content': page_content, - 'full_content': full_content, - 'content_hash': content_hash, - }) + pages_to_process.append( + { + "document": document, + "is_new": True, + "page_id": page_id, + "page_name": page_name, + "page_slug": page_slug, + "book_id": book_id, + "book_slug": book_slug, + "chapter_id": chapter_id, + "page_url": page_url, + "page_content": page_content, + "full_content": full_content, + "content_hash": content_hash, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True) @@ -353,7 +359,9 @@ async def index_bookstack_pages( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([p for p in 
pages_to_process if p['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -370,7 +378,7 @@ async def index_bookstack_pages( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -383,23 +391,23 @@ async def index_bookstack_pages( # Build document metadata doc_metadata = { - "page_id": item['page_id'], - "page_name": item['page_name'], - "page_slug": item['page_slug'], - "book_id": item['book_id'], - "book_slug": item['book_slug'], - "chapter_id": item['chapter_id'], + "page_id": item["page_id"], + "page_name": item["page_name"], + "page_slug": item["page_slug"], + "book_id": item["book_id"], + "book_slug": item["book_slug"], + "chapter_id": item["chapter_id"], "base_url": bookstack_base_url, - "page_url": item['page_url'], + "page_url": item["page_url"], "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "connector_id": connector_id, } if user_llm: summary_metadata = { - "page_name": item['page_name'], - "page_id": item['page_id'], - "book_id": item['book_id'], + "page_name": item["page_name"], + "page_id": item["page_id"], + "book_id": item["book_id"], "document_type": "BookStack Page", "connector_type": "BookStack", } @@ -407,17 +415,15 @@ async def index_bookstack_pages( summary_content, summary_embedding, ) = await generate_document_summary( - item['full_content'], user_llm, summary_metadata + item["full_content"], user_llm, summary_metadata ) else: # Fallback to simple summary if no LLM configured - summary_content = ( - f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n" - ) - if item['page_content']: + summary_content 
= f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n" + if item["page_content"]: # Take first 1000 characters of content for summary - content_preview = item['page_content'][:1000] - if len(item['page_content']) > 1000: + content_preview = item["page_content"][:1000] + if len(item["page_content"]) > 1000: content_preview += "..." summary_content += f"Content Preview: {content_preview}\n\n" summary_embedding = config.embedding_model_instance.embed( @@ -425,12 +431,12 @@ async def index_bookstack_pages( ) # Process chunks - using the full page content - chunks = await create_document_chunks(item['full_content']) + chunks = await create_document_chunks(item["full_content"]) # Update document to READY with actual content - document.title = item['page_name'] + document.title = item["page_name"] document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = doc_metadata safe_set_chunks(document, chunks) @@ -456,7 +462,9 @@ async def index_bookstack_pages( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) skipped_pages.append( f"{item.get('page_name', 'Unknown')} (processing error)" ) @@ -473,7 +481,9 @@ async def index_bookstack_pages( ) try: await session.commit() - logger.info("Successfully committed all BookStack document changes to database") + logger.info( + "Successfully committed all BookStack document changes to database" + ) except Exception as e: # Handle any remaining integrity errors gracefully (race conditions, etc.) 
if ( diff --git a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py index 934e56744..47c5d8b3b 100644 --- a/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/clickup_indexer.py @@ -260,7 +260,9 @@ async def index_clickup_tasks( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() logger.info( f"Document for ClickUp task {task_name} unchanged. Skipping." @@ -272,22 +274,24 @@ async def index_clickup_tasks( logger.info( f"Content changed for ClickUp task {task_name}. Queuing for update." ) - tasks_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'task_content': task_content, - 'content_hash': content_hash, - 'task_id': task_id, - 'task_name': task_name, - 'task_status': task_status, - 'task_priority': task_priority, - 'task_list_name': task_list_name, - 'task_space_name': task_space_name, - 'task_assignees': task_assignees, - 'task_due_date': task_due_date, - 'task_created': task_created, - 'task_updated': task_updated, - }) + tasks_to_process.append( + { + "document": existing_document, + "is_new": False, + "task_content": task_content, + "content_hash": content_hash, + "task_id": task_id, + "task_name": task_name, + "task_status": task_status, + "task_priority": task_priority, + "task_list_name": task_list_name, + "task_space_name": task_space_name, + "task_assignees": task_assignees, + "task_due_date": task_due_date, + "task_created": task_created, + "task_updated": task_updated, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -335,22 
+339,24 @@ async def index_clickup_tasks( session.add(document) new_documents_created = True - tasks_to_process.append({ - 'document': document, - 'is_new': True, - 'task_content': task_content, - 'content_hash': content_hash, - 'task_id': task_id, - 'task_name': task_name, - 'task_status': task_status, - 'task_priority': task_priority, - 'task_list_name': task_list_name, - 'task_space_name': task_space_name, - 'task_assignees': task_assignees, - 'task_due_date': task_due_date, - 'task_created': task_created, - 'task_updated': task_updated, - }) + tasks_to_process.append( + { + "document": document, + "is_new": True, + "task_content": task_content, + "content_hash": content_hash, + "task_id": task_id, + "task_name": task_name, + "task_status": task_status, + "task_priority": task_priority, + "task_list_name": task_list_name, + "task_space_name": task_space_name, + "task_assignees": task_assignees, + "task_due_date": task_due_date, + "task_created": task_created, + "task_updated": task_updated, + } + ) except Exception as e: logger.error( @@ -362,7 +368,9 @@ async def index_clickup_tasks( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([t for t in tasks_to_process if t['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([t for t in tasks_to_process if t['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -379,7 +387,7 @@ async def index_clickup_tasks( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -392,13 +400,13 @@ async def index_clickup_tasks( if user_llm: document_metadata_for_summary = { - "task_id": item['task_id'], - "task_name": item['task_name'], - 
"task_status": item['task_status'], - "task_priority": item['task_priority'], - "task_list": item['task_list_name'], - "task_space": item['task_space_name'], - "assignees": len(item['task_assignees']), + "task_id": item["task_id"], + "task_name": item["task_name"], + "task_status": item["task_status"], + "task_priority": item["task_priority"], + "task_list": item["task_list_name"], + "task_space": item["task_space_name"], + "assignees": len(item["task_assignees"]), "document_type": "ClickUp Task", "connector_type": "ClickUp", } @@ -406,30 +414,30 @@ async def index_clickup_tasks( summary_content, summary_embedding, ) = await generate_document_summary( - item['task_content'], user_llm, document_metadata_for_summary + item["task_content"], user_llm, document_metadata_for_summary ) else: - summary_content = item['task_content'] + summary_content = item["task_content"] summary_embedding = config.embedding_model_instance.embed( - item['task_content'] + item["task_content"] ) - chunks = await create_document_chunks(item['task_content']) + chunks = await create_document_chunks(item["task_content"]) # Update document to READY with actual content - document.title = item['task_name'] + document.title = item["task_name"] document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = { - "task_id": item['task_id'], - "task_name": item['task_name'], - "task_status": item['task_status'], - "task_priority": item['task_priority'], - "task_assignees": item['task_assignees'], - "task_due_date": item['task_due_date'], - "task_created": item['task_created'], - "task_updated": item['task_updated'], + "task_id": item["task_id"], + "task_name": item["task_name"], + "task_status": item["task_status"], + "task_priority": item["task_priority"], + "task_assignees": item["task_assignees"], + "task_due_date": item["task_due_date"], + "task_created": 
item["task_created"], + "task_updated": item["task_updated"], "connector_id": connector_id, "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), } @@ -456,7 +464,9 @@ async def index_clickup_tasks( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue diff --git a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py index 7fd842996..a3a059d4e 100644 --- a/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/confluence_indexer.py @@ -262,23 +262,27 @@ async def index_confluence_pages( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue # Queue existing document for update (will be set to processing in Phase 2) - pages_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'full_content': full_content, - 'page_content': page_content, - 'content_hash': content_hash, - 'page_id': page_id, - 'page_title': page_title, - 'space_id': space_id, - 'comment_count': comment_count, - }) + pages_to_process.append( + { + "document": existing_document, + "is_new": False, + "full_content": full_content, + "page_content": page_content, + "content_hash": content_hash, + "page_id": page_id, + "page_title": page_title, + "space_id": space_id, + "comment_count": comment_count, + } + 
) continue # Document doesn't exist by unique_identifier_hash @@ -323,17 +327,19 @@ async def index_confluence_pages( session.add(document) new_documents_created = True - pages_to_process.append({ - 'document': document, - 'is_new': True, - 'full_content': full_content, - 'page_content': page_content, - 'content_hash': content_hash, - 'page_id': page_id, - 'page_title': page_title, - 'space_id': space_id, - 'comment_count': comment_count, - }) + pages_to_process.append( + { + "document": document, + "is_new": True, + "full_content": full_content, + "page_content": page_content, + "content_hash": content_hash, + "page_id": page_id, + "page_title": page_title, + "space_id": space_id, + "comment_count": comment_count, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True) @@ -342,7 +348,9 @@ async def index_confluence_pages( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -359,7 +367,7 @@ async def index_confluence_pages( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -372,10 +380,10 @@ async def index_confluence_pages( if user_llm: document_metadata = { - "page_title": item['page_title'], - "page_id": item['page_id'], - "space_id": item['space_id'], - "comment_count": item['comment_count'], + "page_title": item["page_title"], + "page_id": item["page_id"], + "space_id": item["space_id"], + "comment_count": item["comment_count"], "document_type": 
"Confluence Page", "connector_type": "Confluence", } @@ -383,17 +391,15 @@ async def index_confluence_pages( summary_content, summary_embedding, ) = await generate_document_summary( - item['full_content'], user_llm, document_metadata + item["full_content"], user_llm, document_metadata ) else: # Fallback to simple summary if no LLM configured - summary_content = ( - f"Confluence Page: {item['page_title']}\n\nSpace ID: {item['space_id']}\n\n" - ) - if item['page_content']: + summary_content = f"Confluence Page: {item['page_title']}\n\nSpace ID: {item['space_id']}\n\n" + if item["page_content"]: # Take first 1000 characters of content for summary - content_preview = item['page_content'][:1000] - if len(item['page_content']) > 1000: + content_preview = item["page_content"][:1000] + if len(item["page_content"]) > 1000: content_preview += "..." summary_content += f"Content Preview: {content_preview}\n\n" summary_content += f"Comments: {item['comment_count']}" @@ -402,18 +408,18 @@ async def index_confluence_pages( ) # Process chunks - using the full page content with comments - chunks = await create_document_chunks(item['full_content']) + chunks = await create_document_chunks(item["full_content"]) # Update document to READY with actual content - document.title = item['page_title'] + document.title = item["page_title"] document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = { - "page_id": item['page_id'], - "page_title": item['page_title'], - "space_id": item['space_id'], - "comment_count": item['comment_count'], + "page_id": item["page_id"], + "page_title": item["page_title"], + "space_id": item["space_id"], + "comment_count": item["comment_count"], "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "connector_id": connector_id, } @@ -440,7 +446,9 @@ async def index_confluence_pages( document.status = 
DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue # Skip this page and continue with others diff --git a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py index e5f333531..1595897a0 100644 --- a/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/discord_indexer.py @@ -352,9 +352,7 @@ async def index_discord_messages( try: channels = await discord_client.get_text_channels(guild_id) if not channels: - logger.info( - f"No channels found in guild {guild_name}. Skipping." - ) + logger.info(f"No channels found in guild {guild_name}. Skipping.") skipped_channels.append(f"{guild_name} (no channels)") else: for channel in channels: @@ -456,25 +454,31 @@ async def index_discord_messages( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): - existing_document.status = DocumentStatus.ready() + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = ( + DocumentStatus.ready() + ) documents_skipped += 1 continue # Queue existing document for update (will be set to processing in Phase 2) - messages_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'combined_document_string': combined_document_string, - 'content_hash': content_hash, - 'guild_name': guild_name, - 'guild_id': guild_id, - 'channel_name': channel_name, - 'channel_id': channel_id, - 'message_id': msg_id, - 'message_timestamp': msg_timestamp, - 
'message_user_name': msg_user_name, - }) + messages_to_process.append( + { + "document": existing_document, + "is_new": False, + "combined_document_string": combined_document_string, + "content_hash": content_hash, + "guild_name": guild_name, + "guild_id": guild_id, + "channel_name": channel_name, + "channel_id": channel_id, + "message_id": msg_id, + "message_timestamp": msg_timestamp, + "message_user_name": msg_user_name, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -522,19 +526,21 @@ async def index_discord_messages( session.add(document) new_documents_created = True - messages_to_process.append({ - 'document': document, - 'is_new': True, - 'combined_document_string': combined_document_string, - 'content_hash': content_hash, - 'guild_name': guild_name, - 'guild_id': guild_id, - 'channel_name': channel_name, - 'channel_id': channel_id, - 'message_id': msg_id, - 'message_timestamp': msg_timestamp, - 'message_user_name': msg_user_name, - }) + messages_to_process.append( + { + "document": document, + "is_new": True, + "combined_document_string": combined_document_string, + "content_hash": content_hash, + "guild_name": guild_name, + "guild_id": guild_id, + "channel_name": channel_name, + "channel_id": channel_id, + "message_id": msg_id, + "message_timestamp": msg_timestamp, + "message_user_name": msg_user_name, + } + ) except Exception as e: logger.error( @@ -547,7 +553,9 @@ async def index_discord_messages( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -564,31 +572,31 @@ async def index_discord_messages( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = 
current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() await session.commit() # Heavy processing (embeddings, chunks) - chunks = await create_document_chunks(item['combined_document_string']) + chunks = await create_document_chunks(item["combined_document_string"]) doc_embedding = config.embedding_model_instance.embed( - item['combined_document_string'] + item["combined_document_string"] ) # Update document to READY with actual content document.title = f"{item['guild_name']}#{item['channel_name']}" - document.content = item['combined_document_string'] - document.content_hash = item['content_hash'] + document.content = item["combined_document_string"] + document.content_hash = item["content_hash"] document.embedding = doc_embedding document.document_metadata = { - "guild_name": item['guild_name'], - "guild_id": item['guild_id'], - "channel_name": item['channel_name'], - "channel_id": item['channel_id'], - "message_id": item['message_id'], - "message_timestamp": item['message_timestamp'], - "message_user_name": item['message_user_name'], + "guild_name": item["guild_name"], + "guild_id": item["guild_id"], + "channel_name": item["channel_name"], + "channel_id": item["channel_id"], + "message_id": item["message_id"], + "message_timestamp": item["message_timestamp"], + "message_user_name": item["message_user_name"], "indexed_at": datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S"), "connector_id": connector_id, } @@ -612,7 +620,9 @@ async def index_discord_messages( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue diff --git 
a/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py b/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py index 97cd31a09..212afff39 100644 --- a/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/elasticsearch_indexer.py @@ -253,7 +253,9 @@ async def index_elasticsearch_documents( # If content is unchanged, skip. Otherwise queue for update. if existing_doc.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_doc.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_doc.status, DocumentStatus.READY + ): existing_doc.status = DocumentStatus.ready() logger.info( f"Skipping ES doc {doc_id} — already indexed (doc id {existing_doc.id})" @@ -262,17 +264,19 @@ async def index_elasticsearch_documents( continue # Queue existing document for update (will be set to processing in Phase 2) - docs_to_process.append({ - 'document': existing_doc, - 'is_new': False, - 'doc_id': doc_id, - 'title': title, - 'content': content, - 'content_hash': content_hash, - 'unique_identifier_hash': unique_identifier_hash, - 'hit': hit, - 'source': source, - }) + docs_to_process.append( + { + "document": existing_doc, + "is_new": False, + "doc_id": doc_id, + "title": title, + "content": content, + "content_hash": content_hash, + "unique_identifier_hash": unique_identifier_hash, + "hit": hit, + "source": source, + } + ) hits_collected += 1 continue @@ -310,17 +314,19 @@ async def index_elasticsearch_documents( session.add(document) new_documents_created = True - docs_to_process.append({ - 'document': document, - 'is_new': True, - 'doc_id': doc_id, - 'title': title, - 'content': content, - 'content_hash': content_hash, - 'unique_identifier_hash': unique_identifier_hash, - 'hit': hit, - 'source': source, - }) + docs_to_process.append( + { + "document": document, + "is_new": 
True, + "doc_id": doc_id, + "title": title, + "content": content, + "content_hash": content_hash, + "unique_identifier_hash": unique_identifier_hash, + "hit": hit, + "source": source, + } + ) hits_collected += 1 except Exception as e: @@ -330,7 +336,9 @@ async def index_elasticsearch_documents( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([d for d in docs_to_process if d['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([d for d in docs_to_process if d['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -347,7 +355,7 @@ async def index_elasticsearch_documents( await on_heartbeat_callback(documents_processed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -355,9 +363,9 @@ async def index_elasticsearch_documents( # Build metadata metadata = { - "elasticsearch_id": item['doc_id'], - "elasticsearch_index": item['hit'].get("_index", index_name), - "elasticsearch_score": item['hit'].get("_score"), + "elasticsearch_id": item["doc_id"], + "elasticsearch_index": item["hit"].get("_index", index_name), + "elasticsearch_score": item["hit"].get("_score"), "indexed_at": datetime.now().isoformat(), "source": "ELASTICSEARCH_CONNECTOR", "connector_id": connector_id, @@ -366,17 +374,17 @@ async def index_elasticsearch_documents( # Add any additional metadata fields specified in config if "ELASTICSEARCH_METADATA_FIELDS" in config: for field in config["ELASTICSEARCH_METADATA_FIELDS"]: - if field in item['source']: - metadata[f"es_{field}"] = item['source'][field] + if field in item["source"]: + metadata[f"es_{field}"] = item["source"][field] # Create chunks - chunks = await create_document_chunks(item['content']) + 
chunks = await create_document_chunks(item["content"]) # Update document to READY with actual content - document.title = item['title'] - document.content = item['content'] - document.content_hash = item['content_hash'] - document.unique_identifier_hash = item['unique_identifier_hash'] + document.title = item["title"] + document.content = item["content"] + document.content_hash = item["content_hash"] + document.unique_identifier_hash = item["unique_identifier_hash"] document.document_metadata = metadata safe_set_chunks(document, chunks) document.updated_at = get_current_timestamp() @@ -399,7 +407,9 @@ async def index_elasticsearch_documents( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue @@ -411,10 +421,14 @@ async def index_elasticsearch_documents( ) # Final commit for any remaining documents not yet committed in batches - logger.info(f"Final commit: Total {documents_processed} Elasticsearch documents processed") + logger.info( + f"Final commit: Total {documents_processed} Elasticsearch documents processed" + ) try: await session.commit() - logger.info("Successfully committed all Elasticsearch document changes to database") + logger.info( + "Successfully committed all Elasticsearch document changes to database" + ) except Exception as e: # Handle any remaining integrity errors gracefully (race conditions, etc.) 
if ( diff --git a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py index b37989a84..e1a1ddd4d 100644 --- a/surfsense_backend/app/tasks/connector_indexers/github_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/github_indexer.py @@ -17,7 +17,7 @@ from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession from app.config import config -from app.connectors.github_connector import GitHubConnector, RepositoryDigest +from app.connectors.github_connector import GitHubConnector from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.services.llm_service import get_user_long_context_llm from app.services.task_logging_service import TaskLoggingService @@ -237,7 +237,9 @@ async def index_github_repos( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() logger.info(f"Repository {repo_full_name} unchanged. Skipping.") documents_skipped += 1 @@ -247,14 +249,16 @@ async def index_github_repos( logger.info( f"Content changed for repository {repo_full_name}. Queuing for update." 
) - repos_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'digest': digest, - 'content_hash': content_hash, - 'repo_full_name': repo_full_name, - 'unique_identifier_hash': unique_identifier_hash, - }) + repos_to_process.append( + { + "document": existing_document, + "is_new": False, + "digest": digest, + "content_hash": content_hash, + "repo_full_name": repo_full_name, + "unique_identifier_hash": unique_identifier_hash, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -298,14 +302,16 @@ async def index_github_repos( session.add(document) new_documents_created = True - repos_to_process.append({ - 'document': document, - 'is_new': True, - 'digest': digest, - 'content_hash': content_hash, - 'repo_full_name': repo_full_name, - 'unique_identifier_hash': unique_identifier_hash, - }) + repos_to_process.append( + { + "document": document, + "is_new": True, + "digest": digest, + "content_hash": content_hash, + "repo_full_name": repo_full_name, + "unique_identifier_hash": unique_identifier_hash, + } + ) except Exception as repo_err: logger.error( @@ -317,7 +323,9 @@ async def index_github_repos( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([r for r in repos_to_process if r['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([r for r in repos_to_process if r['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -334,9 +342,9 @@ async def index_github_repos( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] - digest = item['digest'] - repo_full_name = item['repo_full_name'] + document = item["document"] + digest = item["digest"] + repo_full_name = item["repo_full_name"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only @@ -353,7 +361,9 @@ async def 
index_github_repos( "document_type": "GitHub Repository", "connector_type": "GitHub", "ingestion_method": "gitingest", - "file_tree": digest.tree[:2000] if len(digest.tree) > 2000 else digest.tree, + "file_tree": digest.tree[:2000] + if len(digest.tree) > 2000 + else digest.tree, "estimated_tokens": digest.estimated_tokens, } @@ -377,13 +387,17 @@ async def index_github_repos( f"## Summary\n{digest.summary}\n\n" f"## File Structure\n{digest.tree[:3000]}" ) - summary_embedding = config.embedding_model_instance.embed(summary_text) + summary_embedding = config.embedding_model_instance.embed( + summary_text + ) # Chunk the full digest content for granular search try: chunks_data = await create_document_chunks(digest.content) except Exception as chunk_err: - logger.error(f"Failed to chunk repository {repo_full_name}: {chunk_err}") + logger.error( + f"Failed to chunk repository {repo_full_name}: {chunk_err}" + ) chunks_data = await _simple_chunk_content(digest.content) # Update document to READY with actual content @@ -401,7 +415,7 @@ async def index_github_repos( document.title = repo_full_name document.content = summary_text - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = doc_metadata safe_set_chunks(document, chunks_data) @@ -433,7 +447,9 @@ async def index_github_repos( document.status = DocumentStatus.failed(str(repo_err)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) errors.append(f"Failed processing {repo_full_name}: {repo_err}") documents_failed += 1 continue @@ -442,7 +458,9 @@ async def index_github_repos( await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit - logger.info(f"Final commit: Total {documents_processed} 
GitHub repositories processed") + logger.info( + f"Final commit: Total {documents_processed} GitHub repositories processed" + ) try: await session.commit() logger.info( diff --git a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py index ad749e61c..822e58d36 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_calendar_indexer.py @@ -345,25 +345,29 @@ async def index_google_calendar_events( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue # Queue existing document for update (will be set to processing in Phase 2) - events_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'event_markdown': event_markdown, - 'content_hash': content_hash, - 'event_id': event_id, - 'event_summary': event_summary, - 'calendar_id': calendar_id, - 'start_time': start_time, - 'end_time': end_time, - 'location': location, - 'description': description, - }) + events_to_process.append( + { + "document": existing_document, + "is_new": False, + "event_markdown": event_markdown, + "content_hash": content_hash, + "event_id": event_id, + "event_summary": event_summary, + "calendar_id": calendar_id, + "start_time": start_time, + "end_time": end_time, + "location": location, + "description": description, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -411,19 +415,21 @@ async def index_google_calendar_events( session.add(document) new_documents_created = True - events_to_process.append({ - 
'document': document, - 'is_new': True, - 'event_markdown': event_markdown, - 'content_hash': content_hash, - 'event_id': event_id, - 'event_summary': event_summary, - 'calendar_id': calendar_id, - 'start_time': start_time, - 'end_time': end_time, - 'location': location, - 'description': description, - }) + events_to_process.append( + { + "document": document, + "is_new": True, + "event_markdown": event_markdown, + "content_hash": content_hash, + "event_id": event_id, + "event_summary": event_summary, + "calendar_id": calendar_id, + "start_time": start_time, + "end_time": end_time, + "location": location, + "description": description, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True) @@ -432,7 +438,9 @@ async def index_google_calendar_events( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -449,7 +457,7 @@ async def index_google_calendar_events( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -462,48 +470,53 @@ async def index_google_calendar_events( if user_llm: document_metadata_for_summary = { - "event_id": item['event_id'], - "event_summary": item['event_summary'], - "calendar_id": item['calendar_id'], - "start_time": item['start_time'], - "end_time": item['end_time'], - "location": item['location'] or "No location", + "event_id": item["event_id"], + "event_summary": item["event_summary"], + "calendar_id": item["calendar_id"], + 
"start_time": item["start_time"], + "end_time": item["end_time"], + "location": item["location"] or "No location", "document_type": "Google Calendar Event", "connector_type": "Google Calendar", } - summary_content, summary_embedding = await generate_document_summary( - item['event_markdown'], user_llm, document_metadata_for_summary + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["event_markdown"], user_llm, document_metadata_for_summary ) else: - summary_content = f"Google Calendar Event: {item['event_summary']}\n\n" + summary_content = ( + f"Google Calendar Event: {item['event_summary']}\n\n" + ) summary_content += f"Calendar: {item['calendar_id']}\n" summary_content += f"Start: {item['start_time']}\n" summary_content += f"End: {item['end_time']}\n" - if item['location']: + if item["location"]: summary_content += f"Location: {item['location']}\n" - if item['description']: - desc_preview = item['description'][:1000] - if len(item['description']) > 1000: + if item["description"]: + desc_preview = item["description"][:1000] + if len(item["description"]) > 1000: desc_preview += "..." 
summary_content += f"Description: {desc_preview}\n" summary_embedding = config.embedding_model_instance.embed( summary_content ) - chunks = await create_document_chunks(item['event_markdown']) + chunks = await create_document_chunks(item["event_markdown"]) # Update document to READY with actual content - document.title = item['event_summary'] + document.title = item["event_summary"] document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = { - "event_id": item['event_id'], - "event_summary": item['event_summary'], - "calendar_id": item['calendar_id'], - "start_time": item['start_time'], - "end_time": item['end_time'], - "location": item['location'], + "event_id": item["event_id"], + "event_summary": item["event_summary"], + "calendar_id": item["calendar_id"], + "start_time": item["start_time"], + "end_time": item["end_time"], + "location": item["location"], "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "connector_id": connector_id, } @@ -527,7 +540,9 @@ async def index_google_calendar_events( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index 8eae35d00..f7624cffe 100644 --- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -435,7 +435,7 @@ async def _index_full_scan( on_heartbeat_callback: HeartbeatCallbackType | None = None, ) -> tuple[int, int]: """Perform full scan indexing of a folder. 
- + Implements 2-phase document status updates for real-time UI feedback: - Phase 1: Collect all files and create pending documents (visible in UI immediately) - Phase 2: Process each file: pending → processing → ready/failed @@ -533,7 +533,9 @@ async def _index_full_scan( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([f for f in files_to_process if f[1] and f[1].id is None])} pending documents") + logger.info( + f"Phase 1: Committing {len([f for f in files_to_process if f[1] and f[1].id is None])} pending documents" + ) await session.commit() # ======================================================================= @@ -568,9 +570,7 @@ async def _index_full_scan( if documents_indexed % 10 == 0 and documents_indexed > 0: await session.commit() - logger.info( - f"Committed batch: {documents_indexed} files indexed so far" - ) + logger.info(f"Committed batch: {documents_indexed} files indexed so far") logger.info( f"Full scan complete: {documents_indexed} indexed, {documents_skipped} skipped, {documents_failed} failed" @@ -597,7 +597,7 @@ async def _index_with_delta_sync( Note: include_subfolders is accepted for API consistency but delta sync automatically tracks changes across all folders including subfolders. 
- + Implements 2-phase document status updates for real-time UI feedback: - Phase 1: Collect all changes and create pending documents (visible in UI immediately) - Phase 2: Process each file: pending → processing → ready/failed @@ -676,7 +676,7 @@ async def _index_with_delta_sync( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing pending documents") + logger.info("Phase 1: Committing pending documents") await session.commit() # ======================================================================= @@ -685,7 +685,7 @@ async def _index_with_delta_sync( # ======================================================================= logger.info(f"Phase 2: Processing {len(changes_to_process)} changes") - for change, file, pending_doc in changes_to_process: + for _, file, pending_doc in changes_to_process: # Check if it's time for a heartbeat update if on_heartbeat_callback: current_time = time.time() @@ -728,17 +728,17 @@ async def _create_pending_document_for_file( ) -> tuple[Document | None, bool]: """ Create a pending document for a Google Drive file if it doesn't exist. - + This is Phase 1 of the 2-phase document status update pattern. Creates documents with 'pending' status so they appear in UI immediately. 
- + Args: session: Database session file: File metadata from Google Drive API connector_id: ID of the Drive connector search_space_id: ID of the search space user_id: ID of the user - + Returns: Tuple of (document, should_skip): - (existing_doc, False): Existing document that needs update @@ -746,28 +746,28 @@ async def _create_pending_document_for_file( - (None, True): File should be skipped (unchanged, rename-only, or folder) """ from app.connectors.google_drive.file_types import should_skip_file - + file_id = file.get("id") file_name = file.get("name", "Unknown") mime_type = file.get("mimeType", "") - + # Skip folders and shortcuts if should_skip_file(mime_type): return None, True - + if not file_id: return None, True - + # Generate unique identifier hash for this file unique_identifier_hash = generate_unique_identifier_hash( DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id ) - + # Check if document exists existing_document = await check_document_by_unique_identifier( session, unique_identifier_hash ) - + if existing_document: # Check if this is a rename-only update (content unchanged) incoming_md5 = file.get("md5Checksum") @@ -775,7 +775,7 @@ async def _create_pending_document_for_file( doc_metadata = existing_document.document_metadata or {} stored_md5 = doc_metadata.get("md5_checksum") stored_modified_time = doc_metadata.get("modified_time") - + # Determine if content changed content_unchanged = False if incoming_md5 and stored_md5: @@ -783,16 +783,18 @@ async def _create_pending_document_for_file( elif not incoming_md5 and incoming_modified_time and stored_modified_time: # Google Workspace file - use modifiedTime as fallback content_unchanged = incoming_modified_time == stored_modified_time - + if content_unchanged: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + 
): existing_document.status = DocumentStatus.ready() return None, True - + # Content changed - return existing document for update return existing_document, False - + # Create new pending document document = Document( search_space_id=search_space_id, @@ -815,7 +817,7 @@ async def _create_pending_document_for_file( connector_id=connector_id, ) session.add(document) - + return document, False @@ -958,7 +960,7 @@ async def _process_single_file( ) -> tuple[int, int, int]: """ Process a single file by downloading and using Surfsense's file processor. - + Implements Phase 2 of the 2-phase document status update pattern. Updates document status: pending → processing → ready/failed @@ -1042,12 +1044,13 @@ async def _process_single_file( processed_doc = await check_document_by_unique_identifier( session, unique_identifier_hash ) - if processed_doc: - # Ensure status is READY - if not DocumentStatus.is_state(processed_doc.status, DocumentStatus.READY): - processed_doc.status = DocumentStatus.ready() - processed_doc.updated_at = get_current_timestamp() - await session.commit() + # Ensure status is READY + if processed_doc and not DocumentStatus.is_state( + processed_doc.status, DocumentStatus.READY + ): + processed_doc.status = DocumentStatus.ready() + processed_doc.updated_at = get_current_timestamp() + await session.commit() logger.info(f"Successfully indexed Google Drive file: {file_name}") return 1, 0, 0 @@ -1061,7 +1064,9 @@ async def _process_single_file( pending_document.updated_at = get_current_timestamp() await session.commit() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) return 0, 0, 1 diff --git a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py index 89e8796d3..c7caee4da 100644 --- 
a/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_gmail_indexer.py @@ -228,7 +228,9 @@ async def index_google_gmail_messages( documents_indexed = 0 documents_skipped = 0 documents_failed = 0 # Track messages that failed processing - duplicate_content_count = 0 # Track messages skipped due to duplicate content_hash + duplicate_content_count = ( + 0 # Track messages skipped due to duplicate content_hash + ) # Heartbeat tracking - update notification periodically to prevent appearing stuck last_heartbeat_time = time.time() @@ -294,23 +296,27 @@ async def index_google_gmail_messages( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue # Queue existing document for update (will be set to processing in Phase 2) - messages_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'markdown_content': markdown_content, - 'content_hash': content_hash, - 'message_id': message_id, - 'thread_id': thread_id, - 'subject': subject, - 'sender': sender, - 'date_str': date_str, - }) + messages_to_process.append( + { + "document": existing_document, + "is_new": False, + "markdown_content": markdown_content, + "content_hash": content_hash, + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date_str": date_str, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -356,17 +362,19 @@ async def index_google_gmail_messages( session.add(document) new_documents_created = True - messages_to_process.append({ - 'document': document, - 'is_new': True, - 
'markdown_content': markdown_content, - 'content_hash': content_hash, - 'message_id': message_id, - 'thread_id': thread_id, - 'subject': subject, - 'sender': sender, - 'date_str': date_str, - }) + messages_to_process.append( + { + "document": document, + "is_new": True, + "markdown_content": markdown_content, + "content_hash": content_hash, + "message_id": message_id, + "thread_id": thread_id, + "subject": subject, + "sender": sender, + "date_str": date_str, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for message: {e!s}", exc_info=True) @@ -375,7 +383,9 @@ async def index_google_gmail_messages( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -392,7 +402,7 @@ async def index_google_gmail_messages( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -405,16 +415,21 @@ async def index_google_gmail_messages( if user_llm: document_metadata_for_summary = { - "message_id": item['message_id'], - "thread_id": item['thread_id'], - "subject": item['subject'], - "sender": item['sender'], - "date": item['date_str'], + "message_id": item["message_id"], + "thread_id": item["thread_id"], + "subject": item["subject"], + "sender": item["sender"], + "date": item["date_str"], "document_type": "Gmail Message", "connector_type": "Google Gmail", } - summary_content, summary_embedding = await generate_document_summary( - item['markdown_content'], user_llm, document_metadata_for_summary + ( + 
summary_content, + summary_embedding, + ) = await generate_document_summary( + item["markdown_content"], + user_llm, + document_metadata_for_summary, ) else: summary_content = f"Google Gmail Message: {item['subject']}\n\n" @@ -424,19 +439,19 @@ async def index_google_gmail_messages( summary_content ) - chunks = await create_document_chunks(item['markdown_content']) + chunks = await create_document_chunks(item["markdown_content"]) # Update document to READY with actual content - document.title = item['subject'] + document.title = item["subject"] document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = { - "message_id": item['message_id'], - "thread_id": item['thread_id'], - "subject": item['subject'], - "sender": item['sender'], - "date": item['date_str'], + "message_id": item["message_id"], + "thread_id": item["thread_id"], + "subject": item["subject"], + "sender": item["sender"], + "date": item["date_str"], "connector_id": connector_id, } safe_set_chunks(document, chunks) @@ -459,7 +474,9 @@ async def index_google_gmail_messages( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue diff --git a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py index 038df0f46..65f56ce46 100644 --- a/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/jira_indexer.py @@ -239,23 +239,27 @@ async def index_jira_issues( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready 
(might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue # Queue existing document for update (will be set to processing in Phase 2) - issues_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'issue_content': issue_content, - 'content_hash': content_hash, - 'issue_id': issue_id, - 'issue_identifier': issue_identifier, - 'issue_title': issue_title, - 'formatted_issue': formatted_issue, - 'comment_count': comment_count, - }) + issues_to_process.append( + { + "document": existing_document, + "is_new": False, + "issue_content": issue_content, + "content_hash": content_hash, + "issue_id": issue_id, + "issue_identifier": issue_identifier, + "issue_title": issue_title, + "formatted_issue": formatted_issue, + "comment_count": comment_count, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -301,17 +305,19 @@ async def index_jira_issues( session.add(document) new_documents_created = True - issues_to_process.append({ - 'document': document, - 'is_new': True, - 'issue_content': issue_content, - 'content_hash': content_hash, - 'issue_id': issue_id, - 'issue_identifier': issue_identifier, - 'issue_title': issue_title, - 'formatted_issue': formatted_issue, - 'comment_count': comment_count, - }) + issues_to_process.append( + { + "document": document, + "is_new": True, + "issue_content": issue_content, + "content_hash": content_hash, + "issue_id": issue_id, + "issue_identifier": issue_identifier, + "issue_title": issue_title, + "formatted_issue": formatted_issue, + "comment_count": comment_count, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for issue: {e!s}", exc_info=True) @@ -320,7 +326,9 @@ async def index_jira_issues( # Commit all pending documents - they all appear in UI now if 
new_documents_created: - logger.info(f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -337,7 +345,7 @@ async def index_jira_issues( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -350,11 +358,11 @@ async def index_jira_issues( if user_llm: document_metadata = { - "issue_key": item['issue_identifier'], - "issue_title": item['issue_title'], - "status": item['formatted_issue'].get("status", "Unknown"), - "priority": item['formatted_issue'].get("priority", "Unknown"), - "comment_count": item['comment_count'], + "issue_key": item["issue_identifier"], + "issue_title": item["issue_title"], + "status": item["formatted_issue"].get("status", "Unknown"), + "priority": item["formatted_issue"].get("priority", "Unknown"), + "comment_count": item["comment_count"], "document_type": "Jira Issue", "connector_type": "Jira", } @@ -362,34 +370,32 @@ async def index_jira_issues( summary_content, summary_embedding, ) = await generate_document_summary( - item['issue_content'], user_llm, document_metadata + item["issue_content"], user_llm, document_metadata ) else: # Fallback to simple summary if no LLM configured summary_content = f"Jira Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['formatted_issue'].get('status', 'Unknown')}\n\n" - if item['formatted_issue'].get("description"): - summary_content += ( - f"Description: {item['formatted_issue'].get('description')}\n\n" - ) + if item["formatted_issue"].get("description"): + summary_content += f"Description: 
{item['formatted_issue'].get('description')}\n\n" summary_content += f"Comments: {item['comment_count']}" summary_embedding = config.embedding_model_instance.embed( summary_content ) # Process chunks - using the full issue content with comments - chunks = await create_document_chunks(item['issue_content']) + chunks = await create_document_chunks(item["issue_content"]) # Update document to READY with actual content document.title = f"{item['issue_identifier']}: {item['issue_title']}" document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = { - "issue_id": item['issue_id'], - "issue_identifier": item['issue_identifier'], - "issue_title": item['issue_title'], - "state": item['formatted_issue'].get("status", "Unknown"), - "comment_count": item['comment_count'], + "issue_id": item["issue_id"], + "issue_identifier": item["issue_identifier"], + "issue_title": item["issue_title"], + "state": item["formatted_issue"].get("status", "Unknown"), + "comment_count": item["comment_count"], "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "connector_id": connector_id, } @@ -416,7 +422,9 @@ async def index_jira_issues( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue # Skip this issue and continue with others diff --git a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py index 45e1e357a..87bafb3c0 100644 --- a/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/linear_indexer.py @@ -272,7 +272,9 @@ async def 
index_linear_issues( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() logger.info( f"Document for Linear issue {issue_identifier} unchanged. Skipping." @@ -281,19 +283,21 @@ async def index_linear_issues( continue # Queue existing document for update (will be set to processing in Phase 2) - issues_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'issue_content': issue_content, - 'content_hash': content_hash, - 'issue_id': issue_id, - 'issue_identifier': issue_identifier, - 'issue_title': issue_title, - 'state': state, - 'description': description, - 'comment_count': comment_count, - 'priority': priority, - }) + issues_to_process.append( + { + "document": existing_document, + "is_new": False, + "issue_content": issue_content, + "content_hash": content_hash, + "issue_id": issue_id, + "issue_identifier": issue_identifier, + "issue_title": issue_title, + "state": state, + "description": description, + "comment_count": comment_count, + "priority": priority, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -338,19 +342,21 @@ async def index_linear_issues( session.add(document) new_documents_created = True - issues_to_process.append({ - 'document': document, - 'is_new': True, - 'issue_content': issue_content, - 'content_hash': content_hash, - 'issue_id': issue_id, - 'issue_identifier': issue_identifier, - 'issue_title': issue_title, - 'state': state, - 'description': description, - 'comment_count': comment_count, - 'priority': priority, - }) + issues_to_process.append( + { + "document": document, + "is_new": True, + "issue_content": issue_content, + "content_hash": content_hash, + 
"issue_id": issue_id, + "issue_identifier": issue_identifier, + "issue_title": issue_title, + "state": state, + "description": description, + "comment_count": comment_count, + "priority": priority, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for issue: {e!s}", exc_info=True) @@ -359,7 +365,9 @@ async def index_linear_issues( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([i for i in issues_to_process if i['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -376,7 +384,7 @@ async def index_linear_issues( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -389,20 +397,23 @@ async def index_linear_issues( if user_llm: document_metadata_for_summary = { - "issue_id": item['issue_identifier'], - "issue_title": item['issue_title'], - "state": item['state'], - "priority": item['priority'], - "comment_count": item['comment_count'], + "issue_id": item["issue_identifier"], + "issue_title": item["issue_title"], + "state": item["state"], + "priority": item["priority"], + "comment_count": item["comment_count"], "document_type": "Linear Issue", "connector_type": "Linear", } - summary_content, summary_embedding = await generate_document_summary( - item['issue_content'], user_llm, document_metadata_for_summary + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["issue_content"], user_llm, document_metadata_for_summary ) else: # Fallback to simple summary if no LLM configured - description = item['description'] + description = 
item["description"] if description and len(description) > 1000: description = description[:997] + "..." summary_content = f"Linear Issue {item['issue_identifier']}: {item['issue_title']}\n\nStatus: {item['state']}\n\n" @@ -413,19 +424,19 @@ async def index_linear_issues( summary_content ) - chunks = await create_document_chunks(item['issue_content']) + chunks = await create_document_chunks(item["issue_content"]) # Update document to READY with actual content document.title = f"{item['issue_identifier']}: {item['issue_title']}" document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = { - "issue_id": item['issue_id'], - "issue_identifier": item['issue_identifier'], - "issue_title": item['issue_title'], - "state": item['state'], - "comment_count": item['comment_count'], + "issue_id": item["issue_id"], + "issue_identifier": item["issue_identifier"], + "issue_title": item["issue_title"], + "state": item["state"], + "comment_count": item["comment_count"], "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "connector_id": connector_id, } @@ -452,7 +463,9 @@ async def index_linear_issues( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) skipped_issues.append( f"{item.get('issue_identifier', 'Unknown')} (processing error)" ) @@ -466,7 +479,9 @@ async def index_linear_issues( logger.info(f"Final commit: Total {documents_indexed} Linear issues processed") try: await session.commit() - logger.info("Successfully committed all Linear document changes to database") + logger.info( + "Successfully committed all Linear document changes to database" + ) except Exception as e: # Handle any remaining 
integrity errors gracefully (race conditions, etc.) if ( diff --git a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py index 80d4ef3cf..04af80e53 100644 --- a/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/luma_indexer.py @@ -305,7 +305,9 @@ async def index_luma_events( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() logger.info( f"Document for Luma event {event_name} unchanged. Skipping." @@ -314,23 +316,25 @@ async def index_luma_events( continue # Queue existing document for update (will be set to processing in Phase 2) - events_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'event_id': event_id, - 'event_name': event_name, - 'event_url': event_url, - 'event_markdown': event_markdown, - 'content_hash': content_hash, - 'start_at': start_at, - 'end_at': end_at, - 'timezone': timezone, - 'location': location, - 'city': city, - 'host_names': host_names, - 'description': description, - 'cover_url': cover_url, - }) + events_to_process.append( + { + "document": existing_document, + "is_new": False, + "event_id": event_id, + "event_name": event_name, + "event_url": event_url, + "event_markdown": event_markdown, + "content_hash": content_hash, + "start_at": start_at, + "end_at": end_at, + "timezone": timezone, + "location": location, + "city": city, + "host_names": host_names, + "description": description, + "cover_url": cover_url, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -380,23 +384,25 @@ async def index_luma_events( 
session.add(document) new_documents_created = True - events_to_process.append({ - 'document': document, - 'is_new': True, - 'event_id': event_id, - 'event_name': event_name, - 'event_url': event_url, - 'event_markdown': event_markdown, - 'content_hash': content_hash, - 'start_at': start_at, - 'end_at': end_at, - 'timezone': timezone, - 'location': location, - 'city': city, - 'host_names': host_names, - 'description': description, - 'cover_url': cover_url, - }) + events_to_process.append( + { + "document": document, + "is_new": True, + "event_id": event_id, + "event_name": event_name, + "event_url": event_url, + "event_markdown": event_markdown, + "content_hash": content_hash, + "start_at": start_at, + "end_at": end_at, + "timezone": timezone, + "location": location, + "city": city, + "host_names": host_names, + "description": description, + "cover_url": cover_url, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for event: {e!s}", exc_info=True) @@ -405,7 +411,9 @@ async def index_luma_events( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([e for e in events_to_process if e['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -422,7 +430,7 @@ async def index_luma_events( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -435,15 +443,15 @@ async def index_luma_events( if user_llm: document_metadata_for_summary = { - "event_id": item['event_id'], - "event_name": item['event_name'], - "event_url": item['event_url'], - "start_at": item['start_at'], - "end_at": 
item['end_at'], - "timezone": item['timezone'], - "location": item['location'] or "No location", - "city": item['city'], - "hosts": item['host_names'], + "event_id": item["event_id"], + "event_name": item["event_name"], + "event_url": item["event_url"], + "start_at": item["start_at"], + "end_at": item["end_at"], + "timezone": item["timezone"], + "location": item["location"] or "No location", + "city": item["city"], + "hosts": item["host_names"], "document_type": "Luma Event", "connector_type": "Luma", } @@ -451,26 +459,26 @@ async def index_luma_events( summary_content, summary_embedding, ) = await generate_document_summary( - item['event_markdown'], user_llm, document_metadata_for_summary + item["event_markdown"], user_llm, document_metadata_for_summary ) else: # Fallback to simple summary if no LLM configured summary_content = f"Luma Event: {item['event_name']}\n\n" - if item['event_url']: + if item["event_url"]: summary_content += f"URL: {item['event_url']}\n" summary_content += f"Start: {item['start_at']}\n" summary_content += f"End: {item['end_at']}\n" - if item['timezone']: + if item["timezone"]: summary_content += f"Timezone: {item['timezone']}\n" - if item['location']: + if item["location"]: summary_content += f"Location: {item['location']}\n" - if item['city']: + if item["city"]: summary_content += f"City: {item['city']}\n" - if item['host_names']: + if item["host_names"]: summary_content += f"Hosts: {item['host_names']}\n" - if item['description']: - desc_preview = item['description'][:1000] - if len(item['description']) > 1000: + if item["description"]: + desc_preview = item["description"][:1000] + if len(item["description"]) > 1000: desc_preview += "..." 
summary_content += f"Description: {desc_preview}\n" @@ -478,24 +486,24 @@ async def index_luma_events( summary_content ) - chunks = await create_document_chunks(item['event_markdown']) + chunks = await create_document_chunks(item["event_markdown"]) # Update document to READY with actual content - document.title = item['event_name'] + document.title = item["event_name"] document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = { - "event_id": item['event_id'], - "event_name": item['event_name'], - "event_url": item['event_url'], - "start_at": item['start_at'], - "end_at": item['end_at'], - "timezone": item['timezone'], - "location": item['location'], - "city": item['city'], - "hosts": item['host_names'], - "cover_url": item['cover_url'], + "event_id": item["event_id"], + "event_name": item["event_name"], + "event_url": item["event_url"], + "start_at": item["start_at"], + "end_at": item["end_at"], + "timezone": item["timezone"], + "location": item["location"], + "city": item["city"], + "hosts": item["host_names"], + "cover_url": item["cover_url"], "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "connector_id": connector_id, } @@ -522,7 +530,9 @@ async def index_luma_events( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) skipped_events.append( f"{item.get('event_name', 'Unknown')} (processing error)" ) diff --git a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py index 37927b779..52704e173 100644 --- a/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py +++ 
b/surfsense_backend/app/tasks/connector_indexers/notion_indexer.py @@ -354,20 +354,24 @@ async def index_notion_pages( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() documents_skipped += 1 continue # Queue existing document for update (will be set to processing in Phase 2) - pages_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'markdown_content': markdown_content, - 'content_hash': content_hash, - 'page_id': page_id, - 'page_title': page_title, - }) + pages_to_process.append( + { + "document": existing_document, + "is_new": False, + "markdown_content": markdown_content, + "content_hash": content_hash, + "page_id": page_id, + "page_title": page_title, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -410,14 +414,16 @@ async def index_notion_pages( session.add(document) new_documents_created = True - pages_to_process.append({ - 'document': document, - 'is_new': True, - 'markdown_content': markdown_content, - 'content_hash': content_hash, - 'page_id': page_id, - 'page_title': page_title, - }) + pages_to_process.append( + { + "document": document, + "is_new": True, + "markdown_content": markdown_content, + "content_hash": content_hash, + "page_id": page_id, + "page_title": page_title, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for page: {e!s}", exc_info=True) @@ -426,7 +432,9 @@ async def index_notion_pages( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([p for p in pages_to_process if p['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([p for p in 
pages_to_process if p['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -443,7 +451,7 @@ async def index_notion_pages( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() @@ -456,13 +464,18 @@ async def index_notion_pages( if user_llm: document_metadata_for_summary = { - "page_title": item['page_title'], - "page_id": item['page_id'], + "page_title": item["page_title"], + "page_id": item["page_id"], "document_type": "Notion Page", "connector_type": "Notion", } - summary_content, summary_embedding = await generate_document_summary( - item['markdown_content'], user_llm, document_metadata_for_summary + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( + item["markdown_content"], + user_llm, + document_metadata_for_summary, ) else: # Fallback to simple summary if no LLM configured @@ -471,16 +484,16 @@ async def index_notion_pages( summary_content ) - chunks = await create_document_chunks(item['markdown_content']) + chunks = await create_document_chunks(item["markdown_content"]) # Update document to READY with actual content - document.title = item['page_title'] + document.title = item["page_title"] document.content = summary_content - document.content_hash = item['content_hash'] + document.content_hash = item["content_hash"] document.embedding = summary_embedding document.document_metadata = { - "page_title": item['page_title'], - "page_id": item['page_id'], + "page_title": item["page_title"], + "page_id": item["page_id"], "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "connector_id": connector_id, } @@ -504,7 +517,9 @@ async def index_notion_pages( document.status = DocumentStatus.failed(str(e)) document.updated_at = 
get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) skipped_pages.append(f"{item['page_title']} (processing error)") documents_failed += 1 continue diff --git a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py index 0e6934e2c..6dea1a730 100644 --- a/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/obsidian_indexer.py @@ -382,27 +382,31 @@ async def index_obsidian_vault( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() logger.debug(f"Note {title} unchanged, skipping") skipped_count += 1 continue # Queue existing document for update (will be set to processing in Phase 2) - files_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'file_info': file_info, - 'content': content, - 'body_content': body_content, - 'frontmatter': frontmatter, - 'wiki_links': wiki_links, - 'tags': tags, - 'title': title, - 'relative_path': relative_path, - 'content_hash': content_hash, - 'unique_identifier_hash': unique_identifier_hash, - }) + files_to_process.append( + { + "document": existing_document, + "is_new": False, + "file_info": file_info, + "content": content, + "body_content": body_content, + "frontmatter": frontmatter, + "wiki_links": wiki_links, + "tags": tags, + "title": title, + "relative_path": relative_path, + "content_hash": content_hash, + "unique_identifier_hash": unique_identifier_hash, + } + ) 
continue # Document doesn't exist by unique_identifier_hash @@ -445,20 +449,22 @@ async def index_obsidian_vault( session.add(document) new_documents_created = True - files_to_process.append({ - 'document': document, - 'is_new': True, - 'file_info': file_info, - 'content': content, - 'body_content': body_content, - 'frontmatter': frontmatter, - 'wiki_links': wiki_links, - 'tags': tags, - 'title': title, - 'relative_path': relative_path, - 'content_hash': content_hash, - 'unique_identifier_hash': unique_identifier_hash, - }) + files_to_process.append( + { + "document": document, + "is_new": True, + "file_info": file_info, + "content": content, + "body_content": body_content, + "frontmatter": frontmatter, + "wiki_links": wiki_links, + "tags": tags, + "title": title, + "relative_path": relative_path, + "content_hash": content_hash, + "unique_identifier_hash": unique_identifier_hash, + } + ) except Exception as e: logger.exception( @@ -469,7 +475,9 @@ async def index_obsidian_vault( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([f for f in files_to_process if f['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -491,22 +499,22 @@ async def index_obsidian_vault( await on_heartbeat_callback(indexed_count) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() await session.commit() # Extract data from item - title = item['title'] - relative_path = item['relative_path'] - content = item['content'] - body_content = item['body_content'] - frontmatter = item['frontmatter'] - wiki_links = item['wiki_links'] - tags = item['tags'] - 
content_hash = item['content_hash'] - file_info = item['file_info'] + title = item["title"] + relative_path = item["relative_path"] + content = item["content"] + body_content = item["body_content"] + frontmatter = item["frontmatter"] + wiki_links = item["wiki_links"] + tags = item["tags"] + content_hash = item["content_hash"] + file_info = item["file_info"] # Build metadata document_metadata = { @@ -584,7 +592,9 @@ async def index_obsidian_vault( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) failed_count += 1 continue @@ -592,9 +602,7 @@ async def index_obsidian_vault( await update_connector_last_indexed(session, connector, update_last_indexed) # Final commit for any remaining documents not yet committed in batches - logger.info( - f"Final commit: Total {indexed_count} Obsidian notes processed" - ) + logger.info(f"Final commit: Total {indexed_count} Obsidian notes processed") try: await session.commit() logger.info( diff --git a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py index 61faa39b3..111552fa6 100644 --- a/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/slack_indexer.py @@ -314,7 +314,9 @@ async def index_slack_messages( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() logger.info( f"Document for Slack message {msg_ts} in channel 
{channel_name} unchanged. Skipping." @@ -323,18 +325,20 @@ async def index_slack_messages( continue # Queue existing document for update (will be set to processing in Phase 2) - messages_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'combined_document_string': combined_document_string, - 'content_hash': content_hash, - 'channel_name': channel_name, - 'channel_id': channel_id, - 'msg_ts': msg_ts, - 'start_date': start_date_str, - 'end_date': end_date_str, - 'message_count': len(formatted_messages), - }) + messages_to_process.append( + { + "document": existing_document, + "is_new": False, + "combined_document_string": combined_document_string, + "content_hash": content_hash, + "channel_name": channel_name, + "channel_id": channel_id, + "msg_ts": msg_ts, + "start_date": start_date_str, + "end_date": end_date_str, + "message_count": len(formatted_messages), + } + ) continue # Document doesn't exist by unique_identifier_hash @@ -377,18 +381,20 @@ async def index_slack_messages( session.add(document) new_documents_created = True - messages_to_process.append({ - 'document': document, - 'is_new': True, - 'combined_document_string': combined_document_string, - 'content_hash': content_hash, - 'channel_name': channel_name, - 'channel_id': channel_id, - 'msg_ts': msg_ts, - 'start_date': start_date_str, - 'end_date': end_date_str, - 'message_count': len(formatted_messages), - }) + messages_to_process.append( + { + "document": document, + "is_new": True, + "combined_document_string": combined_document_string, + "content_hash": content_hash, + "channel_name": channel_name, + "channel_id": channel_id, + "msg_ts": msg_ts, + "start_date": start_date_str, + "end_date": end_date_str, + "message_count": len(formatted_messages), + } + ) logger.info( f"Phase 1: Collected {len(formatted_messages)} messages from channel {channel_name}" @@ -409,7 +415,9 @@ async def index_slack_messages( # Commit all pending documents - they all appear in UI now if 
new_documents_created: - logger.info(f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -426,29 +434,29 @@ async def index_slack_messages( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() await session.commit() # Heavy processing (embeddings, chunks) - chunks = await create_document_chunks(item['combined_document_string']) + chunks = await create_document_chunks(item["combined_document_string"]) doc_embedding = config.embedding_model_instance.embed( - item['combined_document_string'] + item["combined_document_string"] ) # Update document to READY with actual content - document.title = item['channel_name'] - document.content = item['combined_document_string'] - document.content_hash = item['content_hash'] + document.title = item["channel_name"] + document.content = item["combined_document_string"] + document.content_hash = item["content_hash"] document.embedding = doc_embedding document.document_metadata = { - "channel_name": item['channel_name'], - "channel_id": item['channel_id'], - "start_date": item['start_date'], - "end_date": item['end_date'], - "message_count": item['message_count'], + "channel_name": item["channel_name"], + "channel_id": item["channel_id"], + "start_date": item["start_date"], + "end_date": item["end_date"], + "message_count": item["message_count"], "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "connector_id": connector_id, } @@ -475,7 +483,9 @@ async def index_slack_messages( document.status = DocumentStatus.failed(str(e)) document.updated_at = 
get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue diff --git a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py index 27259fd6f..1b13a2c37 100644 --- a/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/teams_indexer.py @@ -332,25 +332,31 @@ async def index_teams_messages( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): - existing_document.status = DocumentStatus.ready() + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = ( + DocumentStatus.ready() + ) documents_skipped += 1 continue # Queue existing document for update (will be set to processing in Phase 2) - messages_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'combined_document_string': combined_document_string, - 'content_hash': content_hash, - 'team_name': team_name, - 'team_id': team_id, - 'channel_name': channel_name, - 'channel_id': channel_id, - 'message_id': message_id, - 'start_date': start_date_str, - 'end_date': end_date_str, - }) + messages_to_process.append( + { + "document": existing_document, + "is_new": False, + "combined_document_string": combined_document_string, + "content_hash": content_hash, + "team_name": team_name, + "team_id": team_id, + "channel_name": channel_name, + "channel_id": channel_id, + "message_id": message_id, + "start_date": start_date_str, + "end_date": end_date_str, + } + ) continue # Document doesn't exist by unique_identifier_hash @@ 
-400,19 +406,21 @@ async def index_teams_messages( session.add(document) new_documents_created = True - messages_to_process.append({ - 'document': document, - 'is_new': True, - 'combined_document_string': combined_document_string, - 'content_hash': content_hash, - 'team_name': team_name, - 'team_id': team_id, - 'channel_name': channel_name, - 'channel_id': channel_id, - 'message_id': message_id, - 'start_date': start_date_str, - 'end_date': end_date_str, - }) + messages_to_process.append( + { + "document": document, + "is_new": True, + "combined_document_string": combined_document_string, + "content_hash": content_hash, + "team_name": team_name, + "team_id": team_id, + "channel_name": channel_name, + "channel_id": channel_id, + "message_id": message_id, + "start_date": start_date_str, + "end_date": end_date_str, + } + ) except Exception as e: logger.error( @@ -432,7 +440,9 @@ async def index_teams_messages( # Commit all pending documents - they all appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([m for m in messages_to_process if m['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -449,30 +459,30 @@ async def index_teams_messages( await on_heartbeat_callback(documents_indexed) last_heartbeat_time = current_time - document = item['document'] + document = item["document"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only document.status = DocumentStatus.processing() await session.commit() # Heavy processing (embeddings, chunks) - chunks = await create_document_chunks(item['combined_document_string']) + chunks = await create_document_chunks(item["combined_document_string"]) doc_embedding = config.embedding_model_instance.embed( - item['combined_document_string'] + item["combined_document_string"] ) # 
Update document to READY with actual content document.title = f"{item['team_name']} - {item['channel_name']}" - document.content = item['combined_document_string'] - document.content_hash = item['content_hash'] + document.content = item["combined_document_string"] + document.content_hash = item["content_hash"] document.embedding = doc_embedding document.document_metadata = { - "team_name": item['team_name'], - "team_id": item['team_id'], - "channel_name": item['channel_name'], - "channel_id": item['channel_id'], - "start_date": item['start_date'], - "end_date": item['end_date'], + "team_name": item["team_name"], + "team_id": item["team_id"], + "channel_name": item["channel_name"], + "channel_id": item["channel_id"], + "start_date": item["start_date"], + "end_date": item["end_date"], "indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "connector_id": connector_id, } @@ -497,7 +507,9 @@ async def index_teams_messages( document.status = DocumentStatus.failed(str(e)) document.updated_at = get_current_timestamp() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue @@ -510,9 +522,7 @@ async def index_teams_messages( ) try: await session.commit() - logger.info( - "Successfully committed all Teams document changes to database" - ) + logger.info("Successfully committed all Teams document changes to database") except Exception as e: # Handle any remaining integrity errors gracefully (race conditions, etc.) 
if ( diff --git a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py index 5d25b4623..5b3fa02b0 100644 --- a/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/webcrawler_indexer.py @@ -184,22 +184,28 @@ async def index_crawled_urls( if existing_document: # Document exists - check if it's already being processed - if DocumentStatus.is_state(existing_document.status, DocumentStatus.PENDING): + if DocumentStatus.is_state( + existing_document.status, DocumentStatus.PENDING + ): logger.info(f"URL {url} already pending. Skipping.") documents_skipped += 1 continue - if DocumentStatus.is_state(existing_document.status, DocumentStatus.PROCESSING): + if DocumentStatus.is_state( + existing_document.status, DocumentStatus.PROCESSING + ): logger.info(f"URL {url} already processing. Skipping.") documents_skipped += 1 continue # Queue existing document for potential update check - urls_to_process.append({ - 'document': existing_document, - 'is_new': False, - 'url': url, - 'unique_identifier_hash': unique_identifier_hash, - }) + urls_to_process.append( + { + "document": existing_document, + "is_new": False, + "url": url, + "unique_identifier_hash": unique_identifier_hash, + } + ) continue # Create new document with PENDING status (visible in UI immediately) @@ -224,12 +230,14 @@ async def index_crawled_urls( session.add(document) new_documents_created = True - urls_to_process.append({ - 'document': document, - 'is_new': True, - 'url': url, - 'unique_identifier_hash': unique_identifier_hash, - }) + urls_to_process.append( + { + "document": document, + "is_new": True, + "url": url, + "unique_identifier_hash": unique_identifier_hash, + } + ) except Exception as e: logger.error(f"Error in Phase 1 for URL {url}: {e!s}", exc_info=True) @@ -238,7 +246,9 @@ async def index_crawled_urls( # Commit all pending documents - they all 
appear in UI now if new_documents_created: - logger.info(f"Phase 1: Committing {len([u for u in urls_to_process if u['is_new']])} pending documents") + logger.info( + f"Phase 1: Committing {len([u for u in urls_to_process if u['is_new']])} pending documents" + ) await session.commit() # ======================================================================= @@ -255,9 +265,9 @@ async def index_crawled_urls( await on_heartbeat_callback(documents_indexed + documents_updated) last_heartbeat_time = current_time - document = item['document'] - url = item['url'] - is_new = item['is_new'] + document = item["document"] + url = item["url"] + is_new = item["is_new"] try: # Set to PROCESSING and commit - shows "processing" in UI for THIS document only @@ -298,7 +308,9 @@ async def index_crawled_urls( continue # Format content as structured document for summary generation - structured_document = crawler.format_to_structured_document(crawl_result) + structured_document = crawler.format_to_structured_document( + crawl_result + ) # Generate content hash using a version WITHOUT metadata structured_document_for_hash = crawler.format_to_structured_document( @@ -339,7 +351,9 @@ async def index_crawled_urls( f"(existing document ID: {duplicate_by_content.id}). " f"Marking as failed." 
) - document.status = DocumentStatus.failed("Duplicate content exists") + document.status = DocumentStatus.failed( + "Duplicate content exists" + ) document.updated_at = get_current_timestamp() await session.commit() duplicate_content_count += 1 @@ -360,7 +374,10 @@ async def index_crawled_urls( "document_type": "Crawled URL", "crawler_type": crawler_type, } - summary_content, summary_embedding = await generate_document_summary( + ( + summary_content, + summary_embedding, + ) = await generate_document_summary( structured_document, user_llm, document_metadata_for_summary ) else: @@ -423,7 +440,9 @@ async def index_crawled_urls( document.updated_at = get_current_timestamp() await session.commit() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) documents_failed += 1 continue @@ -438,7 +457,9 @@ async def index_crawled_urls( ) try: await session.commit() - logger.info("Successfully committed all webcrawler document changes to database") + logger.info( + "Successfully committed all webcrawler document changes to database" + ) except Exception as e: # Handle any remaining integrity errors gracefully if "duplicate key value violates unique constraint" in str(e).lower(): diff --git a/surfsense_backend/app/tasks/document_processors/base.py b/surfsense_backend/app/tasks/document_processors/base.py index c8046868c..2047ec63d 100644 --- a/surfsense_backend/app/tasks/document_processors/base.py +++ b/surfsense_backend/app/tasks/document_processors/base.py @@ -17,29 +17,30 @@ md = MarkdownifyTransformer() def safe_set_chunks(document: Document, chunks: list) -> None: """ Safely assign chunks to a document without triggering lazy loading. - + ALWAYS use this instead of `document.chunks = chunks` to avoid SQLAlchemy async errors (MissingGreenlet / greenlet_spawn). 
- + Why this is needed: - Direct assignment `document.chunks = chunks` triggers SQLAlchemy to load the OLD chunks first (for comparison/orphan detection) - This lazy loading fails in async context with asyncpg driver - set_committed_value bypasses this by setting the value directly - + This function is safe regardless of how the document was loaded (with or without selectinload). - + Args: document: The Document object to update chunks: List of Chunk objects to assign - + Example: # Instead of: document.chunks = chunks (DANGEROUS!) safe_set_chunks(document, chunks) # Always safe """ from sqlalchemy.orm.attributes import set_committed_value - set_committed_value(document, 'chunks', chunks) + + set_committed_value(document, "chunks", chunks) def get_current_timestamp() -> datetime: diff --git a/surfsense_backend/app/tasks/document_processors/circleback_processor.py b/surfsense_backend/app/tasks/document_processors/circleback_processor.py index e9c395c83..a513bcaf0 100644 --- a/surfsense_backend/app/tasks/document_processors/circleback_processor.py +++ b/surfsense_backend/app/tasks/document_processors/circleback_processor.py @@ -91,7 +91,9 @@ async def add_circleback_meeting_document( # Document exists - check if content has changed if existing_document.content_hash == content_hash: # Ensure status is ready (might have been stuck in processing/pending) - if not DocumentStatus.is_state(existing_document.status, DocumentStatus.READY): + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): existing_document.status = DocumentStatus.ready() await session.commit() logger.info(f"Circleback meeting {meeting_id} unchanged. 
Skipping.") @@ -110,7 +112,7 @@ async def add_circleback_meeting_document( # PHASE 1: Create document with PENDING status # This makes the document visible in the UI immediately # ======================================================================= - + # Fetch the user who set up the Circleback connector (preferred) # or fall back to search space owner if no connector found created_by_user_id = None @@ -173,7 +175,7 @@ async def add_circleback_meeting_document( # ======================================================================= # PHASE 3: Process the document content # ======================================================================= - + # Get LLM for generating summary llm = await get_document_summary_llm(session, search_space_id) if not llm: @@ -243,7 +245,7 @@ async def add_circleback_meeting_document( await session.commit() await session.refresh(document) - + if existing_document: logger.info( f"Updated Circleback meeting document {meeting_id} in search space {search_space_id}" @@ -267,7 +269,9 @@ async def add_circleback_meeting_document( document.updated_at = get_current_timestamp() await session.commit() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) raise db_error except Exception as e: await session.rollback() @@ -279,5 +283,7 @@ async def add_circleback_meeting_document( document.updated_at = get_current_timestamp() await session.commit() except Exception as status_error: - logger.error(f"Failed to update document status to failed: {status_error}") + logger.error( + f"Failed to update document status to failed: {status_error}" + ) raise RuntimeError(f"Failed to process Circleback meeting: {e!s}") from e diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index e14dc3f42..3fa57e998 100644 --- 
a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -1629,16 +1629,16 @@ async def process_file_in_background_with_document( ) -> Document | None: """ Process file and update existing pending document (2-phase pattern). - + This function is Phase 2 of the real-time document status updates: - Phase 1 (API): Created document with pending status - Phase 2 (this): Process file and update document to ready/failed - + The document already exists with pending status. This function: 1. Parses the file content (markdown, audio, or ETL services) 2. Updates the document with content, embeddings, and chunks 3. Sets status to 'ready' on success - + Args: document: Existing document with pending status file_path: Path to the uploaded file @@ -1650,7 +1650,7 @@ async def process_file_in_background_with_document( log_entry: Log entry for this task connector: Optional connector info for Google Drive files notification: Optional notification for progress updates - + Returns: Updated Document object if successful, None if duplicate content detected """ @@ -1665,13 +1665,18 @@ async def process_file_in_background_with_document( etl_service = None # ===== STEP 1: Parse file content based on type ===== - + # Check if the file is a markdown or text file if filename.lower().endswith((".md", ".markdown", ".txt")): # Update notification: parsing stage if notification: - await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="parsing", stage_message="Reading file" + await ( + NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Reading file", + ) ) await task_logger.log_task_progress( @@ -1695,8 +1700,13 @@ async def process_file_in_background_with_document( ): # Update notification: parsing stage (transcription) if notification: - await 
NotificationService.document_processing.notify_processing_progress( - session, notification, stage="parsing", stage_message="Transcribing audio" + await ( + NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Transcribing audio", + ) ) await task_logger.log_task_progress( @@ -1708,7 +1718,8 @@ async def process_file_in_background_with_document( # Transcribe audio stt_service_type = ( "local" - if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") + if app_config.STT_SERVICE + and app_config.STT_SERVICE.startswith("local/") else "external" ) @@ -1719,7 +1730,9 @@ async def process_file_in_background_with_document( transcribed_text = result.get("text", "") if not transcribed_text: raise ValueError("Transcription returned empty text") - markdown_content = f"# Transcription of {filename}\n\n{transcribed_text}" + markdown_content = ( + f"# Transcription of {filename}\n\n{transcribed_text}" + ) else: with open(file_path, "rb") as audio_file: transcription_kwargs = { @@ -1728,12 +1741,18 @@ async def process_file_in_background_with_document( "api_key": app_config.STT_SERVICE_API_KEY, } if app_config.STT_SERVICE_API_BASE: - transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE - transcription_response = await atranscription(**transcription_kwargs) + transcription_kwargs["api_base"] = ( + app_config.STT_SERVICE_API_BASE + ) + transcription_response = await atranscription( + **transcription_kwargs + ) transcribed_text = transcription_response.get("text", "") if not transcribed_text: raise ValueError("Transcription returned empty text") - markdown_content = f"# Transcription of {filename}\n\n{transcribed_text}" + markdown_content = ( + f"# Transcription of {filename}\n\n{transcribed_text}" + ) etl_service = "AUDIO_TRANSCRIPTION" # Clean up temp file @@ -1742,13 +1761,18 @@ async def process_file_in_background_with_document( else: # Document files - use ETL service - 
from app.services.page_limit_service import PageLimitExceededError, PageLimitService + from app.services.page_limit_service import ( + PageLimitExceededError, + PageLimitService, + ) page_limit_service = PageLimitService(session) # Estimate page count try: - estimated_pages = page_limit_service.estimate_pages_before_processing(file_path) + estimated_pages = page_limit_service.estimate_pages_before_processing( + file_path + ) except Exception: file_size = os.path.getsize(file_path) estimated_pages = max(1, file_size // (80 * 1024)) @@ -1759,14 +1783,22 @@ async def process_file_in_background_with_document( if app_config.ETL_SERVICE == "UNSTRUCTURED": if notification: await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="parsing", stage_message="Extracting content" + session, + notification, + stage="parsing", + stage_message="Extracting content", ) from langchain_unstructured import UnstructuredLoader loader = UnstructuredLoader( - file_path, mode="elements", post_processors=[], languages=["eng"], - include_orig_elements=False, include_metadata=False, strategy="auto" + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", ) docs = await loader.aload() markdown_content = await convert_document_to_markdown(docs) @@ -1775,37 +1807,55 @@ async def process_file_in_background_with_document( etl_service = "UNSTRUCTURED" # Update page usage - await page_limit_service.update_page_usage(user_id, final_page_count, allow_exceed=True) + await page_limit_service.update_page_usage( + user_id, final_page_count, allow_exceed=True + ) elif app_config.ETL_SERVICE == "LLAMACLOUD": if notification: await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="parsing", stage_message="Extracting content" + session, + notification, + stage="parsing", + stage_message="Extracting content", ) result = 
await parse_with_llamacloud_retry( - file_path=file_path, estimated_pages=estimated_pages, - task_logger=task_logger, log_entry=log_entry + file_path=file_path, + estimated_pages=estimated_pages, + task_logger=task_logger, + log_entry=log_entry, + ) + markdown_documents = await result.aget_markdown_documents( + split_by_page=False ) - markdown_documents = await result.aget_markdown_documents(split_by_page=False) if not markdown_documents: - raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}") + raise RuntimeError( + f"LlamaCloud parsing returned no documents: {filename}" + ) markdown_content = markdown_documents[0].text etl_service = "LLAMACLOUD" # Update page usage - await page_limit_service.update_page_usage(user_id, estimated_pages, allow_exceed=True) + await page_limit_service.update_page_usage( + user_id, estimated_pages, allow_exceed=True + ) elif app_config.ETL_SERVICE == "DOCLING": if notification: await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="parsing", stage_message="Extracting content" + session, + notification, + stage="parsing", + stage_message="Extracting content", ) # Suppress logging during Docling import getLogger("docling.pipeline.base_pipeline").setLevel(ERROR) getLogger("docling.document_converter").setLevel(ERROR) - getLogger("docling_core.transforms.chunker.hierarchical_chunker").setLevel(ERROR) + getLogger( + "docling_core.transforms.chunker.hierarchical_chunker" + ).setLevel(ERROR) from docling.document_converter import DocumentConverter @@ -1815,7 +1865,9 @@ async def process_file_in_background_with_document( etl_service = "DOCLING" # Update page usage - await page_limit_service.update_page_usage(user_id, estimated_pages, allow_exceed=True) + await page_limit_service.update_page_usage( + user_id, estimated_pages, allow_exceed=True + ) else: raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") @@ -1829,7 +1881,7 @@ async def 
process_file_in_background_with_document( # ===== STEP 2: Check for duplicate content ===== content_hash = generate_content_hash(markdown_content, search_space_id) - + existing_by_content = await check_duplicate_document(session, content_hash) if existing_by_content and existing_by_content.id != document.id: # Duplicate content found - mark this document as failed @@ -1846,7 +1898,7 @@ async def process_file_in_background_with_document( ) user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - + if user_llm: document_metadata = { "file_name": filename, @@ -1881,10 +1933,10 @@ async def process_file_in_background_with_document( **(document.document_metadata or {}), } flag_modified(document, "document_metadata") - + # Use safe_set_chunks to avoid async issues safe_set_chunks(document, chunks) - + document.blocknote_document = blocknote_json document.content_needs_reindexing = False document.updated_at = get_current_timestamp() @@ -1922,7 +1974,11 @@ async def process_file_in_background_with_document( log_entry, error_message, str(e), - {"error_type": type(e).__name__, "filename": filename, "document_id": document.id}, + { + "error_type": type(e).__name__, + "filename": filename, + "document_id": document.id, + }, ) logging.error(f"Error processing file with document: {error_message}") raise diff --git a/surfsense_backend/app/tasks/document_processors/youtube_processor.py b/surfsense_backend/app/tasks/document_processors/youtube_processor.py index 19092b592..e83d7c855 100644 --- a/surfsense_backend/app/tasks/document_processors/youtube_processor.py +++ b/surfsense_backend/app/tasks/document_processors/youtube_processor.py @@ -136,11 +136,19 @@ async def add_youtube_video_document( document = existing_document is_new_document = False # Check if already being processed - if DocumentStatus.is_state(existing_document.status, DocumentStatus.PENDING): - logging.info(f"YouTube video {video_id} already pending. 
Returning existing.") + if DocumentStatus.is_state( + existing_document.status, DocumentStatus.PENDING + ): + logging.info( + f"YouTube video {video_id} already pending. Returning existing." + ) return existing_document - if DocumentStatus.is_state(existing_document.status, DocumentStatus.PROCESSING): - logging.info(f"YouTube video {video_id} already processing. Returning existing.") + if DocumentStatus.is_state( + existing_document.status, DocumentStatus.PROCESSING + ): + logging.info( + f"YouTube video {video_id} already processing. Returning existing." + ) return existing_document else: # Create new document with PENDING status (visible in UI immediately) @@ -300,7 +308,9 @@ async def add_youtube_video_document( "video_id": video_id, }, ) - logging.info(f"Document for YouTube video {video_id} unchanged. Marking as ready.") + logging.info( + f"Document for YouTube video {video_id} unchanged. Marking as ready." + ) document.status = DocumentStatus.ready() await session.commit() return document @@ -408,7 +418,9 @@ async def add_youtube_video_document( # Mark document as failed if it exists if document: try: - document.status = DocumentStatus.failed(f"Database error: {str(db_error)[:150]}") + document.status = DocumentStatus.failed( + f"Database error: {str(db_error)[:150]}" + ) document.updated_at = get_current_timestamp() await session.commit() except Exception: diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx index 2bba85085..b214c96be 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon.tsx @@ -38,7 +38,9 @@ export function DocumentTypeChip({ type, className }: { type: string; className? 
className={`inline-flex items-center gap-1.5 rounded bg-muted/40 px-2 py-1 text-xs text-muted-foreground max-w-full overflow-hidden ${className ?? ""}`} > {icon} - {fullLabel} + + {fullLabel} + ); diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx index 028f38098..6bd5f8460 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx @@ -68,9 +68,7 @@ export function DocumentsFilters({ const filteredTypes = useMemo(() => { if (!typeSearchQuery.trim()) return uniqueTypes; const query = typeSearchQuery.toLowerCase(); - return uniqueTypes.filter((type) => - getDocumentTypeLabel(type).toLowerCase().includes(query) - ); + return uniqueTypes.filter((type) => getDocumentTypeLabel(type).toLowerCase().includes(query)); }, [uniqueTypes, typeSearchQuery]); const typeCounts = useMemo(() => { @@ -156,94 +154,95 @@ export function DocumentsFilters({ {/* Filter Buttons Group */}
- {/* Type Filter */} - - - - - -
- {/* Search input */} -
-
- - setTypeSearchQuery(e.target.value)} - className="h-6 pl-6 text-sm bg-transparent border-0 focus-visible:ring-0" - /> -
-
- -
- {filteredTypes.length === 0 ? ( -
- No types found + {/* Type Filter */} + + + + + +
+ {/* Search input */} +
+
+ + setTypeSearchQuery(e.target.value)} + className="h-6 pl-6 text-sm bg-transparent border-0 focus-visible:ring-0" + />
- ) : ( - filteredTypes.map((value: DocumentTypeEnum, i) => ( -
+ +
+ {filteredTypes.length === 0 ? ( +
+ No types found +
+ ) : ( + filteredTypes.map((value: DocumentTypeEnum, i) => ( + + )) + )} +
+ {activeTypes.length > 0 && ( +
+ - )) + Clear filters + +
)}
- {activeTypes.length > 0 && ( -
- -
- )} -
- - + + {/* Bulk Delete Button */} {selectedIds.size > 0 && ( @@ -255,22 +254,14 @@ export function DocumentsFilters({ exit={{ opacity: 0, scale: 0.9 }} > {/* Mobile: icon with count */} - {/* Desktop: full button */} -
- Delete {selectedIds.size} document{selectedIds.size !== 1 ? "s" : ""}? + + Delete {selectedIds.size} document{selectedIds.size !== 1 ? "s" : ""}? + - This action cannot be undone. This will permanently delete the selected {selectedIds.size === 1 ? "document" : "documents"} from your search space. + This action cannot be undone. This will permanently delete the selected{" "} + {selectedIds.size === 1 ? "document" : "documents"} from your search space.
diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index fb0d72fae..d5ee00dfb 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -1,7 +1,20 @@ "use client"; import { formatDistanceToNow } from "date-fns"; -import { AlertCircle, Calendar, CheckCircle2, ChevronDown, ChevronUp, Clock, FileText, FileX, Loader2, Network, Plus, User } from "lucide-react"; +import { + AlertCircle, + Calendar, + CheckCircle2, + ChevronDown, + ChevronUp, + Clock, + FileText, + FileX, + Loader2, + Network, + Plus, + User, +} from "lucide-react"; import { motion } from "motion/react"; import { useTranslations } from "next-intl"; import React, { useRef, useState, useEffect, useCallback } from "react"; @@ -10,12 +23,7 @@ import { JsonMetadataViewer } from "@/components/json-metadata-viewer"; import { MarkdownViewer } from "@/components/markdown-viewer"; import { Button } from "@/components/ui/button"; import { Checkbox } from "@/components/ui/checkbox"; -import { - Dialog, - DialogContent, - DialogHeader, - DialogTitle, -} from "@/components/ui/dialog"; +import { Dialog, DialogContent, DialogHeader, DialogTitle } from "@/components/ui/dialog"; import { Skeleton } from "@/components/ui/skeleton"; import { Spinner } from "@/components/ui/spinner"; import { @@ -35,7 +43,7 @@ import type { ColumnVisibility, Document, DocumentStatus } from "./types"; // Status indicator component for document processing status function StatusIndicator({ status }: { status?: DocumentStatus }) { const state = status?.state ?? "ready"; - + switch (state) { case "pending": return ( @@ -176,12 +184,10 @@ function SortableHeader({ > {icon && {icon}} {children} - - {isActive && sortDesc ? 
( - - ) : ( - - )} + + {isActive && sortDesc ? : } ); @@ -300,8 +306,10 @@ export function DocumentsTableShell({ // Only consider selectable documents for "select all" logic const selectableDocs = sorted.filter(isSelectable); - const allSelectedOnPage = selectableDocs.length > 0 && selectableDocs.every((d) => selectedIds.has(d.id)); - const someSelectedOnPage = selectableDocs.some((d) => selectedIds.has(d.id)) && !allSelectedOnPage; + const allSelectedOnPage = + selectableDocs.length > 0 && selectableDocs.every((d) => selectedIds.has(d.id)); + const someSelectedOnPage = + selectableDocs.some((d) => selectedIds.has(d.id)) && !allSelectedOnPage; const toggleAll = (checked: boolean) => { const next = new Set(selectedIds); @@ -388,10 +396,7 @@ export function DocumentsTableShell({
- + {columnVisibility.document_type && ( @@ -429,24 +434,15 @@ export function DocumentsTableShell({
- +
- {columnVisibility.created_by && ( - - )} - {columnVisibility.created_at && ( - - )} + {columnVisibility.created_by && } + {columnVisibility.created_at && }
- {columnVisibility.status && ( - - )} + {columnVisibility.status && }
@@ -549,9 +545,7 @@ export function DocumentsTableShell({ )} {columnVisibility.status && ( - - Status - + Status )} @@ -580,9 +574,7 @@ export function DocumentsTableShell({ }, }} className={`border-b border-border/40 transition-colors ${ - isSelected - ? "bg-primary/5 hover:bg-primary/8" - : "hover:bg-muted/30" + isSelected ? "bg-primary/5 hover:bg-primary/8" : "hover:bg-muted/30" }`} > @@ -591,7 +583,9 @@ export function DocumentsTableShell({ checked={isSelected} onCheckedChange={(v) => canSelect && toggleOne(doc.id, !!v)} disabled={!canSelect} - aria-label={canSelect ? "Select row" : "Cannot select while processing"} + aria-label={ + canSelect ? "Select row" : "Cannot select while processing" + } className={`border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary ${!canSelect ? "opacity-40 cursor-not-allowed" : ""}`} />
@@ -639,7 +633,9 @@ export function DocumentsTableShell({ - {formatRelativeDate(doc.created_at)} + + {formatRelativeDate(doc.created_at)} + {formatAbsoluteDate(doc.created_at)} @@ -720,9 +716,7 @@ export function DocumentsTableShell({
{columnVisibility.created_by && doc.created_by_name && ( - - {doc.created_by_name} - + {doc.created_by_name} )} {columnVisibility.created_at && ( diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx index 4f23693ad..ec355f576 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/RowActions.tsx @@ -46,7 +46,8 @@ export function RowActions({ ); // Documents in "pending" or "processing" state should show disabled delete - const isBeingProcessed = document.status?.state === "pending" || document.status?.state === "processing"; + const isBeingProcessed = + document.status?.state === "pending" || document.status?.state === "processing"; // SURFSENSE_DOCS are system-managed and should not show delete at all const shouldShowDelete = !NON_DELETABLE_DOCUMENT_TYPES.includes( @@ -67,8 +68,9 @@ export function RowActions({ } catch (error: unknown) { console.error("Error deleting document:", error); // Check for 409 Conflict (document started processing after UI loaded) - const status = (error as { response?: { status?: number } })?.response?.status - ?? (error as { status?: number })?.status; + const status = + (error as { response?: { status?: number } })?.response?.status ?? + (error as { status?: number })?.status; if (status === 409) { toast.error("Document is now being processed. Please try again later."); } else { @@ -92,7 +94,11 @@ export function RowActions({ // Editable documents: show 3-dot dropdown with edit + delete - @@ -101,7 +107,9 @@ export function RowActions({ !isEditDisabled && handleEdit()} disabled={isEditDisabled} - className={isEditDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : ""} + className={ + isEditDisabled ? 
"text-muted-foreground cursor-not-allowed opacity-50" : "" + } > Edit @@ -110,7 +118,11 @@ export function RowActions({ !isDeleteDisabled && setIsDeleteOpen(true)} disabled={isDeleteDisabled} - className={isDeleteDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : "text-destructive focus:text-destructive"} + className={ + isDeleteDisabled + ? "text-muted-foreground cursor-not-allowed opacity-50" + : "text-destructive focus:text-destructive" + } > Delete @@ -150,7 +162,9 @@ export function RowActions({ !isEditDisabled && handleEdit()} disabled={isEditDisabled} - className={isEditDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : ""} + className={ + isEditDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : "" + } > Edit @@ -159,7 +173,11 @@ export function RowActions({ !isDeleteDisabled && setIsDeleteOpen(true)} disabled={isDeleteDisabled} - className={isDeleteDisabled ? "text-muted-foreground cursor-not-allowed opacity-50" : "text-destructive focus:text-destructive"} + className={ + isDeleteDisabled + ? "text-muted-foreground cursor-not-allowed opacity-50" + : "text-destructive focus:text-destructive" + } > Delete diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx index b85b334d7..8cf2fe8da 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/page.tsx @@ -116,13 +116,15 @@ export default function DocumentsTable() { created_by_id: item.created_by_id ?? null, created_by_name: item.created_by_name ?? null, created_at: item.created_at, - status: (item as { status?: { state: "ready" | "pending" | "processing" | "failed"; reason?: string } }).status ?? { state: "ready" as const }, + status: ( + item as { + status?: { state: "ready" | "pending" | "processing" | "failed"; reason?: string }; + } + ).status ?? 
{ state: "ready" as const }, })) : paginatedRealtimeDocuments; - const displayTotal = isSearchMode - ? searchResponse?.total || 0 - : sortedRealtimeDocuments.length; + const displayTotal = isSearchMode ? searchResponse?.total || 0 : sortedRealtimeDocuments.length; const loading = isSearchMode ? isSearchLoading : realtimeLoading; const error = isSearchMode ? searchError : realtimeError; @@ -149,13 +151,13 @@ export default function DocumentsTable() { // Filter out pending/processing documents - they cannot be deleted // For real-time mode, use sortedRealtimeDocuments (which has status) // For search mode, use searchResponse items (need to safely access status) - const allDocs = isSearchMode - ? (searchResponse?.items || []).map(item => ({ - id: item.id, - status: (item as { status?: { state: string } }).status, - })) - : sortedRealtimeDocuments.map(doc => ({ id: doc.id, status: doc.status })); - + const allDocs = isSearchMode + ? (searchResponse?.items || []).map((item) => ({ + id: item.id, + status: (item as { status?: { state: string } }).status, + })) + : sortedRealtimeDocuments.map((doc) => ({ id: doc.id, status: doc.status })); + const selectedDocs = allDocs.filter((doc) => selectedIds.has(doc.id)); const deletableIds = selectedDocs .filter((doc) => doc.status?.state !== "pending" && doc.status?.state !== "processing") @@ -163,7 +165,9 @@ export default function DocumentsTable() { const inProgressCount = selectedIds.size - deletableIds.length; if (inProgressCount > 0) { - toast.warning(`${inProgressCount} document(s) are pending or processing and cannot be deleted.`); + toast.warning( + `${inProgressCount} document(s) are pending or processing and cannot be deleted.` + ); } if (deletableIds.length === 0) { @@ -180,8 +184,9 @@ export default function DocumentsTable() { await deleteDocumentMutation({ id }); return true; } catch (error: unknown) { - const status = (error as { response?: { status?: number } })?.response?.status - ?? 
(error as { status?: number })?.status; + const status = + (error as { response?: { status?: number } })?.response?.status ?? + (error as { status?: number })?.status; if (status === 409) conflictCount++; return false; } @@ -195,13 +200,13 @@ export default function DocumentsTable() { } else { toast.error(t("delete_partial_failed")); } - + // If in search mode, refetch search results to reflect deletion if (isSearchMode) { await refetchSearch(); } // Real-time mode: Electric will sync the deletion automatically - + setSelectedIds(new Set()); } catch (e) { console.error(e); @@ -210,21 +215,24 @@ export default function DocumentsTable() { }; // Single document delete handler for RowActions - const handleDeleteDocument = useCallback(async (id: number): Promise => { - try { - await deleteDocumentMutation({ id }); - toast.success(t("delete_success") || "Document deleted"); - // If in search mode, refetch search results to reflect deletion - if (isSearchMode) { - await refetchSearch(); + const handleDeleteDocument = useCallback( + async (id: number): Promise => { + try { + await deleteDocumentMutation({ id }); + toast.success(t("delete_success") || "Document deleted"); + // If in search mode, refetch search results to reflect deletion + if (isSearchMode) { + await refetchSearch(); + } + // Real-time mode: Electric will sync the deletion automatically + return true; + } catch (e) { + console.error("Error deleting document:", e); + return false; } - // Real-time mode: Electric will sync the deletion automatically - return true; - } catch (e) { - console.error("Error deleting document:", e); - return false; - } - }, [deleteDocumentMutation, isSearchMode, refetchSearch, t]); + }, + [deleteDocumentMutation, isSearchMode, refetchSearch, t] + ); const handleSortChange = useCallback((key: SortKey) => { setSortKey((currentKey) => { diff --git a/surfsense_web/atoms/connector-dialog/connector-dialog.atoms.ts b/surfsense_web/atoms/connector-dialog/connector-dialog.atoms.ts index 
38205a8d2..cbdf17244 100644 --- a/surfsense_web/atoms/connector-dialog/connector-dialog.atoms.ts +++ b/surfsense_web/atoms/connector-dialog/connector-dialog.atoms.ts @@ -2,4 +2,3 @@ import { atom } from "jotai"; // Atom to control the connector dialog open state from anywhere in the app export const connectorDialogOpenAtom = atom(false); - diff --git a/surfsense_web/components/assistant-ui/connector-popup.tsx b/surfsense_web/components/assistant-ui/connector-popup.tsx index ec8399198..e597770ee 100644 --- a/surfsense_web/components/assistant-ui/connector-popup.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup.tsx @@ -191,7 +191,9 @@ export const ConnectorIndicator: FC<{ hideTrigger?: boolean }> = ({ hideTrigger {!hideTrigger && ( { const connectorId = parseInt(params.connectorId, 10); newConnector = result.data.find((c: SearchSourceConnector) => c.id === connectorId); - // If we found the connector, find the matching OAuth/Composio connector by type - if (newConnector) { - const connectorType = newConnector.connector_type; - oauthConnector = - OAUTH_CONNECTORS.find((c) => c.connectorType === connectorType) || - COMPOSIO_CONNECTORS.find((c) => c.connectorType === connectorType); - } + // If we found the connector, find the matching OAuth/Composio connector by type + if (newConnector) { + const connectorType = newConnector.connector_type; + oauthConnector = + OAUTH_CONNECTORS.find((c) => c.connectorType === connectorType) || + COMPOSIO_CONNECTORS.find((c) => c.connectorType === connectorType); + } } // If we don't have a connector yet, try to find by connector param @@ -361,12 +361,12 @@ export const useConnectorDialog = () => { OAUTH_CONNECTORS.find((c) => c.id === params.connector) || COMPOSIO_CONNECTORS.find((c) => c.id === params.connector); - if (oauthConnector) { - const oauthConnectorType = oauthConnector.connectorType; - newConnector = result.data.find( - (c: SearchSourceConnector) => c.connector_type === oauthConnectorType - ); - } + if 
(oauthConnector) { + const oauthConnectorType = oauthConnector.connectorType; + newConnector = result.data.find( + (c: SearchSourceConnector) => c.connector_type === oauthConnectorType + ); + } } if (newConnector && oauthConnector) { @@ -679,11 +679,11 @@ export const useConnectorDialog = () => { }, }); - const successMessage = - currentConnectorType === "MCP_CONNECTOR" - ? `${connector.name} added successfully` - : `${connectorTitle} connected and syncing started!`; - toast.success(successMessage); + const successMessage = + currentConnectorType === "MCP_CONNECTOR" + ? `${connector.name} added successfully` + : `${connectorTitle} connected and syncing started!`; + toast.success(successMessage); const url = new URL(window.location.href); url.searchParams.delete("modal"); diff --git a/surfsense_web/components/theme/theme-toggle.tsx b/surfsense_web/components/theme/theme-toggle.tsx index 382d11087..b9b23656b 100644 --- a/surfsense_web/components/theme/theme-toggle.tsx +++ b/surfsense_web/components/theme/theme-toggle.tsx @@ -8,172 +8,167 @@ import { cn } from "@/lib/utils"; // /////////////////////////////////////////////////////////////////////////// // Types -export type AnimationVariant = - | "circle" - | "rectangle" - | "gif" - | "polygon" - | "circle-blur"; +export type AnimationVariant = "circle" | "rectangle" | "gif" | "polygon" | "circle-blur"; export type AnimationStart = - | "top-left" - | "top-right" - | "bottom-left" - | "bottom-right" - | "center" - | "top-center" - | "bottom-center" - | "bottom-up" - | "top-down" - | "left-right" - | "right-left"; + | "top-left" + | "top-right" + | "bottom-left" + | "bottom-right" + | "center" + | "top-center" + | "bottom-center" + | "bottom-up" + | "top-down" + | "left-right" + | "right-left"; interface Animation { - name: string; - css: string; + name: string; + css: string; } // /////////////////////////////////////////////////////////////////////////// // Helper functions const getPositionCoords = (position: 
AnimationStart) => { - switch (position) { - case "top-left": - return { cx: "0", cy: "0" }; - case "top-right": - return { cx: "40", cy: "0" }; - case "bottom-left": - return { cx: "0", cy: "40" }; - case "bottom-right": - return { cx: "40", cy: "40" }; - case "top-center": - return { cx: "20", cy: "0" }; - case "bottom-center": - return { cx: "20", cy: "40" }; - case "bottom-up": - case "top-down": - case "left-right": - case "right-left": - return { cx: "20", cy: "20" }; - } + switch (position) { + case "top-left": + return { cx: "0", cy: "0" }; + case "top-right": + return { cx: "40", cy: "0" }; + case "bottom-left": + return { cx: "0", cy: "40" }; + case "bottom-right": + return { cx: "40", cy: "40" }; + case "top-center": + return { cx: "20", cy: "0" }; + case "bottom-center": + return { cx: "20", cy: "40" }; + case "bottom-up": + case "top-down": + case "left-right": + case "right-left": + return { cx: "20", cy: "20" }; + } }; const generateSVG = (variant: AnimationVariant, start: AnimationStart) => { - if (variant === "circle-blur") { - if (start === "center") { - return `data:image/svg+xml,`; - } - const positionCoords = getPositionCoords(start); - if (!positionCoords) { - throw new Error(`Invalid start position: ${start}`); - } - const { cx, cy } = positionCoords; - return `data:image/svg+xml,`; - } + if (variant === "circle-blur") { + if (start === "center") { + return `data:image/svg+xml,`; + } + const positionCoords = getPositionCoords(start); + if (!positionCoords) { + throw new Error(`Invalid start position: ${start}`); + } + const { cx, cy } = positionCoords; + return `data:image/svg+xml,`; + } - if (start === "center") return; + if (start === "center") return; - if (variant === "rectangle") return ""; + if (variant === "rectangle") return ""; - const positionCoords = getPositionCoords(start); - if (!positionCoords) { - throw new Error(`Invalid start position: ${start}`); - } - const { cx, cy } = positionCoords; + const positionCoords = 
getPositionCoords(start); + if (!positionCoords) { + throw new Error(`Invalid start position: ${start}`); + } + const { cx, cy } = positionCoords; - if (variant === "circle") { - return `data:image/svg+xml,`; - } + if (variant === "circle") { + return `data:image/svg+xml,`; + } - return ""; + return ""; }; const getTransformOrigin = (start: AnimationStart) => { - switch (start) { - case "top-left": - return "top left"; - case "top-right": - return "top right"; - case "bottom-left": - return "bottom left"; - case "bottom-right": - return "bottom right"; - case "top-center": - return "top center"; - case "bottom-center": - return "bottom center"; - case "bottom-up": - case "top-down": - case "left-right": - case "right-left": - return "center"; - } + switch (start) { + case "top-left": + return "top left"; + case "top-right": + return "top right"; + case "bottom-left": + return "bottom left"; + case "bottom-right": + return "bottom right"; + case "top-center": + return "top center"; + case "bottom-center": + return "bottom center"; + case "bottom-up": + case "top-down": + case "left-right": + case "right-left": + return "center"; + } }; export const createAnimation = ( - variant: AnimationVariant, - start: AnimationStart = "center", - blur = false, - url?: string, + variant: AnimationVariant, + start: AnimationStart = "center", + blur = false, + url?: string ): Animation => { - const svg = generateSVG(variant, start); - const transformOrigin = getTransformOrigin(start); + const svg = generateSVG(variant, start); + const transformOrigin = getTransformOrigin(start); - if (variant === "rectangle") { - const getClipPath = (direction: AnimationStart) => { - switch (direction) { - case "bottom-up": - return { - from: "polygon(0% 100%, 100% 100%, 100% 100%, 0% 100%)", - to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", - }; - case "top-down": - return { - from: "polygon(0% 0%, 100% 0%, 100% 0%, 0% 0%)", - to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", - }; - case 
"left-right": - return { - from: "polygon(0% 0%, 0% 0%, 0% 100%, 0% 100%)", - to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", - }; - case "right-left": - return { - from: "polygon(100% 0%, 100% 0%, 100% 100%, 100% 100%)", - to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", - }; - case "top-left": - return { - from: "polygon(0% 0%, 0% 0%, 0% 0%, 0% 0%)", - to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", - }; - case "top-right": - return { - from: "polygon(100% 0%, 100% 0%, 100% 0%, 100% 0%)", - to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", - }; - case "bottom-left": - return { - from: "polygon(0% 100%, 0% 100%, 0% 100%, 0% 100%)", - to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", - }; - case "bottom-right": - return { - from: "polygon(100% 100%, 100% 100%, 100% 100%, 100% 100%)", - to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", - }; - default: - return { - from: "polygon(0% 100%, 100% 100%, 100% 100%, 0% 100%)", - to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", - }; - } - }; + if (variant === "rectangle") { + const getClipPath = (direction: AnimationStart) => { + switch (direction) { + case "bottom-up": + return { + from: "polygon(0% 100%, 100% 100%, 100% 100%, 0% 100%)", + to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", + }; + case "top-down": + return { + from: "polygon(0% 0%, 100% 0%, 100% 0%, 0% 0%)", + to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", + }; + case "left-right": + return { + from: "polygon(0% 0%, 0% 0%, 0% 100%, 0% 100%)", + to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", + }; + case "right-left": + return { + from: "polygon(100% 0%, 100% 0%, 100% 100%, 100% 100%)", + to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", + }; + case "top-left": + return { + from: "polygon(0% 0%, 0% 0%, 0% 0%, 0% 0%)", + to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", + }; + case "top-right": + return { + from: "polygon(100% 0%, 100% 0%, 100% 0%, 100% 0%)", + to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", + }; + case "bottom-left": 
+ return { + from: "polygon(0% 100%, 0% 100%, 0% 100%, 0% 100%)", + to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", + }; + case "bottom-right": + return { + from: "polygon(100% 100%, 100% 100%, 100% 100%, 100% 100%)", + to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", + }; + default: + return { + from: "polygon(0% 100%, 100% 100%, 100% 100%, 0% 100%)", + to: "polygon(0% 0%, 100% 0%, 100% 100%, 0% 100%)", + }; + } + }; - const clipPath = getClipPath(start); + const clipPath = getClipPath(start); - return { - name: `${variant}-${start}${blur ? "-blur" : ""}`, - css: ` + return { + name: `${variant}-${start}${blur ? "-blur" : ""}`, + css: ` ::view-transition-group(root) { animation-duration: 0.7s; animation-timing-function: var(--expo-out); @@ -218,12 +213,12 @@ export const createAnimation = ( } } `, - }; - } - if (variant === "circle" && start == "center") { - return { - name: `${variant}-${start}${blur ? "-blur" : ""}`, - css: ` + }; + } + if (variant === "circle" && start == "center") { + return { + name: `${variant}-${start}${blur ? 
"-blur" : ""}`, + css: ` ::view-transition-group(root) { animation-duration: 0.7s; animation-timing-function: var(--expo-out); @@ -268,12 +263,12 @@ export const createAnimation = ( } } `, - }; - } - if (variant === "gif") { - return { - name: `${variant}-${start}`, - css: ` + }; + } + if (variant === "gif") { + return { + name: `${variant}-${start}`, + css: ` ::view-transition-group(root) { animation-timing-function: var(--expo-in); } @@ -302,14 +297,14 @@ export const createAnimation = ( mask-size: 2000vmax; } }`, - }; - } + }; + } - if (variant === "circle-blur") { - if (start === "center") { - return { - name: `${variant}-${start}`, - css: ` + if (variant === "circle-blur") { + if (start === "center") { + return { + name: `${variant}-${start}`, + css: ` ::view-transition-group(root) { animation-timing-function: var(--expo-out); } @@ -334,12 +329,12 @@ export const createAnimation = ( } } `, - }; - } + }; + } - return { - name: `${variant}-${start}`, - css: ` + return { + name: `${variant}-${start}`, + css: ` ::view-transition-group(root) { animation-timing-function: var(--expo-out); } @@ -364,41 +359,41 @@ export const createAnimation = ( } } `, - }; - } + }; + } - if (variant === "polygon") { - const getPolygonClipPaths = (position: AnimationStart) => { - switch (position) { - case "top-left": - return { - darkFrom: "polygon(50% -71%, -50% 71%, -50% 71%, 50% -71%)", - darkTo: "polygon(50% -71%, -50% 71%, 50% 171%, 171% 50%)", - lightFrom: "polygon(171% 50%, 50% 171%, 50% 171%, 171% 50%)", - lightTo: "polygon(171% 50%, 50% 171%, -50% 71%, 50% -71%)", - }; - case "top-right": - return { - darkFrom: "polygon(150% -71%, 250% 71%, 250% 71%, 150% -71%)", - darkTo: "polygon(150% -71%, 250% 71%, 50% 171%, -71% 50%)", - lightFrom: "polygon(-71% 50%, 50% 171%, 50% 171%, -71% 50%)", - lightTo: "polygon(-71% 50%, 50% 171%, 250% 71%, 150% -71%)", - }; - default: - return { - darkFrom: "polygon(50% -71%, -50% 71%, -50% 71%, 50% -71%)", - darkTo: "polygon(50% -71%, -50% 71%, 
50% 171%, 171% 50%)", - lightFrom: "polygon(171% 50%, 50% 171%, 50% 171%, 171% 50%)", - lightTo: "polygon(171% 50%, 50% 171%, -50% 71%, 50% -71%)", - }; - } - }; + if (variant === "polygon") { + const getPolygonClipPaths = (position: AnimationStart) => { + switch (position) { + case "top-left": + return { + darkFrom: "polygon(50% -71%, -50% 71%, -50% 71%, 50% -71%)", + darkTo: "polygon(50% -71%, -50% 71%, 50% 171%, 171% 50%)", + lightFrom: "polygon(171% 50%, 50% 171%, 50% 171%, 171% 50%)", + lightTo: "polygon(171% 50%, 50% 171%, -50% 71%, 50% -71%)", + }; + case "top-right": + return { + darkFrom: "polygon(150% -71%, 250% 71%, 250% 71%, 150% -71%)", + darkTo: "polygon(150% -71%, 250% 71%, 50% 171%, -71% 50%)", + lightFrom: "polygon(-71% 50%, 50% 171%, 50% 171%, -71% 50%)", + lightTo: "polygon(-71% 50%, 50% 171%, 250% 71%, 150% -71%)", + }; + default: + return { + darkFrom: "polygon(50% -71%, -50% 71%, -50% 71%, 50% -71%)", + darkTo: "polygon(50% -71%, -50% 71%, 50% 171%, 171% 50%)", + lightFrom: "polygon(171% 50%, 50% 171%, 50% 171%, 171% 50%)", + lightTo: "polygon(171% 50%, 50% 171%, -50% 71%, 50% -71%)", + }; + } + }; - const clipPaths = getPolygonClipPaths(start); + const clipPaths = getPolygonClipPaths(start); - return { - name: `${variant}-${start}${blur ? "-blur" : ""}`, - css: ` + return { + name: `${variant}-${start}${blur ? 
"-blur" : ""}`, + css: ` ::view-transition-group(root) { animation-duration: 0.7s; animation-timing-function: var(--expo-out); @@ -443,35 +438,35 @@ export const createAnimation = ( } } `, - }; - } + }; + } - // Handle circle variants with start positions using clip-path - if (variant === "circle" && start !== "center") { - const getClipPathPosition = (position: AnimationStart) => { - switch (position) { - case "top-left": - return "0% 0%"; - case "top-right": - return "100% 0%"; - case "bottom-left": - return "0% 100%"; - case "bottom-right": - return "100% 100%"; - case "top-center": - return "50% 0%"; - case "bottom-center": - return "50% 100%"; - default: - return "50% 50%"; - } - }; + // Handle circle variants with start positions using clip-path + if (variant === "circle" && start !== "center") { + const getClipPathPosition = (position: AnimationStart) => { + switch (position) { + case "top-left": + return "0% 0%"; + case "top-right": + return "100% 0%"; + case "bottom-left": + return "0% 100%"; + case "bottom-right": + return "100% 100%"; + case "top-center": + return "50% 0%"; + case "bottom-center": + return "50% 100%"; + default: + return "50% 50%"; + } + }; - const clipPosition = getClipPathPosition(start); + const clipPosition = getClipPathPosition(start); - return { - name: `${variant}-${start}${blur ? "-blur" : ""}`, - css: ` + return { + name: `${variant}-${start}${blur ? "-blur" : ""}`, + css: ` ::view-transition-group(root) { animation-duration: 1s; animation-timing-function: var(--expo-out); @@ -516,12 +511,12 @@ export const createAnimation = ( } } `, - }; - } + }; + } - return { - name: `${variant}-${start}${blur ? "-blur" : ""}`, - css: ` + return { + name: `${variant}-${start}${blur ? 
"-blur" : ""}`, + css: ` ::view-transition-group(root) { animation-timing-function: var(--expo-in); } @@ -549,237 +544,229 @@ export const createAnimation = ( } } `, - }; + }; }; // /////////////////////////////////////////////////////////////////////////// // Custom hook for theme toggle functionality export const useThemeToggle = ({ - variant = "circle", - start = "center", - blur = false, - gifUrl = "", + variant = "circle", + start = "center", + blur = false, + gifUrl = "", }: { - variant?: AnimationVariant; - start?: AnimationStart; - blur?: boolean; - gifUrl?: string; + variant?: AnimationVariant; + start?: AnimationStart; + blur?: boolean; + gifUrl?: string; } = {}) => { - const { theme, setTheme, resolvedTheme } = useTheme(); + const { theme, setTheme, resolvedTheme } = useTheme(); - const [isDark, setIsDark] = useState(false); + const [isDark, setIsDark] = useState(false); - // Sync isDark state with resolved theme after hydration - useEffect(() => { - setIsDark(resolvedTheme === "dark"); - }, [resolvedTheme]); + // Sync isDark state with resolved theme after hydration + useEffect(() => { + setIsDark(resolvedTheme === "dark"); + }, [resolvedTheme]); - const styleId = "theme-transition-styles"; + const styleId = "theme-transition-styles"; - const updateStyles = useCallback((css: string) => { - if (typeof window === "undefined") return; + const updateStyles = useCallback((css: string) => { + if (typeof window === "undefined") return; - let styleElement = document.getElementById(styleId) as HTMLStyleElement; + let styleElement = document.getElementById(styleId) as HTMLStyleElement; - if (!styleElement) { - styleElement = document.createElement("style"); - styleElement.id = styleId; - document.head.appendChild(styleElement); - } + if (!styleElement) { + styleElement = document.createElement("style"); + styleElement.id = styleId; + document.head.appendChild(styleElement); + } - styleElement.textContent = css; - }, []); + styleElement.textContent = css; + }, 
[]); - const toggleTheme = useCallback(() => { - setIsDark(!isDark); + const toggleTheme = useCallback(() => { + setIsDark(!isDark); - const animation = createAnimation(variant, start, blur, gifUrl); + const animation = createAnimation(variant, start, blur, gifUrl); - updateStyles(animation.css); + updateStyles(animation.css); - if (typeof window === "undefined") return; + if (typeof window === "undefined") return; - const switchTheme = () => { - setTheme(theme === "light" ? "dark" : "light"); - }; + const switchTheme = () => { + setTheme(theme === "light" ? "dark" : "light"); + }; - if (!document.startViewTransition) { - switchTheme(); - return; - } + if (!document.startViewTransition) { + switchTheme(); + return; + } - document.startViewTransition(switchTheme); - }, [theme, setTheme, variant, start, blur, gifUrl, updateStyles, isDark]); + document.startViewTransition(switchTheme); + }, [theme, setTheme, variant, start, blur, gifUrl, updateStyles, isDark]); - const setCrazyLightTheme = useCallback(() => { - setIsDark(false); + const setCrazyLightTheme = useCallback(() => { + setIsDark(false); - const animation = createAnimation(variant, start, blur, gifUrl); + const animation = createAnimation(variant, start, blur, gifUrl); - updateStyles(animation.css); + updateStyles(animation.css); - if (typeof window === "undefined") return; + if (typeof window === "undefined") return; - const switchTheme = () => { - setTheme("light"); - }; + const switchTheme = () => { + setTheme("light"); + }; - if (!document.startViewTransition) { - switchTheme(); - return; - } + if (!document.startViewTransition) { + switchTheme(); + return; + } - document.startViewTransition(switchTheme); - }, [setTheme, variant, start, blur, gifUrl, updateStyles]); + document.startViewTransition(switchTheme); + }, [setTheme, variant, start, blur, gifUrl, updateStyles]); - const setCrazyDarkTheme = useCallback(() => { - setIsDark(true); + const setCrazyDarkTheme = useCallback(() => { + setIsDark(true); - 
const animation = createAnimation(variant, start, blur, gifUrl); + const animation = createAnimation(variant, start, blur, gifUrl); - updateStyles(animation.css); + updateStyles(animation.css); - if (typeof window === "undefined") return; + if (typeof window === "undefined") return; - const switchTheme = () => { - setTheme("dark"); - }; + const switchTheme = () => { + setTheme("dark"); + }; - if (!document.startViewTransition) { - switchTheme(); - return; - } + if (!document.startViewTransition) { + switchTheme(); + return; + } - document.startViewTransition(switchTheme); - }, [setTheme, variant, start, blur, gifUrl, updateStyles]); + document.startViewTransition(switchTheme); + }, [setTheme, variant, start, blur, gifUrl, updateStyles]); - const setCrazySystemTheme = useCallback(() => { - if (typeof window === "undefined") return; + const setCrazySystemTheme = useCallback(() => { + if (typeof window === "undefined") return; - const prefersDark = window.matchMedia( - "(prefers-color-scheme: dark)", - ).matches; - setIsDark(prefersDark); + const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches; + setIsDark(prefersDark); - const animation = createAnimation(variant, start, blur, gifUrl); + const animation = createAnimation(variant, start, blur, gifUrl); - updateStyles(animation.css); + updateStyles(animation.css); - const switchTheme = () => { - setTheme("system"); - }; + const switchTheme = () => { + setTheme("system"); + }; - if (!document.startViewTransition) { - switchTheme(); - return; - } + if (!document.startViewTransition) { + switchTheme(); + return; + } - document.startViewTransition(switchTheme); - }, [setTheme, variant, start, blur, gifUrl, updateStyles]); + document.startViewTransition(switchTheme); + }, [setTheme, variant, start, blur, gifUrl, updateStyles]); - return { - isDark, - setIsDark, - toggleTheme, - setCrazyLightTheme, - setCrazyDarkTheme, - setCrazySystemTheme, - }; + return { + isDark, + setIsDark, + toggleTheme, + 
setCrazyLightTheme, + setCrazyDarkTheme, + setCrazySystemTheme, + }; }; // /////////////////////////////////////////////////////////////////////////// // Theme Toggle Button Component (Sun/Moon Style) export const ThemeToggleButton = ({ - className = "", - variant = "circle", - start = "center", - blur = false, - gifUrl = "", + className = "", + variant = "circle", + start = "center", + blur = false, + gifUrl = "", }: { - className?: string; - variant?: AnimationVariant; - start?: AnimationStart; - blur?: boolean; - gifUrl?: string; + className?: string; + variant?: AnimationVariant; + start?: AnimationStart; + blur?: boolean; + gifUrl?: string; }) => { - const { isDark, toggleTheme } = useThemeToggle({ - variant, - start, - blur, - gifUrl, - }); - const clipId = useId(); - const clipPathId = `theme-toggle-clip-${clipId}`; + const { isDark, toggleTheme } = useThemeToggle({ + variant, + start, + blur, + gifUrl, + }); + const clipId = useId(); + const clipPathId = `theme-toggle-clip-${clipId}`; - return ( - - ); + return ( + + ); }; // /////////////////////////////////////////////////////////////////////////// // Backwards compatible export (alias for ThemeToggleButton with default settings) export function ThemeTogglerComponent() { - return ( - - ); + return ; } /** diff --git a/surfsense_web/hooks/use-documents.ts b/surfsense_web/hooks/use-documents.ts index 442c836b2..369cc7b41 100644 --- a/surfsense_web/hooks/use-documents.ts +++ b/surfsense_web/hooks/use-documents.ts @@ -144,7 +144,7 @@ export function useDocuments( (doc: DocumentElectric): DocumentDisplay => ({ ...doc, created_by_name: doc.created_by_id - ? userCacheRef.current.get(doc.created_by_id) ?? null + ? (userCacheRef.current.get(doc.created_by_id) ?? null) : null, status: doc.status ?? 
{ state: "ready" }, }), @@ -232,7 +232,15 @@ export function useDocuments( const handle = await client.syncShape({ table: "documents", where: `search_space_id = ${spaceId}`, - columns: ["id", "document_type", "search_space_id", "title", "created_by_id", "created_at", "status"], + columns: [ + "id", + "document_type", + "search_space_id", + "title", + "created_by_id", + "created_at", + "status", + ], primaryKey: ["id"], }); @@ -258,7 +266,10 @@ export function useDocuments( // Set up live query const db = client.db as { live?: { - query: (sql: string, params?: (number | string)[]) => Promise<{ + query: ( + sql: string, + params?: (number | string)[] + ) => Promise<{ subscribe: (cb: (result: { rows: T[] }) => void) => void; unsubscribe?: () => void; }>; @@ -297,8 +308,7 @@ export function useDocuments( if (!mounted || !result.rows) return; // DEBUG: Log first few raw documents to see what's coming from Electric - console.log("[useDocuments] Raw data sample:", result.rows.slice(0, 3)); - + console.log("[useDocuments] Raw data sample:", result.rows.slice(0, 3)); const validItems = result.rows.filter(isValidDocument); const isFullySynced = syncHandleRef.current?.isUpToDate ?? false; @@ -309,8 +319,9 @@ export function useDocuments( // Fetch user names for new users (non-blocking) const unknownUserIds = validItems - .filter((doc): doc is DocumentElectric & { created_by_id: string } => - doc.created_by_id !== null && !userCacheRef.current.has(doc.created_by_id) + .filter( + (doc): doc is DocumentElectric & { created_by_id: string } => + doc.created_by_id !== null && !userCacheRef.current.has(doc.created_by_id) ) .map((doc) => doc.created_by_id); @@ -326,7 +337,7 @@ export function useDocuments( prev.map((doc) => ({ ...doc, created_by_name: doc.created_by_id - ? userCacheRef.current.get(doc.created_by_id) ?? null + ? (userCacheRef.current.get(doc.created_by_id) ?? 
null) : null, })) ); @@ -358,7 +369,9 @@ export function useDocuments( // Case 2: Electric is fully synced - TRUST IT COMPLETELY (handles bulk deletes) if (isFullySynced) { const liveDocs = deduplicateAndSort(validItems.map(electricToDisplayDoc)); - console.log(`[useDocuments] Synced update: ${liveDocs.length} docs (was ${prev.length})`); + console.log( + `[useDocuments] Synced update: ${liveDocs.length} docs (was ${prev.length})` + ); return liveDocs; } diff --git a/surfsense_web/lib/electric/client.ts b/surfsense_web/lib/electric/client.ts index 3fa4586ac..9d596a261 100644 --- a/surfsense_web/lib/electric/client.ts +++ b/surfsense_web/lib/electric/client.ts @@ -444,9 +444,9 @@ export async function initElectric(userId: string): Promise { // in use-inbox.ts generating different sync keys on each render. // That's now fixed (rounded to midnight UTC in getSyncCutoffDate). // We can safely use shapeKey for fast incremental sync. - + const shapeKey = `${userId}_v${SYNC_VERSION}_${table}_${where?.replace(/[^a-zA-Z0-9]/g, "_") || "all"}`; - + // Type assertion to PGlite with electric extension const pgWithElectric = db as unknown as { electric: { @@ -495,9 +495,7 @@ export async function initElectric(userId: string): Promise { // Parse the WHERE clause to build a DELETE statement // The WHERE clause is already validated and formatted await tx.exec(`DELETE FROM ${table} WHERE ${validatedWhere}`); - debugLog( - `[Electric] 🗑️ Cleared ${table} rows matching: ${validatedWhere}` - ); + debugLog(`[Electric] 🗑️ Cleared ${table} rows matching: ${validatedWhere}`); } else { // No WHERE clause means we're syncing the entire table await tx.exec(`DELETE FROM ${table}`); @@ -514,10 +512,7 @@ export async function initElectric(userId: string): Promise { }, }; - debugLog( - "[Electric] syncShapeToTable config:", - JSON.stringify(shapeConfig, null, 2) - ); + debugLog("[Electric] syncShapeToTable config:", JSON.stringify(shapeConfig, null, 2)); let shape: { unsubscribe: () => void; 
isUpToDate: boolean; stream: unknown }; try { @@ -550,9 +545,7 @@ export async function initElectric(userId: string): Promise { retryError instanceof Error ? retryError.message : String(retryError); if (retryMessage.includes("Already syncing")) { // Still syncing - create a placeholder handle that indicates the table is being synced - debugWarn( - `[Electric] ${table} still syncing, creating placeholder handle` - ); + debugWarn(`[Electric] ${table} still syncing, creating placeholder handle`); const placeholderHandle: SyncHandle = { unsubscribe: () => { debugLog(`[Electric] Placeholder unsubscribe for: ${cacheKey}`); @@ -656,9 +649,7 @@ export async function initElectric(userId: string): Promise { // Also check stream's isUpToDate property immediately if (stream?.isUpToDate) { - debugLog( - `[Electric] ✅ Stream isUpToDate is true immediately for ${table}` - ); + debugLog(`[Electric] ✅ Stream isUpToDate is true immediately for ${table}`); resolveInitialSync(); } } @@ -671,9 +662,7 @@ export async function initElectric(userId: string): Promise { } if (shape.isUpToDate || stream?.isUpToDate) { - debugLog( - `[Electric] ✅ Sync completed (detected via polling) for ${table}` - ); + debugLog(`[Electric] ✅ Sync completed (detected via polling) for ${table}`); clearInterval(pollInterval); resolveInitialSync(); } From 0f92b37b66ef27db4ea2c356cf62546693643cdd Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 05:36:18 +0530 Subject: [PATCH 34/36] feat: add status column to documents table for per-document processing tracking --- ..._status_column.py => 93_add_document_status_column.py} | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) rename surfsense_backend/alembic/versions/{92_add_document_status_column.py => 93_add_document_status_column.py} (95%) diff --git a/surfsense_backend/alembic/versions/92_add_document_status_column.py b/surfsense_backend/alembic/versions/93_add_document_status_column.py 
similarity index 95% rename from surfsense_backend/alembic/versions/92_add_document_status_column.py rename to surfsense_backend/alembic/versions/93_add_document_status_column.py index 8204096aa..382db6109 100644 --- a/surfsense_backend/alembic/versions/92_add_document_status_column.py +++ b/surfsense_backend/alembic/versions/93_add_document_status_column.py @@ -1,7 +1,7 @@ """Add status column to documents table for per-document processing status -Revision ID: 92 -Revises: 91 +Revision ID: 93 +Revises: 92 Create Date: 2026-02-05 Changes: @@ -16,8 +16,8 @@ from collections.abc import Sequence from alembic import op # revision identifiers, used by Alembic. -revision: str = "92" -down_revision: str | None = "91" +revision: str = "93" +down_revision: str | None = "92" branch_labels: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None From b41c22842f35d19308e5fcd512dc05c57a35d103 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Fri, 6 Feb 2026 12:32:55 +0530 Subject: [PATCH 35/36] refactor: change alembic migration number and make migrations idempotent --- ...ications_table_and_electric_replication.py | 36 +++++++------ ...4_add_access_token_to_image_generations.py | 50 ++++++++++++++----- ...mn.py => 95_add_document_status_column.py} | 8 +-- 3 files changed, 63 insertions(+), 31 deletions(-) rename surfsense_backend/alembic/versions/{93_add_document_status_column.py => 95_add_document_status_column.py} (95%) diff --git a/surfsense_backend/alembic/versions/66_add_notifications_table_and_electric_replication.py b/surfsense_backend/alembic/versions/66_add_notifications_table_and_electric_replication.py index dc25a1edd..182bf981c 100644 --- a/surfsense_backend/alembic/versions/66_add_notifications_table_and_electric_replication.py +++ b/surfsense_backend/alembic/versions/66_add_notifications_table_and_electric_replication.py @@ -17,13 +17,6 @@ from collections.abc import Sequence from alembic
import context, op -# Get Electric SQL user credentials from env.py configuration -_config = context.config -ELECTRIC_DB_USER = _config.get_main_option("electric_db_user", "electric") -ELECTRIC_DB_PASSWORD = _config.get_main_option( - "electric_db_password", "electric_password" -) - # revision identifiers, used by Alembic. revision: str = "66" down_revision: str | None = "65" @@ -31,8 +24,21 @@ branch_labels: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None +def _get_electric_credentials() -> tuple[str, str]: + """Get Electric SQL credentials from Alembic config. + + Must be called inside upgrade()/downgrade(), not at module level, + because context.config is only available during migration execution. + """ + _config = context.config + user = _config.get_main_option("electric_db_user", "electric") + password = _config.get_main_option("electric_db_password", "electric_password") + return user, password + + def upgrade() -> None: """Upgrade schema - add notifications table and Electric SQL replication.""" + electric_db_user, electric_db_password = _get_electric_credentials() # Create notifications table op.execute( """ @@ -74,8 +80,8 @@ def upgrade() -> None: f""" DO $$ BEGIN - IF NOT EXISTS (SELECT FROM pg_user WHERE usename = '{ELECTRIC_DB_USER}') THEN - CREATE USER {ELECTRIC_DB_USER} WITH REPLICATION PASSWORD '{ELECTRIC_DB_PASSWORD}'; + IF NOT EXISTS (SELECT FROM pg_user WHERE usename = '{electric_db_user}') THEN + CREATE USER {electric_db_user} WITH REPLICATION PASSWORD '{electric_db_password}'; END IF; END $$; @@ -89,19 +95,19 @@ def upgrade() -> None: DECLARE db_name TEXT := current_database(); BEGIN - EXECUTE format('GRANT CONNECT ON DATABASE %I TO {ELECTRIC_DB_USER}', db_name); + EXECUTE format('GRANT CONNECT ON DATABASE %I TO {electric_db_user}', db_name); END $$; """ ) - op.execute(f"GRANT USAGE ON SCHEMA public TO {ELECTRIC_DB_USER};") - op.execute(f"GRANT SELECT ON ALL TABLES IN SCHEMA public TO {ELECTRIC_DB_USER};") - 
op.execute(f"GRANT SELECT ON ALL SEQUENCES IN SCHEMA public TO {ELECTRIC_DB_USER};") + op.execute(f"GRANT USAGE ON SCHEMA public TO {electric_db_user};") + op.execute(f"GRANT SELECT ON ALL TABLES IN SCHEMA public TO {electric_db_user};") + op.execute(f"GRANT SELECT ON ALL SEQUENCES IN SCHEMA public TO {electric_db_user};") op.execute( - f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO {ELECTRIC_DB_USER};" + f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO {electric_db_user};" ) op.execute( - f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON SEQUENCES TO {ELECTRIC_DB_USER};" + f"ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON SEQUENCES TO {electric_db_user};" ) # Create the publication if not exists diff --git a/surfsense_backend/alembic/versions/94_add_access_token_to_image_generations.py b/surfsense_backend/alembic/versions/94_add_access_token_to_image_generations.py index 09bea2c19..92f027e00 100644 --- a/surfsense_backend/alembic/versions/94_add_access_token_to_image_generations.py +++ b/surfsense_backend/alembic/versions/94_add_access_token_to_image_generations.py @@ -10,8 +10,6 @@ SECRET_KEY rotation. from collections.abc import Sequence -import sqlalchemy as sa - from alembic import op # revision identifiers, used by Alembic. 
@@ -23,17 +21,45 @@ depends_on: str | Sequence[str] | None = None def upgrade() -> None: # Add access_token column (nullable so existing rows are unaffected) - op.add_column( - "image_generations", - sa.Column("access_token", sa.String(64), nullable=True), - ) - op.create_index( - "ix_image_generations_access_token", - "image_generations", - ["access_token"], + # Guard: skip entirely if image_generations table doesn't exist + op.execute( + """ + DO $$ + BEGIN + IF EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_name = 'image_generations' + ) THEN + -- Add column if not exists + IF NOT EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'image_generations' AND column_name = 'access_token' + ) THEN + ALTER TABLE image_generations + ADD COLUMN access_token VARCHAR(64); + END IF; + + -- Create index if not exists + CREATE INDEX IF NOT EXISTS ix_image_generations_access_token + ON image_generations (access_token); + END IF; + END$$; + """ ) def downgrade() -> None: - op.drop_index("ix_image_generations_access_token", table_name="image_generations") - op.drop_column("image_generations", "access_token") + op.execute("DROP INDEX IF EXISTS ix_image_generations_access_token") + op.execute( + """ + DO $$ + BEGIN + IF EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name = 'image_generations' AND column_name = 'access_token' + ) THEN + ALTER TABLE image_generations DROP COLUMN access_token; + END IF; + END$$; + """ + ) diff --git a/surfsense_backend/alembic/versions/93_add_document_status_column.py b/surfsense_backend/alembic/versions/95_add_document_status_column.py similarity index 95% rename from surfsense_backend/alembic/versions/93_add_document_status_column.py rename to surfsense_backend/alembic/versions/95_add_document_status_column.py index 382db6109..f5a6fa65d 100644 --- a/surfsense_backend/alembic/versions/93_add_document_status_column.py +++ b/surfsense_backend/alembic/versions/95_add_document_status_column.py 
@@ -1,7 +1,7 @@ """Add status column to documents table for per-document processing status -Revision ID: 93 -Revises: 92 +Revision ID: 95 +Revises: 94 Create Date: 2026-02-05 Changes: @@ -16,8 +16,8 @@ from collections.abc import Sequence from alembic import op # revision identifiers, used by Alembic. -revision: str = "93" -down_revision: str | None = "92" +revision: str = "95" +down_revision: str | None = "94" branch_labels: str | Sequence[str] | None = None depends_on: str | Sequence[str] | None = None From 017c2628425970c9c9beaebc88f6ccceb79be4ad Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Thu, 5 Feb 2026 23:45:01 -0800 Subject: [PATCH 36/36] fix: update DocumentsFilters component for accessibility and add success message for document deletion --- .../(manage)/components/DocumentsFilters.tsx | 13 ++++++++++--- surfsense_web/messages/en.json | 1 + 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx index 6bd5f8460..ebdf431e4 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsFilters.tsx @@ -193,11 +193,18 @@ export function DocumentsFilters({
) : ( filteredTypes.map((value: DocumentTypeEnum, i) => ( - +
)) )}
diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index 68ea533ac..fae4c7265 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -308,6 +308,7 @@ "no_rows_selected": "No rows selected", "delete_success_count": "Successfully deleted {count} document(s)", "delete_partial_failed": "Some documents could not be deleted", + "delete_success": "Document deleted successfully", "delete_error": "Error deleting documents", "filter_by_title": "Filter by title...", "bulk_delete": "Delete Selected",