diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
index 53da9b607..d3cf456d6 100644
--- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py
+++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
@@ -65,7 +65,8 @@ _CONNECTOR_TYPE_TO_SEARCHABLE: dict[str, str] = {
"BOOKSTACK_CONNECTOR": "BOOKSTACK_CONNECTOR",
"CIRCLEBACK_CONNECTOR": "CIRCLEBACK", # Connector type differs from document type
"OBSIDIAN_CONNECTOR": "OBSIDIAN_CONNECTOR",
- # Composio connectors (unified to native document types)
+ # Composio connectors (unified to native document types).
+ # Reverse of NATIVE_TO_LEGACY_DOCTYPE in app.db.
"COMPOSIO_GOOGLE_DRIVE_CONNECTOR": "GOOGLE_DRIVE_FILE",
"COMPOSIO_GMAIL_CONNECTOR": "GOOGLE_GMAIL_CONNECTOR",
"COMPOSIO_GOOGLE_CALENDAR_CONNECTOR": "GOOGLE_CALENDAR_CONNECTOR",
diff --git a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
index 2ab1dc704..57f9bb124 100644
--- a/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
+++ b/surfsense_backend/app/agents/new_chat/tools/knowledge_base.py
@@ -19,7 +19,7 @@ from langchain_core.tools import StructuredTool
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
-from app.db import shielded_async_session
+from app.db import NATIVE_TO_LEGACY_DOCTYPE, shielded_async_session
from app.services.connector_service import ConnectorService
from app.utils.perf import get_perf_logger
@@ -43,7 +43,6 @@ _DEGENERATE_QUERY_RE = re.compile(
# a real search. We want breadth (many docs) over depth (many chunks).
_BROWSE_MAX_CHUNKS_PER_DOC = 5
-
def _is_degenerate_query(query: str) -> bool:
"""Return True when the query carries no meaningful search signal.
@@ -614,14 +613,6 @@ async def search_knowledge_base_async(
connectors = _normalize_connectors(connectors_to_search, available_connectors)
# --- Optimization 1: skip connectors that have zero indexed documents ---
- # Native Google types must also match their legacy Composio equivalents
- # (old documents may still carry the Composio type until re-indexed).
- _NATIVE_TO_LEGACY: dict[str, str] = {
- "GOOGLE_DRIVE_FILE": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
- "GOOGLE_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR",
- "GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
- }
-
if available_document_types:
doc_types_set = set(available_document_types)
before_count = len(connectors)
@@ -629,7 +620,7 @@ async def search_knowledge_base_async(
c
for c in connectors
if c in doc_types_set
- or _NATIVE_TO_LEGACY.get(c, "") in doc_types_set
+ or NATIVE_TO_LEGACY_DOCTYPE.get(c, "") in doc_types_set
]
skipped = before_count - len(connectors)
if skipped:
@@ -667,17 +658,10 @@ async def search_knowledge_base_async(
)
browse_connectors = connectors if connectors else [None] # type: ignore[list-item]
- # Expand native Google types to include legacy Composio equivalents
- # so old documents remain searchable until re-indexed.
- _LEGACY_ALIASES: dict[str, str] = {
- "GOOGLE_DRIVE_FILE": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
- "GOOGLE_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR",
- "GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
- }
expanded_browse = []
for c in browse_connectors:
- if c is not None and c in _LEGACY_ALIASES:
- expanded_browse.append([c, _LEGACY_ALIASES[c]])
+ if c is not None and c in NATIVE_TO_LEGACY_DOCTYPE:
+ expanded_browse.append([c, NATIVE_TO_LEGACY_DOCTYPE[c]])
else:
expanded_browse.append(c)
diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py
index 95ae8e728..888422eae 100644
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@@ -63,6 +63,16 @@ class DocumentType(StrEnum):
COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR"
+# Native Google document types → their legacy Composio equivalents.
+# Old documents may still carry the Composio type until they are re-indexed;
+# search, browse, and indexing must transparently handle both.
+NATIVE_TO_LEGACY_DOCTYPE: dict[str, str] = {
+ "GOOGLE_DRIVE_FILE": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
+ "GOOGLE_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR",
+ "GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
+}
+
+
class SearchSourceConnectorType(StrEnum):
SERPER_API = "SERPER_API" # NOT IMPLEMENTED YET : DON'T REMEMBER WHY : MOST PROBABLY BECAUSE WE NEED TO CRAWL THE RESULTS RETURNED BY IT
TAVILY_API = "TAVILY_API"
diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py
index 730b946a3..7c55da2e5 100644
--- a/surfsense_backend/app/services/connector_service.py
+++ b/surfsense_backend/app/services/connector_service.py
@@ -11,6 +11,7 @@ from sqlalchemy.future import select
from tavily import TavilyClient
from app.db import (
+ NATIVE_TO_LEGACY_DOCTYPE,
Chunk,
Document,
SearchSourceConnector,
@@ -215,15 +216,6 @@ class ConnectorService:
return result_object, files_docs
- # Composio connectors that were unified into native Google pipelines.
- # Old documents may still carry the legacy type until re-indexed; searching
- # for a native type should transparently include its legacy equivalent.
- _LEGACY_TYPE_ALIASES: dict[str, str] = {
- "GOOGLE_DRIVE_FILE": "COMPOSIO_GOOGLE_DRIVE_CONNECTOR",
- "GOOGLE_GMAIL_CONNECTOR": "COMPOSIO_GMAIL_CONNECTOR",
- "GOOGLE_CALENDAR_CONNECTOR": "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR",
- }
-
async def _combined_rrf_search(
self,
query_text: str,
@@ -266,10 +258,10 @@ class ConnectorService:
# Expand native Google types to include legacy Composio equivalents
# so old documents remain searchable until re-indexed.
- if isinstance(document_type, str) and document_type in self._LEGACY_TYPE_ALIASES:
+ if isinstance(document_type, str) and document_type in NATIVE_TO_LEGACY_DOCTYPE:
resolved_type: str | list[str] = [
document_type,
- self._LEGACY_TYPE_ALIASES[document_type],
+ NATIVE_TO_LEGACY_DOCTYPE[document_type],
]
else:
resolved_type = document_type
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index 35494cc92..78d9315e9 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -170,7 +170,34 @@ async def handle_existing_document_update(
logging.info(f"Document for file {filename} unchanged. Skipping.")
return True, existing_document
else:
- # Content has changed - need to re-process
+ # Content has changed — guard against content_hash collision before
+ # expensive ETL processing. A collision means the exact same content
+ # already lives in a *different* document (e.g. a manual upload of the
+ # same file). Proceeding would trigger a unique-constraint violation
+ # on ix_documents_content_hash.
+ collision_doc = await check_duplicate_document(session, content_hash)
+ if collision_doc and collision_doc.id != existing_document.id:
+ logging.warning(
+ "Content-hash collision for %s: identical content exists in "
+ "document #%s (%s). Skipping re-processing.",
+ filename,
+ collision_doc.id,
+ collision_doc.document_type,
+ )
+ if DocumentStatus.is_state(
+ existing_document.status, DocumentStatus.PENDING
+ ) or DocumentStatus.is_state(
+ existing_document.status, DocumentStatus.PROCESSING
+ ):
+ # Pending/processing doc has no real content yet — remove it
+ # so the UI doesn't show a contentless entry.
+ await session.delete(existing_document)
+ await session.commit()
+ return True, None
+
+ # Document already has valid content — keep it as-is.
+ return True, existing_document
+
logging.info(f"Content changed for file {filename}. Updating document.")
return False, None
@@ -537,6 +564,12 @@ async def add_received_file_document_using_unstructured(
return document
except SQLAlchemyError as db_error:
await session.rollback()
+ if "ix_documents_content_hash" in str(db_error):
+ logging.warning(
+ "content_hash collision during commit for %s (Unstructured). Skipping.",
+ file_name,
+ )
+ return None
raise db_error
except Exception as e:
await session.rollback()
@@ -673,6 +706,12 @@ async def add_received_file_document_using_llamacloud(
return document
except SQLAlchemyError as db_error:
await session.rollback()
+ if "ix_documents_content_hash" in str(db_error):
+ logging.warning(
+ "content_hash collision during commit for %s (LlamaCloud). Skipping.",
+ file_name,
+ )
+ return None
raise db_error
except Exception as e:
await session.rollback()
@@ -828,6 +867,12 @@ async def add_received_file_document_using_docling(
return document
except SQLAlchemyError as db_error:
await session.rollback()
+ if "ix_documents_content_hash" in str(db_error):
+ logging.warning(
+ "content_hash collision during commit for %s (Docling). Skipping.",
+ file_name,
+ )
+ return None
raise db_error
except Exception as e:
await session.rollback()
diff --git a/surfsense_backend/app/tasks/document_processors/markdown_processor.py b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
index d598bf9dd..2fb711bf8 100644
--- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
@@ -158,6 +158,28 @@ async def _handle_existing_document_update(
logging.info(f"Document for markdown file {filename} unchanged. Skipping.")
return True, existing_document
else:
+ # Content has changed — guard against content_hash collision (same
+ # content already lives in a different document).
+ collision_doc = await check_duplicate_document(session, content_hash)
+ if collision_doc and collision_doc.id != existing_document.id:
+ logging.warning(
+ "Content-hash collision for markdown %s: identical content "
+ "exists in document #%s (%s). Skipping re-processing.",
+ filename,
+ collision_doc.id,
+ collision_doc.document_type,
+ )
+ if DocumentStatus.is_state(
+ existing_document.status, DocumentStatus.PENDING
+ ) or DocumentStatus.is_state(
+ existing_document.status, DocumentStatus.PROCESSING
+ ):
+ await session.delete(existing_document)
+ await session.commit()
+ return True, None
+
+ return True, existing_document
+
logging.info(
f"Content changed for markdown file {filename}. Updating document."
)
@@ -312,6 +334,12 @@ async def add_received_markdown_file_document(
return document
except SQLAlchemyError as db_error:
await session.rollback()
+ if "ix_documents_content_hash" in str(db_error):
+ logging.warning(
+ "content_hash collision during commit for %s (markdown). Skipping.",
+ file_name,
+ )
+ return None
await task_logger.log_task_failure(
log_entry,
f"Database error processing markdown file: {file_name}",
diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
index 511ad3151..85d5db28a 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
@@ -643,6 +643,16 @@ export function DocumentsTableShell({
return ;
}
if (state === "failed") {
+ if (isMentioned) {
+ return (
+ handleRowToggle()}
+ aria-label="Remove from chat"
+ className="border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary"
+ />
+ );
+ }
return (
<>
@@ -652,7 +662,7 @@ export function DocumentsTableShell({
handleRowToggle()}
- aria-label={isMentioned ? "Remove from chat" : "Add to chat"}
+ aria-label="Add to chat"
className="border-foreground data-[state=checked]:bg-primary data-[state=checked]:border-primary"
/>