feat(connectors): add connector parameter to file processor for source tracking

- Add optional 'connector' parameter with 'type' and 'metadata' fields
- Create helper function _update_document_from_connector
- Use document_metadata column (not metadata) for JSON field
- Merge connector metadata into existing metadata using dict unpacking
- Google Drive documents now marked as GOOGLE_DRIVE_CONNECTOR
- Backward compatible - no changes to existing logic
- Simple and clean implementation
2026-04-26 09:16:22 +02:00 · 2025-12-28 18:01:39 +02:00 · 2025-12-28 18:01:39 +02:00 · a5935bc677
commit a5935bc677
parent 8da58be9e0
3 changed files with 60 additions and 71 deletions
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -447,6 +447,24 @@ async def add_received_file_document_using_docling(
        ) from e


+async def _update_document_from_connector(
+    document: Document | None, connector: dict | None, session: AsyncSession
+) -> None:
+    """Copy connector type/metadata onto the document, then commit.
+
+    No-op if document or connector is falsy; connector metadata keys win.
+    """
+    if not document or not connector:
+        return
+    if "type" in connector:
+        document.document_type = connector["type"]
+    if "metadata" in connector:
+        # Fresh dict: avoids aliasing the caller's dict; None counts as empty.
+        existing = document.document_metadata or {}
+        document.document_metadata = {**existing, **(connector["metadata"] or {})}
+    await session.commit()
+
+
 async def process_file_in_background(
    file_path: str,
    filename: str,
@@ -455,6 +473,7 @@ async def process_file_in_background(
    session: AsyncSession,
    task_logger: TaskLoggingService,
    log_entry: Log,
+    connector: dict | None = None,  # Optional: {"type": "GOOGLE_DRIVE_CONNECTOR", "metadata": {...}}
 ):
    try:
        # Check if the file is a markdown or text file
@@ -492,6 +511,9 @@ async def process_file_in_background(
                session, filename, markdown_content, search_space_id, user_id
            )

+            # Update from connector if provided
+            await _update_document_from_connector(result, connector, session)
+
            if result:
                await task_logger.log_task_success(
                    log_entry,
@@ -608,6 +630,9 @@ async def process_file_in_background(
                session, filename, transcribed_text, search_space_id, user_id
            )

+            # Update from connector if provided
+            await _update_document_from_connector(result, connector, session)
+
            if result:
                await task_logger.log_task_success(
                    log_entry,
@@ -753,6 +778,9 @@ async def process_file_in_background(
                    session, filename, docs, search_space_id, user_id
                )

+                # Update from connector if provided
+                await _update_document_from_connector(result, connector, session)
+
                if result:
                    # Update page usage after successful processing
                    # allow_exceed=True because document was already created after passing initial check
@@ -897,6 +925,9 @@ async def process_file_in_background(
                        user_id, final_page_count, allow_exceed=True
                    )

+                    # Update from connector if provided
+                    await _update_document_from_connector(last_created_doc, connector, session)
+
                    await task_logger.log_task_success(
                        log_entry,
                        f"Successfully processed file with LlamaCloud: {filename}",
@@ -1021,6 +1052,9 @@ async def process_file_in_background(
                        user_id, final_page_count, allow_exceed=True
                    )

+                    # Update from connector if provided
+                    await _update_document_from_connector(doc_result, connector, session)
+
                    await task_logger.log_task_success(
                        log_entry,
                        f"Successfully processed file with Docling: {filename}",