feat: add native Excel parsing and improve Google Drive content extraction

- Introduced a new utility for parsing .xlsx files into Markdown format, enhancing the ability to process Excel documents natively.
- Updated the Google Drive content extractor to utilize the new Excel parsing functionality, allowing for better handling of spreadsheet files.
- Enhanced file type detection and export logic to support various document formats, improving overall content extraction accuracy.
- Added unit tests to ensure the correctness of the new Excel parsing feature and its integration with existing content extraction workflows.
2026-04-27 01:36:30 +02:00 · 2026-03-27 21:47:14 +05:30 · 2026-03-27 21:47:14 +05:30 · 3da0ffd683
commit 3da0ffd683
parent 4e0749f907
7 changed files with 390 additions and 61 deletions
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -1134,6 +1134,59 @@ async def process_file_in_background(
                )
                return None

+        elif filename.lower().endswith((".xlsx",)):
+            from app.utils.office_parsers import parse_excel_to_markdown
+
+            if notification:
+                await (
+                    NotificationService.document_processing.notify_processing_progress(
+                        session,
+                        notification,
+                        stage="parsing",
+                        stage_message="Parsing spreadsheet",
+                    )
+                )
+
+            await task_logger.log_task_progress(
+                log_entry,
+                f"Processing Excel file natively: {filename}",
+                {"file_type": "excel", "processing_stage": "native_parse"},
+            )
+
+            excel_markdown = await parse_excel_to_markdown(file_path, filename)
+
+            try:
+                os.unlink(file_path)
+            except Exception as e:
+                print("Error deleting temp file", e)
+
+            result = await add_received_markdown_file_document(
+                session, filename, excel_markdown, search_space_id, user_id, connector
+            )
+
+            if connector:
+                await _update_document_from_connector(result, connector, session)
+
+            if result:
+                await task_logger.log_task_success(
+                    log_entry,
+                    f"Successfully parsed and processed Excel file: {filename}",
+                    {
+                        "document_id": result.id,
+                        "content_hash": result.content_hash,
+                        "file_type": "excel",
+                        "etl_service": "NATIVE_EXCEL",
+                    },
+                )
+                return result
+            else:
+                await task_logger.log_task_success(
+                    log_entry,
+                    f"Excel file already exists (duplicate): {filename}",
+                    {"duplicate_detected": True, "file_type": "excel"},
+                )
+                return None
+
        else:
            # Import page limit service
            from app.services.page_limit_service import (
@@ -1797,6 +1850,31 @@ async def process_file_in_background_with_document(
            with contextlib.suppress(Exception):
                os.unlink(file_path)

+        elif filename.lower().endswith((".xlsx",)):
+            from app.utils.office_parsers import parse_excel_to_markdown
+
+            if notification:
+                await (
+                    NotificationService.document_processing.notify_processing_progress(
+                        session,
+                        notification,
+                        stage="parsing",
+                        stage_message="Parsing spreadsheet",
+                    )
+                )
+
+            await task_logger.log_task_progress(
+                log_entry,
+                f"Processing Excel file natively: {filename}",
+                {"file_type": "excel", "processing_stage": "native_parse"},
+            )
+
+            markdown_content = await parse_excel_to_markdown(file_path, filename)
+            etl_service = "NATIVE_EXCEL"
+
+            with contextlib.suppress(Exception):
+                os.unlink(file_path)
+
        else:
            # Document files - use ETL service
            from app.services.page_limit_service import (