fix: revert native excel parsing

2026-05-02 12:22:40 +02:00 · 2026-03-27 22:15:24 +05:30 · 2026-03-27 22:15:24 +05:30 · 489e48644f
commit 489e48644f
parent dff8a1df37
6 changed files with 0 additions and 289 deletions
--- a/surfsense_backend/app/connectors/google_drive/content_extractor.py
+++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py
@ -14,8 +14,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
 from app.db import Log
 from app.services.task_logging_service import TaskLoggingService

-from app.utils.office_parsers import EXCEL_EXTENSIONS
-
 from .client import GoogleDriveClient
 from .file_types import (
    get_export_mime_type,
@ -150,11 +148,6 @@ async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
            raise ValueError("Transcription returned empty text")
        return f"# Transcription of {filename}\n\n{text}"

-    if lower.endswith(EXCEL_EXTENSIONS):
-        from app.utils.office_parsers import parse_excel_to_markdown
-
-        return await parse_excel_to_markdown(file_path, filename)
-
    # Document files -- use configured ETL service
    from app.config import config as app_config

--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@ -1134,59 +1134,6 @@ async def process_file_in_background(
                )
                return None

-        elif filename.lower().endswith((".xlsx",)):
-            from app.utils.office_parsers import parse_excel_to_markdown
-
-            if notification:
-                await (
-                    NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Parsing spreadsheet",
-                    )
-                )
-
-            await task_logger.log_task_progress(
-                log_entry,
-                f"Processing Excel file natively: {filename}",
-                {"file_type": "excel", "processing_stage": "native_parse"},
-            )
-
-            excel_markdown = await parse_excel_to_markdown(file_path, filename)
-
-            try:
-                os.unlink(file_path)
-            except Exception as e:
-                print("Error deleting temp file", e)
-
-            result = await add_received_markdown_file_document(
-                session, filename, excel_markdown, search_space_id, user_id, connector
-            )
-
-            if connector:
-                await _update_document_from_connector(result, connector, session)
-
-            if result:
-                await task_logger.log_task_success(
-                    log_entry,
-                    f"Successfully parsed and processed Excel file: {filename}",
-                    {
-                        "document_id": result.id,
-                        "content_hash": result.content_hash,
-                        "file_type": "excel",
-                        "etl_service": "NATIVE_EXCEL",
-                    },
-                )
-                return result
-            else:
-                await task_logger.log_task_success(
-                    log_entry,
-                    f"Excel file already exists (duplicate): {filename}",
-                    {"duplicate_detected": True, "file_type": "excel"},
-                )
-                return None
-
        else:
            # Import page limit service
            from app.services.page_limit_service import (
@ -1850,31 +1797,6 @@ async def process_file_in_background_with_document(
            with contextlib.suppress(Exception):
                os.unlink(file_path)

-        elif filename.lower().endswith((".xlsx",)):
-            from app.utils.office_parsers import parse_excel_to_markdown
-
-            if notification:
-                await (
-                    NotificationService.document_processing.notify_processing_progress(
-                        session,
-                        notification,
-                        stage="parsing",
-                        stage_message="Parsing spreadsheet",
-                    )
-                )
-
-            await task_logger.log_task_progress(
-                log_entry,
-                f"Processing Excel file natively: {filename}",
-                {"file_type": "excel", "processing_stage": "native_parse"},
-            )
-
-            markdown_content = await parse_excel_to_markdown(file_path, filename)
-            etl_service = "NATIVE_EXCEL"
-
-            with contextlib.suppress(Exception):
-                os.unlink(file_path)
-
        else:
            # Document files - use ETL service
            from app.services.page_limit_service import (
--- a/surfsense_backend/app/utils/office_parsers.py
+++ b/surfsense_backend/app/utils/office_parsers.py
@ -1,72 +0,0 @@
-"""Native parsers for Office file formats."""
-
-import asyncio
-import logging
-import threading
-import time
-from pathlib import Path
-
-logger = logging.getLogger(__name__)
-
-EXCEL_EXTENSIONS = (".xlsx",)
-
-
-def _parse_excel_sync(file_path: str) -> str:
-    """Parse an .xlsx file into markdown tables (synchronous)."""
-    from openpyxl import load_workbook
-
-    wb = load_workbook(file_path, read_only=True, data_only=True)
-    markdown_parts: list[str] = []
-
-    for sheet_name in wb.sheetnames:
-        ws = wb[sheet_name]
-        rows = list(ws.iter_rows(values_only=True))
-        non_empty_rows = [r for r in rows if any(c is not None for c in r)]
-        if not non_empty_rows:
-            continue
-
-        markdown_parts.append(f"## {sheet_name}\n")
-        max_cols = max(len(row) for row in non_empty_rows)
-
-        header = non_empty_rows[0]
-        hdr = [str(c if c is not None else "") for c in header]
-        hdr.extend([""] * (max_cols - len(hdr)))
-        markdown_parts.append("| " + " | ".join(hdr) + " |")
-        markdown_parts.append("| " + " | ".join("---" for _ in range(max_cols)) + " |")
-
-        for row in non_empty_rows[1:]:
-            cells = [str(c if c is not None else "") for c in row]
-            cells.extend([""] * (max_cols - len(cells)))
-            markdown_parts.append("| " + " | ".join(cells) + " |")
-
-        markdown_parts.append("")
-
-    wb.close()
-    return "\n".join(markdown_parts)
-
-
-async def parse_excel_to_markdown(file_path: str, filename: str = "") -> str:
-    """Parse an .xlsx file into markdown tables (async wrapper).
-
-    Raises ``ValueError`` if no data is found in the workbook.
-    """
-    t0 = time.monotonic()
-    logger.info(
-        "[excel-parse] START file=%s thread=%s",
-        filename,
-        threading.current_thread().name,
-    )
-
-    result = await asyncio.to_thread(_parse_excel_sync, file_path)
-
-    logger.info(
-        "[excel-parse] END file=%s elapsed=%.2fs",
-        filename,
-        time.monotonic() - t0,
-    )
-
-    if not result.strip():
-        raise ValueError(f"No data found in Excel file: {filename or file_path}")
-
-    title = f"# {filename}\n\n" if filename else ""
-    return title + result
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@ -73,7 +73,6 @@ dependencies = [
    "langchain-daytona>=0.0.2",
    "pypandoc>=1.16.2",
    "notion-markdown>=0.7.0",
-    "openpyxl>=3.1.5",
 ]

 [dependency-groups]
--- a/surfsense_backend/tests/unit/test_office_parsers.py
+++ b/surfsense_backend/tests/unit/test_office_parsers.py
@ -1,129 +0,0 @@
-"""Unit tests for native Office file parsers (no DB, no external services)."""
-
-import tempfile
-
-import pytest
-from openpyxl import Workbook
-
-pytestmark = pytest.mark.unit
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def _create_xlsx(sheets: dict[str, list[list]]) -> str:
-    """Create a real .xlsx file on disk and return its path.
-
-    ``sheets`` maps sheet name -> list of rows, where each row is a list of
-    cell values.
-    """
-    wb = Workbook()
-    first = True
-    for name, rows in sheets.items():
-        ws = wb.active if first else wb.create_sheet(title=name)
-        if first:
-            ws.title = name
-            first = False
-        for row in rows:
-            ws.append(row)
-    tmp = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
-    wb.save(tmp.name)
-    wb.close()
-    tmp.close()
-    return tmp.name
-
-
-# ---------------------------------------------------------------------------
-# Tracer bullet: cell values appear in markdown
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_parse_excel_produces_markdown_with_cell_values():
-    """A single-sheet .xlsx with known data produces markdown containing those values."""
-    from app.utils.office_parsers import parse_excel_to_markdown
-
-    path = _create_xlsx(
-        {"Sales": [["Product", "Revenue"], ["Widget", 1500], ["Gadget", 3200]]}
-    )
-
-    md = await parse_excel_to_markdown(path, filename="report.xlsx")
-
-    assert "Product" in md
-    assert "Revenue" in md
-    assert "Widget" in md
-    assert "1500" in md
-    assert "Gadget" in md
-    assert "3200" in md
-    assert "report.xlsx" in md
-    assert "|" in md
-
-
-# ---------------------------------------------------------------------------
-# Multi-sheet workbooks include all sheets
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_parse_excel_includes_all_sheets():
-    """Both sheet names and their data appear in the output."""
-    from app.utils.office_parsers import parse_excel_to_markdown
-
-    path = _create_xlsx(
-        {
-            "Inventory": [["Item", "Qty"], ["Bolts", 200]],
-            "Pricing": [["Item", "Price"], ["Bolts", 4.50]],
-        }
-    )
-
-    md = await parse_excel_to_markdown(path, filename="multi.xlsx")
-
-    assert "Inventory" in md
-    assert "Pricing" in md
-    assert "Bolts" in md
-    assert "200" in md
-    assert "4.5" in md
-
-
-# ---------------------------------------------------------------------------
-# Empty spreadsheet raises ValueError
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_parse_excel_raises_on_empty_file():
-    """An .xlsx with no data raises ValueError."""
-    from app.utils.office_parsers import parse_excel_to_markdown
-
-    wb = Workbook()
-    tmp = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
-    wb.save(tmp.name)
-    wb.close()
-    tmp.close()
-
-    with pytest.raises(ValueError, match="No data found"):
-        await parse_excel_to_markdown(tmp.name, filename="empty.xlsx")
-
-
-# ---------------------------------------------------------------------------
-# _parse_file_to_markdown routes .xlsx natively (no ETL call)
-# ---------------------------------------------------------------------------
-
-
-@pytest.mark.asyncio
-async def test_parse_file_to_markdown_routes_xlsx_natively():
-    """content_extractor._parse_file_to_markdown uses native parser for .xlsx."""
-    from app.connectors.google_drive.content_extractor import _parse_file_to_markdown
-
-    path = _create_xlsx(
-        {"Data": [["Name", "Score"], ["Alice", 95], ["Bob", 82]]}
-    )
-
-    md = await _parse_file_to_markdown(path, "grades.xlsx")
-
-    assert "Alice" in md
-    assert "95" in md
-    assert "Bob" in md
-    assert "82" in md
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock
@ -7919,7 +7919,6 @@ dependencies = [
    { name = "notion-client" },
    { name = "notion-markdown" },
    { name = "numpy" },
-    { name = "openpyxl" },
    { name = "pgvector" },
    { name = "playwright" },
    { name = "psycopg", extra = ["binary", "pool"] },
@ -8002,7 +8001,6 @@ requires-dist = [
    { name = "notion-client", specifier = ">=2.3.0" },
    { name = "notion-markdown", specifier = ">=0.7.0" },
    { name = "numpy", specifier = ">=1.24.0" },
-    { name = "openpyxl", specifier = ">=3.1.5" },
    { name = "pgvector", specifier = ">=0.3.6" },
    { name = "playwright", specifier = ">=1.50.0" },
    { name = "psycopg", extras = ["binary", "pool"], specifier = ">=3.3.2" },