diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 272a71403..de8e16156 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -14,8 +14,6 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.db import Log from app.services.task_logging_service import TaskLoggingService -from app.utils.office_parsers import EXCEL_EXTENSIONS - from .client import GoogleDriveClient from .file_types import ( get_export_mime_type, @@ -150,11 +148,6 @@ async def _parse_file_to_markdown(file_path: str, filename: str) -> str: raise ValueError("Transcription returned empty text") return f"# Transcription of {filename}\n\n{text}" - if lower.endswith(EXCEL_EXTENSIONS): - from app.utils.office_parsers import parse_excel_to_markdown - - return await parse_excel_to_markdown(file_path, filename) - # Document files -- use configured ETL service from app.config import config as app_config diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index c69c6fa95..6c0ae1870 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -1134,59 +1134,6 @@ async def process_file_in_background( ) return None - elif filename.lower().endswith((".xlsx",)): - from app.utils.office_parsers import parse_excel_to_markdown - - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Parsing spreadsheet", - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing Excel file natively: {filename}", - {"file_type": "excel", "processing_stage": "native_parse"}, - ) - - excel_markdown = await parse_excel_to_markdown(file_path, filename) - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - - result = await add_received_markdown_file_document( - session, filename, excel_markdown, search_space_id, user_id, connector - ) - - if connector: - await _update_document_from_connector(result, connector, session) - - if result: - await task_logger.log_task_success( - log_entry, - f"Successfully parsed and processed Excel file: {filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "excel", - "etl_service": "NATIVE_EXCEL", - }, - ) - return result - else: - await task_logger.log_task_success( - log_entry, - f"Excel file already exists (duplicate): {filename}", - {"duplicate_detected": True, "file_type": "excel"}, - ) - return None - else: # Import page limit service from app.services.page_limit_service import ( @@ -1850,31 +1797,6 @@ async def process_file_in_background_with_document( with contextlib.suppress(Exception): os.unlink(file_path) - elif filename.lower().endswith((".xlsx",)): - from app.utils.office_parsers import parse_excel_to_markdown - - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Parsing spreadsheet", - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing Excel file natively: {filename}", - {"file_type": "excel", "processing_stage": "native_parse"}, - ) - - markdown_content = await parse_excel_to_markdown(file_path, filename) - etl_service = "NATIVE_EXCEL" - - with contextlib.suppress(Exception): - os.unlink(file_path) - else: # Document files - use ETL service from app.services.page_limit_service import ( diff --git a/surfsense_backend/app/utils/office_parsers.py b/surfsense_backend/app/utils/office_parsers.py deleted file mode 100644 index a1550e110..000000000 --- a/surfsense_backend/app/utils/office_parsers.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Native parsers for Office file formats.""" - -import asyncio -import logging -import threading -import time -from pathlib import Path - -logger = logging.getLogger(__name__) - -EXCEL_EXTENSIONS = (".xlsx",) - - -def _parse_excel_sync(file_path: str) -> str: - """Parse an .xlsx file into markdown tables (synchronous).""" - from openpyxl import load_workbook - - wb = load_workbook(file_path, read_only=True, data_only=True) - markdown_parts: list[str] = [] - - for sheet_name in wb.sheetnames: - ws = wb[sheet_name] - rows = list(ws.iter_rows(values_only=True)) - non_empty_rows = [r for r in rows if any(c is not None for c in r)] - if not non_empty_rows: - continue - - markdown_parts.append(f"## {sheet_name}\n") - max_cols = max(len(row) for row in non_empty_rows) - - header = non_empty_rows[0] - hdr = [str(c if c is not None else "") for c in header] - hdr.extend([""] * (max_cols - len(hdr))) - markdown_parts.append("| " + " | ".join(hdr) + " |") - markdown_parts.append("| " + " | ".join("---" for _ in range(max_cols)) + " |") - - for row in non_empty_rows[1:]: - cells = [str(c if c is not None else "") for c in row] - cells.extend([""] * (max_cols - len(cells))) - markdown_parts.append("| " + " | ".join(cells) + " |") - - markdown_parts.append("") - - wb.close() - return "\n".join(markdown_parts) - - -async def parse_excel_to_markdown(file_path: str, filename: str = "") -> str: - """Parse an .xlsx file into markdown tables (async wrapper). - - Raises ``ValueError`` if no data is found in the workbook. - """ - t0 = time.monotonic() - logger.info( - "[excel-parse] START file=%s thread=%s", - filename, - threading.current_thread().name, - ) - - result = await asyncio.to_thread(_parse_excel_sync, file_path) - - logger.info( - "[excel-parse] END file=%s elapsed=%.2fs", - filename, - time.monotonic() - t0, - ) - - if not result.strip(): - raise ValueError(f"No data found in Excel file: {filename or file_path}") - - title = f"# {filename}\n\n" if filename else "" - return title + result diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index 724e6db4c..017994c75 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -73,7 +73,6 @@ dependencies = [ "langchain-daytona>=0.0.2", "pypandoc>=1.16.2", "notion-markdown>=0.7.0", - "openpyxl>=3.1.5", ] [dependency-groups] diff --git a/surfsense_backend/tests/unit/test_office_parsers.py b/surfsense_backend/tests/unit/test_office_parsers.py deleted file mode 100644 index 11429a71d..000000000 --- a/surfsense_backend/tests/unit/test_office_parsers.py +++ /dev/null @@ -1,129 +0,0 @@ -"""Unit tests for native Office file parsers (no DB, no external services).""" - -import tempfile - -import pytest -from openpyxl import Workbook - -pytestmark = pytest.mark.unit - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _create_xlsx(sheets: dict[str, list[list]]) -> str: - """Create a real .xlsx file on disk and return its path. - - ``sheets`` maps sheet name -> list of rows, where each row is a list of - cell values. - """ - wb = Workbook() - first = True - for name, rows in sheets.items(): - ws = wb.active if first else wb.create_sheet(title=name) - if first: - ws.title = name - first = False - for row in rows: - ws.append(row) - tmp = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) - wb.save(tmp.name) - wb.close() - tmp.close() - return tmp.name - - -# --------------------------------------------------------------------------- -# Tracer bullet: cell values appear in markdown -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_parse_excel_produces_markdown_with_cell_values(): - """A single-sheet .xlsx with known data produces markdown containing those values.""" - from app.utils.office_parsers import parse_excel_to_markdown - - path = _create_xlsx( - {"Sales": [["Product", "Revenue"], ["Widget", 1500], ["Gadget", 3200]]} - ) - - md = await parse_excel_to_markdown(path, filename="report.xlsx") - - assert "Product" in md - assert "Revenue" in md - assert "Widget" in md - assert "1500" in md - assert "Gadget" in md - assert "3200" in md - assert "report.xlsx" in md - assert "|" in md - - -# --------------------------------------------------------------------------- -# Multi-sheet workbooks include all sheets -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_parse_excel_includes_all_sheets(): - """Both sheet names and their data appear in the output.""" - from app.utils.office_parsers import parse_excel_to_markdown - - path = _create_xlsx( - { - "Inventory": [["Item", "Qty"], ["Bolts", 200]], - "Pricing": [["Item", "Price"], ["Bolts", 4.50]], - } - ) - - md = await parse_excel_to_markdown(path, filename="multi.xlsx") - - assert "Inventory" in md - assert "Pricing" in md - assert "Bolts" in md - assert "200" in md - assert "4.5" in md - - -# --------------------------------------------------------------------------- -# Empty spreadsheet raises ValueError -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_parse_excel_raises_on_empty_file(): - """An .xlsx with no data raises ValueError.""" - from app.utils.office_parsers import parse_excel_to_markdown - - wb = Workbook() - tmp = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) - wb.save(tmp.name) - wb.close() - tmp.close() - - with pytest.raises(ValueError, match="No data found"): - await parse_excel_to_markdown(tmp.name, filename="empty.xlsx") - - -# --------------------------------------------------------------------------- -# _parse_file_to_markdown routes .xlsx natively (no ETL call) -# --------------------------------------------------------------------------- - - -@pytest.mark.asyncio -async def test_parse_file_to_markdown_routes_xlsx_natively(): - """content_extractor._parse_file_to_markdown uses native parser for .xlsx.""" - from app.connectors.google_drive.content_extractor import _parse_file_to_markdown - - path = _create_xlsx( - {"Data": [["Name", "Score"], ["Alice", 95], ["Bob", 82]]} - ) - - md = await _parse_file_to_markdown(path, "grades.xlsx") - - assert "Alice" in md - assert "95" in md - assert "Bob" in md - assert "82" in md diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index e4d148b50..82ae4cc16 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -7919,7 +7919,6 @@ dependencies = [ { name = "notion-client" }, { name = "notion-markdown" }, { name = "numpy" }, - { name = "openpyxl" }, { name = "pgvector" }, { name = "playwright" }, { name = "psycopg", extra = ["binary", "pool"] }, @@ -8002,7 +8001,6 @@ requires-dist = [ { name = "notion-client", specifier = ">=2.3.0" }, { name = "notion-markdown", specifier = ">=0.7.0" }, { name = "numpy", specifier = ">=1.24.0" }, - { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pgvector", specifier = ">=0.3.6" }, { name = "playwright", specifier = ">=1.50.0" }, { name = "psycopg", extras = ["binary", "pool"], specifier = ">=3.3.2" },