mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-03 12:52:39 +02:00
fix: revert native excel parsing
This commit is contained in:
parent
dff8a1df37
commit
489e48644f
6 changed files with 0 additions and 289 deletions
|
|
@ -14,8 +14,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from app.db import Log
|
from app.db import Log
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
|
|
||||||
from app.utils.office_parsers import EXCEL_EXTENSIONS
|
|
||||||
|
|
||||||
from .client import GoogleDriveClient
|
from .client import GoogleDriveClient
|
||||||
from .file_types import (
|
from .file_types import (
|
||||||
get_export_mime_type,
|
get_export_mime_type,
|
||||||
|
|
@ -150,11 +148,6 @@ async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
|
||||||
raise ValueError("Transcription returned empty text")
|
raise ValueError("Transcription returned empty text")
|
||||||
return f"# Transcription of {filename}\n\n{text}"
|
return f"# Transcription of {filename}\n\n{text}"
|
||||||
|
|
||||||
if lower.endswith(EXCEL_EXTENSIONS):
|
|
||||||
from app.utils.office_parsers import parse_excel_to_markdown
|
|
||||||
|
|
||||||
return await parse_excel_to_markdown(file_path, filename)
|
|
||||||
|
|
||||||
# Document files -- use configured ETL service
|
# Document files -- use configured ETL service
|
||||||
from app.config import config as app_config
|
from app.config import config as app_config
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1134,59 +1134,6 @@ async def process_file_in_background(
|
||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
elif filename.lower().endswith((".xlsx",)):
|
|
||||||
from app.utils.office_parsers import parse_excel_to_markdown
|
|
||||||
|
|
||||||
if notification:
|
|
||||||
await (
|
|
||||||
NotificationService.document_processing.notify_processing_progress(
|
|
||||||
session,
|
|
||||||
notification,
|
|
||||||
stage="parsing",
|
|
||||||
stage_message="Parsing spreadsheet",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
await task_logger.log_task_progress(
|
|
||||||
log_entry,
|
|
||||||
f"Processing Excel file natively: {filename}",
|
|
||||||
{"file_type": "excel", "processing_stage": "native_parse"},
|
|
||||||
)
|
|
||||||
|
|
||||||
excel_markdown = await parse_excel_to_markdown(file_path, filename)
|
|
||||||
|
|
||||||
try:
|
|
||||||
os.unlink(file_path)
|
|
||||||
except Exception as e:
|
|
||||||
print("Error deleting temp file", e)
|
|
||||||
|
|
||||||
result = await add_received_markdown_file_document(
|
|
||||||
session, filename, excel_markdown, search_space_id, user_id, connector
|
|
||||||
)
|
|
||||||
|
|
||||||
if connector:
|
|
||||||
await _update_document_from_connector(result, connector, session)
|
|
||||||
|
|
||||||
if result:
|
|
||||||
await task_logger.log_task_success(
|
|
||||||
log_entry,
|
|
||||||
f"Successfully parsed and processed Excel file: {filename}",
|
|
||||||
{
|
|
||||||
"document_id": result.id,
|
|
||||||
"content_hash": result.content_hash,
|
|
||||||
"file_type": "excel",
|
|
||||||
"etl_service": "NATIVE_EXCEL",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
return result
|
|
||||||
else:
|
|
||||||
await task_logger.log_task_success(
|
|
||||||
log_entry,
|
|
||||||
f"Excel file already exists (duplicate): {filename}",
|
|
||||||
{"duplicate_detected": True, "file_type": "excel"},
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Import page limit service
|
# Import page limit service
|
||||||
from app.services.page_limit_service import (
|
from app.services.page_limit_service import (
|
||||||
|
|
@ -1850,31 +1797,6 @@ async def process_file_in_background_with_document(
|
||||||
with contextlib.suppress(Exception):
|
with contextlib.suppress(Exception):
|
||||||
os.unlink(file_path)
|
os.unlink(file_path)
|
||||||
|
|
||||||
elif filename.lower().endswith((".xlsx",)):
|
|
||||||
from app.utils.office_parsers import parse_excel_to_markdown
|
|
||||||
|
|
||||||
if notification:
|
|
||||||
await (
|
|
||||||
NotificationService.document_processing.notify_processing_progress(
|
|
||||||
session,
|
|
||||||
notification,
|
|
||||||
stage="parsing",
|
|
||||||
stage_message="Parsing spreadsheet",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
await task_logger.log_task_progress(
|
|
||||||
log_entry,
|
|
||||||
f"Processing Excel file natively: {filename}",
|
|
||||||
{"file_type": "excel", "processing_stage": "native_parse"},
|
|
||||||
)
|
|
||||||
|
|
||||||
markdown_content = await parse_excel_to_markdown(file_path, filename)
|
|
||||||
etl_service = "NATIVE_EXCEL"
|
|
||||||
|
|
||||||
with contextlib.suppress(Exception):
|
|
||||||
os.unlink(file_path)
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Document files - use ETL service
|
# Document files - use ETL service
|
||||||
from app.services.page_limit_service import (
|
from app.services.page_limit_service import (
|
||||||
|
|
|
||||||
|
|
@ -1,72 +0,0 @@
|
||||||
"""Native parsers for Office file formats."""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import logging
|
|
||||||
import threading
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
EXCEL_EXTENSIONS = (".xlsx",)
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_excel_sync(file_path: str) -> str:
|
|
||||||
"""Parse an .xlsx file into markdown tables (synchronous)."""
|
|
||||||
from openpyxl import load_workbook
|
|
||||||
|
|
||||||
wb = load_workbook(file_path, read_only=True, data_only=True)
|
|
||||||
markdown_parts: list[str] = []
|
|
||||||
|
|
||||||
for sheet_name in wb.sheetnames:
|
|
||||||
ws = wb[sheet_name]
|
|
||||||
rows = list(ws.iter_rows(values_only=True))
|
|
||||||
non_empty_rows = [r for r in rows if any(c is not None for c in r)]
|
|
||||||
if not non_empty_rows:
|
|
||||||
continue
|
|
||||||
|
|
||||||
markdown_parts.append(f"## {sheet_name}\n")
|
|
||||||
max_cols = max(len(row) for row in non_empty_rows)
|
|
||||||
|
|
||||||
header = non_empty_rows[0]
|
|
||||||
hdr = [str(c if c is not None else "") for c in header]
|
|
||||||
hdr.extend([""] * (max_cols - len(hdr)))
|
|
||||||
markdown_parts.append("| " + " | ".join(hdr) + " |")
|
|
||||||
markdown_parts.append("| " + " | ".join("---" for _ in range(max_cols)) + " |")
|
|
||||||
|
|
||||||
for row in non_empty_rows[1:]:
|
|
||||||
cells = [str(c if c is not None else "") for c in row]
|
|
||||||
cells.extend([""] * (max_cols - len(cells)))
|
|
||||||
markdown_parts.append("| " + " | ".join(cells) + " |")
|
|
||||||
|
|
||||||
markdown_parts.append("")
|
|
||||||
|
|
||||||
wb.close()
|
|
||||||
return "\n".join(markdown_parts)
|
|
||||||
|
|
||||||
|
|
||||||
async def parse_excel_to_markdown(file_path: str, filename: str = "") -> str:
|
|
||||||
"""Parse an .xlsx file into markdown tables (async wrapper).
|
|
||||||
|
|
||||||
Raises ``ValueError`` if no data is found in the workbook.
|
|
||||||
"""
|
|
||||||
t0 = time.monotonic()
|
|
||||||
logger.info(
|
|
||||||
"[excel-parse] START file=%s thread=%s",
|
|
||||||
filename,
|
|
||||||
threading.current_thread().name,
|
|
||||||
)
|
|
||||||
|
|
||||||
result = await asyncio.to_thread(_parse_excel_sync, file_path)
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
"[excel-parse] END file=%s elapsed=%.2fs",
|
|
||||||
filename,
|
|
||||||
time.monotonic() - t0,
|
|
||||||
)
|
|
||||||
|
|
||||||
if not result.strip():
|
|
||||||
raise ValueError(f"No data found in Excel file: {filename or file_path}")
|
|
||||||
|
|
||||||
title = f"# {filename}\n\n" if filename else ""
|
|
||||||
return title + result
|
|
||||||
|
|
@ -73,7 +73,6 @@ dependencies = [
|
||||||
"langchain-daytona>=0.0.2",
|
"langchain-daytona>=0.0.2",
|
||||||
"pypandoc>=1.16.2",
|
"pypandoc>=1.16.2",
|
||||||
"notion-markdown>=0.7.0",
|
"notion-markdown>=0.7.0",
|
||||||
"openpyxl>=3.1.5",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
|
|
|
||||||
|
|
@ -1,129 +0,0 @@
|
||||||
"""Unit tests for native Office file parsers (no DB, no external services)."""
|
|
||||||
|
|
||||||
import tempfile
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from openpyxl import Workbook
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.unit
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Helpers
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _create_xlsx(sheets: dict[str, list[list]]) -> str:
|
|
||||||
"""Create a real .xlsx file on disk and return its path.
|
|
||||||
|
|
||||||
``sheets`` maps sheet name -> list of rows, where each row is a list of
|
|
||||||
cell values.
|
|
||||||
"""
|
|
||||||
wb = Workbook()
|
|
||||||
first = True
|
|
||||||
for name, rows in sheets.items():
|
|
||||||
ws = wb.active if first else wb.create_sheet(title=name)
|
|
||||||
if first:
|
|
||||||
ws.title = name
|
|
||||||
first = False
|
|
||||||
for row in rows:
|
|
||||||
ws.append(row)
|
|
||||||
tmp = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
|
|
||||||
wb.save(tmp.name)
|
|
||||||
wb.close()
|
|
||||||
tmp.close()
|
|
||||||
return tmp.name
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Tracer bullet: cell values appear in markdown
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_parse_excel_produces_markdown_with_cell_values():
|
|
||||||
"""A single-sheet .xlsx with known data produces markdown containing those values."""
|
|
||||||
from app.utils.office_parsers import parse_excel_to_markdown
|
|
||||||
|
|
||||||
path = _create_xlsx(
|
|
||||||
{"Sales": [["Product", "Revenue"], ["Widget", 1500], ["Gadget", 3200]]}
|
|
||||||
)
|
|
||||||
|
|
||||||
md = await parse_excel_to_markdown(path, filename="report.xlsx")
|
|
||||||
|
|
||||||
assert "Product" in md
|
|
||||||
assert "Revenue" in md
|
|
||||||
assert "Widget" in md
|
|
||||||
assert "1500" in md
|
|
||||||
assert "Gadget" in md
|
|
||||||
assert "3200" in md
|
|
||||||
assert "report.xlsx" in md
|
|
||||||
assert "|" in md
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Multi-sheet workbooks include all sheets
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_parse_excel_includes_all_sheets():
|
|
||||||
"""Both sheet names and their data appear in the output."""
|
|
||||||
from app.utils.office_parsers import parse_excel_to_markdown
|
|
||||||
|
|
||||||
path = _create_xlsx(
|
|
||||||
{
|
|
||||||
"Inventory": [["Item", "Qty"], ["Bolts", 200]],
|
|
||||||
"Pricing": [["Item", "Price"], ["Bolts", 4.50]],
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
md = await parse_excel_to_markdown(path, filename="multi.xlsx")
|
|
||||||
|
|
||||||
assert "Inventory" in md
|
|
||||||
assert "Pricing" in md
|
|
||||||
assert "Bolts" in md
|
|
||||||
assert "200" in md
|
|
||||||
assert "4.5" in md
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Empty spreadsheet raises ValueError
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_parse_excel_raises_on_empty_file():
|
|
||||||
"""An .xlsx with no data raises ValueError."""
|
|
||||||
from app.utils.office_parsers import parse_excel_to_markdown
|
|
||||||
|
|
||||||
wb = Workbook()
|
|
||||||
tmp = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
|
|
||||||
wb.save(tmp.name)
|
|
||||||
wb.close()
|
|
||||||
tmp.close()
|
|
||||||
|
|
||||||
with pytest.raises(ValueError, match="No data found"):
|
|
||||||
await parse_excel_to_markdown(tmp.name, filename="empty.xlsx")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# _parse_file_to_markdown routes .xlsx natively (no ETL call)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_parse_file_to_markdown_routes_xlsx_natively():
|
|
||||||
"""content_extractor._parse_file_to_markdown uses native parser for .xlsx."""
|
|
||||||
from app.connectors.google_drive.content_extractor import _parse_file_to_markdown
|
|
||||||
|
|
||||||
path = _create_xlsx(
|
|
||||||
{"Data": [["Name", "Score"], ["Alice", 95], ["Bob", 82]]}
|
|
||||||
)
|
|
||||||
|
|
||||||
md = await _parse_file_to_markdown(path, "grades.xlsx")
|
|
||||||
|
|
||||||
assert "Alice" in md
|
|
||||||
assert "95" in md
|
|
||||||
assert "Bob" in md
|
|
||||||
assert "82" in md
|
|
||||||
2
surfsense_backend/uv.lock
generated
2
surfsense_backend/uv.lock
generated
|
|
@ -7919,7 +7919,6 @@ dependencies = [
|
||||||
{ name = "notion-client" },
|
{ name = "notion-client" },
|
||||||
{ name = "notion-markdown" },
|
{ name = "notion-markdown" },
|
||||||
{ name = "numpy" },
|
{ name = "numpy" },
|
||||||
{ name = "openpyxl" },
|
|
||||||
{ name = "pgvector" },
|
{ name = "pgvector" },
|
||||||
{ name = "playwright" },
|
{ name = "playwright" },
|
||||||
{ name = "psycopg", extra = ["binary", "pool"] },
|
{ name = "psycopg", extra = ["binary", "pool"] },
|
||||||
|
|
@ -8002,7 +8001,6 @@ requires-dist = [
|
||||||
{ name = "notion-client", specifier = ">=2.3.0" },
|
{ name = "notion-client", specifier = ">=2.3.0" },
|
||||||
{ name = "notion-markdown", specifier = ">=0.7.0" },
|
{ name = "notion-markdown", specifier = ">=0.7.0" },
|
||||||
{ name = "numpy", specifier = ">=1.24.0" },
|
{ name = "numpy", specifier = ">=1.24.0" },
|
||||||
{ name = "openpyxl", specifier = ">=3.1.5" },
|
|
||||||
{ name = "pgvector", specifier = ">=0.3.6" },
|
{ name = "pgvector", specifier = ">=0.3.6" },
|
||||||
{ name = "playwright", specifier = ">=1.50.0" },
|
{ name = "playwright", specifier = ">=1.50.0" },
|
||||||
{ name = "psycopg", extras = ["binary", "pool"], specifier = ">=3.3.2" },
|
{ name = "psycopg", extras = ["binary", "pool"], specifier = ">=3.3.2" },
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue