mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 01:06:23 +02:00
- Introduced a new utility for parsing .xlsx files into markdown format, enhancing the ability to process Excel documents natively. - Updated the Google Drive content extractor to utilize the new Excel parsing functionality, allowing for better handling of spreadsheet files. - Enhanced file type detection and export logic to support various document formats, improving overall content extraction accuracy. - Added unit tests to ensure the correctness of the new Excel parsing feature and its integration with existing content extraction workflows.
129 lines
3.9 KiB
Python
129 lines
3.9 KiB
Python
"""Unit tests for native Office file parsers (no DB, no external services)."""
|
|
|
|
import tempfile
|
|
|
|
import pytest
|
|
from openpyxl import Workbook
|
|
|
|
pytestmark = pytest.mark.unit
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _create_xlsx(sheets: dict[str, list[list]]) -> str:
    """Create a real .xlsx file on disk and return its path.

    ``sheets`` maps sheet name -> list of rows, where each row is a list of
    cell values. The first entry reuses the workbook's active sheet (renamed
    to the given name); every later entry is added as a new sheet. With an
    empty mapping the workbook is saved as created, without any rows added.

    The file is created with ``delete=False``, so the caller owns the
    returned path and is responsible for removing it.
    """
    wb = Workbook()
    try:
        for index, (name, rows) in enumerate(sheets.items()):
            if index == 0:
                # Rename the sheet the workbook already has instead of
                # leaving an extra, unnamed sheet in front of the data.
                ws = wb.active
                ws.title = name
            else:
                ws = wb.create_sheet(title=name)
            for row in rows:
                ws.append(row)

        # Reserve a unique path and release the handle before openpyxl
        # writes to it, so the handle is closed even when saving fails.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
            path = tmp.name
        wb.save(path)
    finally:
        wb.close()
    return path
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tracer bullet: cell values appear in markdown
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_parse_excel_produces_markdown_with_cell_values():
    """A single-sheet .xlsx with known data produces markdown containing those values."""
    from pathlib import Path

    from app.utils.office_parsers import parse_excel_to_markdown

    path = _create_xlsx(
        {"Sales": [["Product", "Revenue"], ["Widget", 1500], ["Gadget", 3200]]}
    )
    try:
        md = await parse_excel_to_markdown(path, filename="report.xlsx")
    finally:
        # The helper creates the file with delete=False, so remove it here
        # to avoid leaving a stray .xlsx in the temp directory per run.
        Path(path).unlink(missing_ok=True)

    # Header cells and data cells (numbers as their text form) are present.
    for expected in ("Product", "Revenue", "Widget", "1500", "Gadget", "3200"):
        assert expected in md

    # The filename passed to the parser shows up in the output.
    assert "report.xlsx" in md
    # A pipe character indicates the rows were rendered as a markdown table.
    assert "|" in md
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Multi-sheet workbooks include all sheets
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_parse_excel_includes_all_sheets():
    """Both sheet names and their data appear in the output."""
    from pathlib import Path

    from app.utils.office_parsers import parse_excel_to_markdown

    path = _create_xlsx(
        {
            "Inventory": [["Item", "Qty"], ["Bolts", 200]],
            "Pricing": [["Item", "Price"], ["Bolts", 4.50]],
        }
    )
    try:
        md = await parse_excel_to_markdown(path, filename="multi.xlsx")
    finally:
        # The helper creates the file with delete=False, so remove it here
        # to avoid leaving a stray .xlsx in the temp directory per run.
        Path(path).unlink(missing_ok=True)

    # Sheet names from both sheets must be present.
    assert "Inventory" in md
    assert "Pricing" in md

    # "200" only exists on the first sheet and "4.5" only on the second,
    # so together they show that rows from every sheet were emitted.
    assert "Bolts" in md
    assert "200" in md
    assert "4.5" in md
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Empty spreadsheet raises ValueError
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_parse_excel_raises_on_empty_file():
    """An .xlsx with no data raises ValueError."""
    from pathlib import Path

    from app.utils.office_parsers import parse_excel_to_markdown

    # Reserve a unique path; the context manager closes the handle right away
    # so only the path (not an open file object) outlives this statement.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        path = tmp.name

    try:
        # A freshly constructed workbook is saved without writing any cells.
        wb = Workbook()
        try:
            wb.save(path)
        finally:
            wb.close()

        with pytest.raises(ValueError, match="No data found"):
            await parse_excel_to_markdown(path, filename="empty.xlsx")
    finally:
        # delete=False means nothing else will remove the file for us.
        Path(path).unlink(missing_ok=True)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _parse_file_to_markdown routes .xlsx natively (no ETL call)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_parse_file_to_markdown_routes_xlsx_natively():
    """content_extractor._parse_file_to_markdown uses native parser for .xlsx."""
    from pathlib import Path

    from app.connectors.google_drive.content_extractor import _parse_file_to_markdown

    path = _create_xlsx(
        {"Data": [["Name", "Score"], ["Alice", 95], ["Bob", 82]]}
    )
    try:
        md = await _parse_file_to_markdown(path, "grades.xlsx")
    finally:
        # The helper creates the file with delete=False, so remove it here
        # to avoid leaving a stray .xlsx in the temp directory per run.
        Path(path).unlink(missing_ok=True)

    # Cell values from the workbook must survive the round trip to markdown.
    for expected in ("Alice", "95", "Bob", "82"):
        assert expected in md
|