mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-15 18:25:18 +02:00
fix: revert native excel parsing
This commit is contained in:
parent
dff8a1df37
commit
489e48644f
6 changed files with 0 additions and 289 deletions
|
|
@ -14,8 +14,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
from app.db import Log
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
|
||||
from app.utils.office_parsers import EXCEL_EXTENSIONS
|
||||
|
||||
from .client import GoogleDriveClient
|
||||
from .file_types import (
|
||||
get_export_mime_type,
|
||||
|
|
@ -150,11 +148,6 @@ async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
|
|||
raise ValueError("Transcription returned empty text")
|
||||
return f"# Transcription of {filename}\n\n{text}"
|
||||
|
||||
if lower.endswith(EXCEL_EXTENSIONS):
|
||||
from app.utils.office_parsers import parse_excel_to_markdown
|
||||
|
||||
return await parse_excel_to_markdown(file_path, filename)
|
||||
|
||||
# Document files -- use configured ETL service
|
||||
from app.config import config as app_config
|
||||
|
||||
|
|
|
|||
|
|
@ -1134,59 +1134,6 @@ async def process_file_in_background(
|
|||
)
|
||||
return None
|
||||
|
||||
elif filename.lower().endswith((".xlsx",)):
|
||||
from app.utils.office_parsers import parse_excel_to_markdown
|
||||
|
||||
if notification:
|
||||
await (
|
||||
NotificationService.document_processing.notify_processing_progress(
|
||||
session,
|
||||
notification,
|
||||
stage="parsing",
|
||||
stage_message="Parsing spreadsheet",
|
||||
)
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Processing Excel file natively: {filename}",
|
||||
{"file_type": "excel", "processing_stage": "native_parse"},
|
||||
)
|
||||
|
||||
excel_markdown = await parse_excel_to_markdown(file_path, filename)
|
||||
|
||||
try:
|
||||
os.unlink(file_path)
|
||||
except Exception as e:
|
||||
print("Error deleting temp file", e)
|
||||
|
||||
result = await add_received_markdown_file_document(
|
||||
session, filename, excel_markdown, search_space_id, user_id, connector
|
||||
)
|
||||
|
||||
if connector:
|
||||
await _update_document_from_connector(result, connector, session)
|
||||
|
||||
if result:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully parsed and processed Excel file: {filename}",
|
||||
{
|
||||
"document_id": result.id,
|
||||
"content_hash": result.content_hash,
|
||||
"file_type": "excel",
|
||||
"etl_service": "NATIVE_EXCEL",
|
||||
},
|
||||
)
|
||||
return result
|
||||
else:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Excel file already exists (duplicate): {filename}",
|
||||
{"duplicate_detected": True, "file_type": "excel"},
|
||||
)
|
||||
return None
|
||||
|
||||
else:
|
||||
# Import page limit service
|
||||
from app.services.page_limit_service import (
|
||||
|
|
@ -1850,31 +1797,6 @@ async def process_file_in_background_with_document(
|
|||
with contextlib.suppress(Exception):
|
||||
os.unlink(file_path)
|
||||
|
||||
elif filename.lower().endswith((".xlsx",)):
|
||||
from app.utils.office_parsers import parse_excel_to_markdown
|
||||
|
||||
if notification:
|
||||
await (
|
||||
NotificationService.document_processing.notify_processing_progress(
|
||||
session,
|
||||
notification,
|
||||
stage="parsing",
|
||||
stage_message="Parsing spreadsheet",
|
||||
)
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Processing Excel file natively: {filename}",
|
||||
{"file_type": "excel", "processing_stage": "native_parse"},
|
||||
)
|
||||
|
||||
markdown_content = await parse_excel_to_markdown(file_path, filename)
|
||||
etl_service = "NATIVE_EXCEL"
|
||||
|
||||
with contextlib.suppress(Exception):
|
||||
os.unlink(file_path)
|
||||
|
||||
else:
|
||||
# Document files - use ETL service
|
||||
from app.services.page_limit_service import (
|
||||
|
|
|
|||
|
|
@ -1,72 +0,0 @@
|
|||
"""Native parsers for Office file formats."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
EXCEL_EXTENSIONS = (".xlsx",)
|
||||
|
||||
|
||||
def _parse_excel_sync(file_path: str) -> str:
    """Parse an .xlsx file into markdown tables (synchronous).

    Every sheet that contains at least one non-empty row is rendered as a
    ``## <sheet name>`` heading followed by a markdown table. The first
    non-empty row becomes the table header, and shorter rows are padded with
    empty cells up to the width of the widest row. Sheets without any data
    are skipped.

    Args:
        file_path: Path of the workbook on disk.

    Returns:
        The markdown of all non-empty sheets joined by newlines, or an empty
        string when no sheet holds any data.
    """
    from openpyxl import load_workbook

    def _cell_text(value: object) -> str:
        """Render one cell value so it cannot break the table layout."""
        if value is None:
            return ""
        # A raw pipe would open a new column and a raw line break would end
        # the table row, so both are neutralised before joining the cells.
        text = str(value).replace("|", "\\|")
        return text.replace("\r\n", "<br>").replace("\n", "<br>").replace("\r", "<br>")

    def _table_row(values, width: int) -> str:
        """Format one row, padded with empty cells to ``width`` columns."""
        cells = [_cell_text(v) for v in values]
        cells.extend([""] * (width - len(cells)))
        return "| " + " | ".join(cells) + " |"

    wb = load_workbook(file_path, read_only=True, data_only=True)
    markdown_parts: list[str] = []

    try:
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            non_empty_rows = [
                row
                for row in ws.iter_rows(values_only=True)
                if any(cell is not None for cell in row)
            ]
            if not non_empty_rows:
                continue

            markdown_parts.append(f"## {sheet_name}\n")
            max_cols = max(len(row) for row in non_empty_rows)

            markdown_parts.append(_table_row(non_empty_rows[0], max_cols))
            markdown_parts.append("| " + " | ".join(["---"] * max_cols) + " |")
            markdown_parts.extend(
                _table_row(row, max_cols) for row in non_empty_rows[1:]
            )

            # Blank entry separates this sheet's table from the next heading.
            markdown_parts.append("")
    finally:
        # A read-only workbook must be closed explicitly; doing it in
        # ``finally`` releases the file even when a sheet fails to parse.
        wb.close()

    return "\n".join(markdown_parts)
|
||||
|
||||
|
||||
async def parse_excel_to_markdown(file_path: str, filename: str = "") -> str:
    """Convert an .xlsx workbook to markdown without blocking the event loop.

    The synchronous parser runs in a worker thread via ``asyncio.to_thread``.
    The start of the parse and its elapsed time are logged.

    Args:
        file_path: Path of the workbook on disk.
        filename: Display name used in log lines, in the error message and,
            when non-empty, as a ``# <filename>`` title above the tables.

    Returns:
        The markdown tables, prefixed with the title when ``filename`` is set.

    Raises:
        ValueError: If the parsed output is empty or whitespace only.
    """
    started_at = time.monotonic()
    logger.info(
        "[excel-parse] START file=%s thread=%s",
        filename,
        threading.current_thread().name,
    )

    tables_markdown = await asyncio.to_thread(_parse_excel_sync, file_path)

    elapsed = time.monotonic() - started_at
    logger.info("[excel-parse] END file=%s elapsed=%.2fs", filename, elapsed)

    if tables_markdown.strip():
        if filename:
            return f"# {filename}\n\n" + tables_markdown
        return tables_markdown

    raise ValueError(f"No data found in Excel file: {filename or file_path}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue