feat: add native Excel parsing and improve Google Drive content extraction

- Introduced a new utility for parsing .xlsx files into Markdown, so that Excel documents are processed natively instead of falling through to the generic ETL-service path.
- Updated the Google Drive content extractor to utilize the new Excel parsing functionality, allowing for better handling of spreadsheet files.
- Enhanced file type detection and export logic to support various document formats, improving overall content extraction accuracy.
- Added unit tests to ensure the correctness of the new Excel parsing feature and its integration with existing content extraction workflows.
This commit is contained in:
Anish Sarkar 2026-03-27 21:47:14 +05:30
parent 4e0749f907
commit 3da0ffd683
7 changed files with 390 additions and 61 deletions

View file

@ -1134,6 +1134,59 @@ async def process_file_in_background(
)
return None
elif filename.lower().endswith((".xlsx",)):
from app.utils.office_parsers import parse_excel_to_markdown
if notification:
await (
NotificationService.document_processing.notify_processing_progress(
session,
notification,
stage="parsing",
stage_message="Parsing spreadsheet",
)
)
await task_logger.log_task_progress(
log_entry,
f"Processing Excel file natively: {filename}",
{"file_type": "excel", "processing_stage": "native_parse"},
)
excel_markdown = await parse_excel_to_markdown(file_path, filename)
try:
os.unlink(file_path)
except Exception as e:
print("Error deleting temp file", e)
result = await add_received_markdown_file_document(
session, filename, excel_markdown, search_space_id, user_id, connector
)
if connector:
await _update_document_from_connector(result, connector, session)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully parsed and processed Excel file: {filename}",
{
"document_id": result.id,
"content_hash": result.content_hash,
"file_type": "excel",
"etl_service": "NATIVE_EXCEL",
},
)
return result
else:
await task_logger.log_task_success(
log_entry,
f"Excel file already exists (duplicate): {filename}",
{"duplicate_detected": True, "file_type": "excel"},
)
return None
else:
# Import page limit service
from app.services.page_limit_service import (
@ -1797,6 +1850,31 @@ async def process_file_in_background_with_document(
with contextlib.suppress(Exception):
os.unlink(file_path)
elif filename.lower().endswith((".xlsx",)):
from app.utils.office_parsers import parse_excel_to_markdown
if notification:
await (
NotificationService.document_processing.notify_processing_progress(
session,
notification,
stage="parsing",
stage_message="Parsing spreadsheet",
)
)
await task_logger.log_task_progress(
log_entry,
f"Processing Excel file natively: {filename}",
{"file_type": "excel", "processing_stage": "native_parse"},
)
markdown_content = await parse_excel_to_markdown(file_path, filename)
etl_service = "NATIVE_EXCEL"
with contextlib.suppress(Exception):
os.unlink(file_path)
else:
# Document files - use ETL service
from app.services.page_limit_service import (