mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-27 01:36:30 +02:00
feat: add native Excel parsing and improve Google Drive content extraction
- Introduced a new utility for parsing .xlsx files into Markdown format, enhancing the ability to process Excel documents natively.
- Updated the Google Drive content extractor to use the new Excel parsing functionality, allowing for better handling of spreadsheet files.
- Enhanced file type detection and export logic to support various document formats, improving overall content extraction accuracy.
- Added unit tests to ensure the correctness of the new Excel parsing feature and its integration with existing content extraction workflows.
This commit is contained in:
parent
4e0749f907
commit
3da0ffd683
7 changed files with 390 additions and 61 deletions
|
|
@ -1134,6 +1134,59 @@ async def process_file_in_background(
|
|||
)
|
||||
return None
|
||||
|
||||
elif filename.lower().endswith((".xlsx",)):
|
||||
from app.utils.office_parsers import parse_excel_to_markdown
|
||||
|
||||
if notification:
|
||||
await (
|
||||
NotificationService.document_processing.notify_processing_progress(
|
||||
session,
|
||||
notification,
|
||||
stage="parsing",
|
||||
stage_message="Parsing spreadsheet",
|
||||
)
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Processing Excel file natively: {filename}",
|
||||
{"file_type": "excel", "processing_stage": "native_parse"},
|
||||
)
|
||||
|
||||
excel_markdown = await parse_excel_to_markdown(file_path, filename)
|
||||
|
||||
try:
|
||||
os.unlink(file_path)
|
||||
except Exception as e:
|
||||
print("Error deleting temp file", e)
|
||||
|
||||
result = await add_received_markdown_file_document(
|
||||
session, filename, excel_markdown, search_space_id, user_id, connector
|
||||
)
|
||||
|
||||
if connector:
|
||||
await _update_document_from_connector(result, connector, session)
|
||||
|
||||
if result:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully parsed and processed Excel file: {filename}",
|
||||
{
|
||||
"document_id": result.id,
|
||||
"content_hash": result.content_hash,
|
||||
"file_type": "excel",
|
||||
"etl_service": "NATIVE_EXCEL",
|
||||
},
|
||||
)
|
||||
return result
|
||||
else:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Excel file already exists (duplicate): {filename}",
|
||||
{"duplicate_detected": True, "file_type": "excel"},
|
||||
)
|
||||
return None
|
||||
|
||||
else:
|
||||
# Import page limit service
|
||||
from app.services.page_limit_service import (
|
||||
|
|
@ -1797,6 +1850,31 @@ async def process_file_in_background_with_document(
|
|||
with contextlib.suppress(Exception):
|
||||
os.unlink(file_path)
|
||||
|
||||
elif filename.lower().endswith((".xlsx",)):
|
||||
from app.utils.office_parsers import parse_excel_to_markdown
|
||||
|
||||
if notification:
|
||||
await (
|
||||
NotificationService.document_processing.notify_processing_progress(
|
||||
session,
|
||||
notification,
|
||||
stage="parsing",
|
||||
stage_message="Parsing spreadsheet",
|
||||
)
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Processing Excel file natively: {filename}",
|
||||
{"file_type": "excel", "processing_stage": "native_parse"},
|
||||
)
|
||||
|
||||
markdown_content = await parse_excel_to_markdown(file_path, filename)
|
||||
etl_service = "NATIVE_EXCEL"
|
||||
|
||||
with contextlib.suppress(Exception):
|
||||
os.unlink(file_path)
|
||||
|
||||
else:
|
||||
# Document files - use ETL service
|
||||
from app.services.page_limit_service import (
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue