diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 0903aea9f..272a71403 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -14,8 +14,15 @@ from sqlalchemy.ext.asyncio import AsyncSession from app.db import Log from app.services.task_logging_service import TaskLoggingService +from app.utils.office_parsers import EXCEL_EXTENSIONS + from .client import GoogleDriveClient -from .file_types import get_export_mime_type, is_google_workspace_file, should_skip_file +from .file_types import ( + get_export_mime_type, + get_extension_from_mime, + is_google_workspace_file, + should_skip_file, +) logger = logging.getLogger(__name__) @@ -58,29 +65,30 @@ async def download_and_extract_content( if "md5Checksum" in file: drive_metadata["md5_checksum"] = file["md5Checksum"] if is_google_workspace_file(mime_type): - drive_metadata["exported_as"] = "pdf" + export_ext = get_extension_from_mime(get_export_mime_type(mime_type) or "") + drive_metadata["exported_as"] = export_ext.lstrip(".") if export_ext else "pdf" drive_metadata["original_workspace_type"] = mime_type.split(".")[-1] temp_file_path = None try: if is_google_workspace_file(mime_type): - # Workspace files (Docs/Sheets/Slides) use export -- returns bytes - # in one shot. These are typically small (a few MB as PDF/text). 
export_mime = get_export_mime_type(mime_type) if not export_mime: return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}" content_bytes, error = await client.export_google_file(file_id, export_mime) if error: return None, drive_metadata, error - extension = ".pdf" if export_mime == "application/pdf" else ".txt" + extension = get_extension_from_mime(export_mime) or ".pdf" with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp: tmp.write(content_bytes) temp_file_path = tmp.name else: - # Binary files -- stream directly to disk in chunks to avoid - # loading the entire file into memory. - extension = Path(file_name).suffix or ".bin" + extension = ( + Path(file_name).suffix + or get_extension_from_mime(mime_type) + or ".bin" + ) with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp: temp_file_path = tmp.name @@ -142,6 +150,11 @@ async def _parse_file_to_markdown(file_path: str, filename: str) -> str: raise ValueError("Transcription returned empty text") return f"# Transcription of {filename}\n\n{text}" + if lower.endswith(EXCEL_EXTENSIONS): + from app.utils.office_parsers import parse_excel_to_markdown + + return await parse_excel_to_markdown(file_path, filename) + # Document files -- use configured ETL service from app.config import config as app_config @@ -236,14 +249,17 @@ async def download_and_process_file( if error: return None, error - extension = ".pdf" if export_mime == "application/pdf" else ".txt" + extension = get_extension_from_mime(export_mime) or ".pdf" else: content_bytes, error = await client.download_file(file_id) if error: return None, error - # Preserve original file extension - extension = Path(file_name).suffix or ".bin" + extension = ( + Path(file_name).suffix + or get_extension_from_mime(mime_type) + or ".bin" + ) with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file: tmp_file.write(content_bytes) @@ -281,7 +297,12 @@ async def download_and_process_file( 
connector_info["metadata"]["md5_checksum"] = file["md5Checksum"] if is_google_workspace_file(mime_type): - connector_info["metadata"]["exported_as"] = "pdf" + export_ext = get_extension_from_mime( + get_export_mime_type(mime_type) or "" + ) + connector_info["metadata"]["exported_as"] = ( + export_ext.lstrip(".") if export_ext else "pdf" + ) connector_info["metadata"]["original_workspace_type"] = mime_type.split( "." )[-1] diff --git a/surfsense_backend/app/connectors/google_drive/file_types.py b/surfsense_backend/app/connectors/google_drive/file_types.py index a66463208..dd3456901 100644 --- a/surfsense_backend/app/connectors/google_drive/file_types.py +++ b/surfsense_backend/app/connectors/google_drive/file_types.py @@ -8,10 +8,33 @@ GOOGLE_SHORTCUT = "application/vnd.google-apps.shortcut" EXPORT_FORMATS = { GOOGLE_DOC: "application/pdf", - GOOGLE_SHEET: "application/pdf", + GOOGLE_SHEET: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", GOOGLE_SLIDE: "application/pdf", } +MIME_TO_EXTENSION: dict[str, str] = { + "application/pdf": ".pdf", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx", + "application/vnd.ms-excel": ".xls", + "application/msword": ".doc", + "application/vnd.ms-powerpoint": ".ppt", + "text/plain": ".txt", + "text/csv": ".csv", + "text/html": ".html", + "text/markdown": ".md", + "application/json": ".json", + "application/xml": ".xml", + "image/png": ".png", + "image/jpeg": ".jpg", +} + + +def get_extension_from_mime(mime_type: str) -> str | None: + """Return a file extension (with leading dot) for a MIME type, or None.""" + return MIME_TO_EXTENSION.get(mime_type) + def is_google_workspace_file(mime_type: str) -> bool: """Check if file is a Google Workspace file that needs export.""" diff --git 
a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index 6c0ae1870..c69c6fa95 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -1134,6 +1134,59 @@ async def process_file_in_background( ) return None + elif filename.lower().endswith((".xlsx",)): + from app.utils.office_parsers import parse_excel_to_markdown + + if notification: + await ( + NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Parsing spreadsheet", + ) + ) + + await task_logger.log_task_progress( + log_entry, + f"Processing Excel file natively: {filename}", + {"file_type": "excel", "processing_stage": "native_parse"}, + ) + + excel_markdown = await parse_excel_to_markdown(file_path, filename) + + try: + os.unlink(file_path) + except Exception as e: + print("Error deleting temp file", e) + + result = await add_received_markdown_file_document( + session, filename, excel_markdown, search_space_id, user_id, connector + ) + + if connector: + await _update_document_from_connector(result, connector, session) + + if result: + await task_logger.log_task_success( + log_entry, + f"Successfully parsed and processed Excel file: {filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "excel", + "etl_service": "NATIVE_EXCEL", + }, + ) + return result + else: + await task_logger.log_task_success( + log_entry, + f"Excel file already exists (duplicate): {filename}", + {"duplicate_detected": True, "file_type": "excel"}, + ) + return None + else: # Import page limit service from app.services.page_limit_service import ( @@ -1797,6 +1850,31 @@ async def process_file_in_background_with_document( with contextlib.suppress(Exception): os.unlink(file_path) + elif filename.lower().endswith((".xlsx",)): + from 
def _format_cell(value: object) -> str:
    """Render one cell value as text that is safe inside a markdown table row."""
    if value is None:
        return ""
    text = str(value)
    # A raw "|" would open a new column and a raw line break would end the
    # table row, so escape the former and flatten the latter.
    text = text.replace("|", "\\|")
    return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")


def _rows_to_markdown(sheet_name: str, rows: list) -> list[str]:
    """Turn the non-empty rows of one sheet into markdown heading + table lines.

    The first row is used as the table header. Shorter rows are padded with
    empty cells so every row has the same number of columns.
    """
    width = max(len(row) for row in rows)

    def render(row) -> str:
        cells = [_format_cell(cell) for cell in row]
        cells.extend([""] * (width - len(cells)))
        return "| " + " | ".join(cells) + " |"

    lines = [
        f"## {sheet_name}\n",
        render(rows[0]),
        "| " + " | ".join("---" for _ in range(width)) + " |",
    ]
    lines.extend(render(row) for row in rows[1:])
    # Blank line separates this sheet's table from the next heading.
    lines.append("")
    return lines


def _parse_excel_sync(file_path: str) -> str:
    """Parse an .xlsx file into markdown tables (synchronous).

    Each sheet that contains at least one non-empty row becomes a ``##``
    heading followed by a markdown table. Sheets with no data are skipped.

    Args:
        file_path: Path of the .xlsx file on disk.

    Returns:
        The concatenated markdown, or an empty string when no sheet has data.
    """
    # Imported lazily so the module can be imported without openpyxl loaded.
    from openpyxl import load_workbook

    wb = load_workbook(file_path, read_only=True, data_only=True)
    markdown_parts: list[str] = []
    try:
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            non_empty_rows = [
                row
                for row in ws.iter_rows(values_only=True)
                if any(cell is not None for cell in row)
            ]
            if not non_empty_rows:
                continue
            markdown_parts.extend(_rows_to_markdown(sheet_name, non_empty_rows))
    finally:
        # Read-only workbooks must be closed explicitly; do it even when
        # parsing fails so the underlying file is released.
        wb.close()

    return "\n".join(markdown_parts)


async def parse_excel_to_markdown(file_path: str, filename: str = "") -> str:
    """Parse an .xlsx file into markdown tables (async wrapper).

    The blocking parse runs in a worker thread via ``asyncio.to_thread`` and
    its duration is logged.

    Args:
        file_path: Path of the .xlsx file on disk.
        filename: Display name; when non-empty it is emitted as a ``#`` title
            and used in log and error messages.

    Returns:
        Markdown text, prefixed with the title when ``filename`` is given.

    Raises:
        ValueError: If no data is found in the workbook.
    """
    t0 = time.monotonic()
    logger.info(
        "[excel-parse] START file=%s thread=%s",
        filename,
        threading.current_thread().name,
    )

    result = await asyncio.to_thread(_parse_excel_sync, file_path)

    logger.info(
        "[excel-parse] END file=%s elapsed=%.2fs",
        filename,
        time.monotonic() - t0,
    )

    if not result.strip():
        raise ValueError(f"No data found in Excel file: {filename or file_path}")

    title = f"# {filename}\n\n" if filename else ""
    return title + result
@pytest.mark.asyncio
async def test_parse_excel_produces_markdown_with_cell_values():
    """A single-sheet .xlsx with known data produces markdown containing those values."""
    from pathlib import Path

    from app.utils.office_parsers import parse_excel_to_markdown

    path = _create_xlsx(
        {"Sales": [["Product", "Revenue"], ["Widget", 1500], ["Gadget", 3200]]}
    )

    try:
        md = await parse_excel_to_markdown(path, filename="report.xlsx")
    finally:
        # _create_xlsx writes with delete=False, so remove the fixture file
        # here to avoid leaving it behind after every test run.
        Path(path).unlink(missing_ok=True)

    # Header and data cells from the sheet.
    assert "Product" in md
    assert "Revenue" in md
    assert "Widget" in md
    assert "1500" in md
    assert "Gadget" in md
    assert "3200" in md
    # The filename is emitted as the document title.
    assert "report.xlsx" in md
    # Output is rendered as a markdown table.
    assert "|" in md
@pytest.mark.asyncio
async def test_parse_excel_raises_on_empty_file(tmp_path):
    """An .xlsx with no data raises ValueError."""
    from app.utils.office_parsers import parse_excel_to_markdown

    # Use pytest's per-test temporary directory instead of a
    # NamedTemporaryFile(delete=False), which was never removed.
    empty_path = str(tmp_path / "empty.xlsx")

    wb = Workbook()
    wb.save(empty_path)
    wb.close()

    with pytest.raises(ValueError, match="No data found"):
        await parse_excel_to_markdown(empty_path, filename="empty.xlsx")
"sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174, upload-time = "2025-07-26T12:03:12.549Z" } wheels = [ @@ -2596,6 +2596,7 @@ dependencies = [ { name = "griffecli" }, { name = "griffelib" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/04/56/28a0accac339c164b52a92c6cfc45a903acc0c174caa5c1713803467b533/griffe-2.0.0.tar.gz", hash = "sha256:c68979cd8395422083a51ea7cf02f9c119d889646d99b7b656ee43725de1b80f", size = 293906, upload-time = "2026-03-23T21:06:53.402Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/8b/94/ee21d41e7eb4f823b94603b9d40f86d3c7fde80eacc2c3c71845476dddaa/griffe-2.0.0-py3-none-any.whl", hash = "sha256:5418081135a391c3e6e757a7f3f156f1a1a746cc7b4023868ff7d5e2f9a980aa", size = 5214, upload-time = "2026-02-09T19:09:44.105Z" }, ] @@ -2608,6 +2609,7 @@ dependencies = [ { name = "colorama" }, { name = "griffelib" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/a4/f8/2e129fd4a86e52e58eefe664de05e7d502decf766e7316cc9e70fdec3e18/griffecli-2.0.0.tar.gz", hash = "sha256:312fa5ebb4ce6afc786356e2d0ce85b06c1c20d45abc42d74f0cda65e159f6ef", size = 56213, upload-time = "2026-03-23T21:06:54.8Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e6/ed/d93f7a447bbf7a935d8868e9617cbe1cadf9ee9ee6bd275d3040fbf93d60/griffecli-2.0.0-py3-none-any.whl", hash = "sha256:9f7cd9ee9b21d55e91689358978d2385ae65c22f307a63fb3269acf3f21e643d", size = 9345, upload-time = "2026-02-09T19:09:42.554Z" }, ] @@ -2616,6 +2618,7 @@ wheels = [ name = "griffelib" version = "2.0.0" source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ad/06/eccbd311c9e2b3ca45dbc063b93134c57a1ccc7607c5e545264ad092c4a9/griffelib-2.0.0.tar.gz", hash = "sha256:e504d637a089f5cab9b5daf18f7645970509bf4f53eda8d79ed71cce8bd97934", size = 166312, upload-time = "2026-03-23T21:06:55.954Z" } wheels = [ { url = 
"https://files.pythonhosted.org/packages/4d/51/c936033e16d12b627ea334aaaaf42229c37620d0f15593456ab69ab48161/griffelib-2.0.0-py3-none-any.whl", hash = "sha256:01284878c966508b6d6f1dbff9b6fa607bc062d8261c5c7253cb285b06422a7f", size = 142004, upload-time = "2026-02-09T19:09:40.561Z" }, ] @@ -4082,15 +4085,15 @@ name = "matplotlib" version = "3.10.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "contourpy" }, - { name = "cycler" }, - { name = "fonttools" }, - { name = "kiwisolver" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "pillow" }, - { name = "pyparsing" }, - { name = "python-dateutil" }, + { name = "contourpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "cycler", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "fonttools", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "kiwisolver", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "packaging", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "pillow", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "pyparsing", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "python-dateutil", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/8a/76/d3c6e3a13fe484ebe7718d14e269c9569c4eb0020a968a327acb3b9a8fe6/matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3", size = 34806269, upload-time = "2025-12-10T22:56:51.155Z" } wheels = [ @@ -4201,7 +4204,7 @@ name = "ml-dtypes" version = "0.5.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "numpy" }, + { name = "numpy", marker = "python_full_version < 
'3.13' or sys_platform != 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314, upload-time = "2025-11-17T22:32:31.031Z" } wheels = [ @@ -4967,9 +4970,9 @@ name = "ocrmac" version = "1.0.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "click" }, - { name = "pillow" }, - { name = "pyobjc-framework-vision" }, + { name = "click", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "pillow", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "pyobjc-framework-vision", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/5e/07/3e15ab404f75875c5e48c47163300eb90b7409044d8711fc3aaf52503f2e/ocrmac-1.0.1.tar.gz", hash = "sha256:507fe5e4cbd67b2d03f6729a52bbc11f9d0b58241134eb958a5daafd4b9d93d9", size = 1454317, upload-time = "2026-01-08T16:44:26.412Z" } wheels = [ @@ -5003,10 +5006,10 @@ name = "onnx" version = "1.20.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "ml-dtypes" }, - { name = "numpy" }, - { name = "protobuf" }, - { name = "typing-extensions" }, + { name = "ml-dtypes", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "protobuf", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/3b/8a/335c03a8683a88a32f9a6bb98899ea6df241a41df64b37b9696772414794/onnx-1.20.1.tar.gz", hash = "sha256:ded16de1df563d51fbc1ad885f2a426f814039d8b5f4feb77febe09c0295ad67", size = 12048980, upload-time = "2026-01-10T01:40:03.043Z" } wheels = [ @@ -6493,7 +6496,7 @@ name = "pyobjc-framework-cocoa" version = "12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, + { name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/02/a3/16ca9a15e77c061a9250afbae2eae26f2e1579eb8ca9462ae2d2c71e1169/pyobjc_framework_cocoa-12.1.tar.gz", hash = "sha256:5556c87db95711b985d5efdaaf01c917ddd41d148b1e52a0c66b1a2e2c5c1640", size = 2772191, upload-time = "2025-11-14T10:13:02.069Z" } wheels = [ @@ -6509,8 +6512,8 @@ name = "pyobjc-framework-coreml" version = "12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, + { name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/30/2d/baa9ea02cbb1c200683cb7273b69b4bee5070e86f2060b77e6a27c2a9d7e/pyobjc_framework_coreml-12.1.tar.gz", hash = "sha256:0d1a4216891a18775c9e0170d908714c18e4f53f9dc79fb0f5263b2aa81609ba", size = 40465, upload-time = "2025-11-14T10:14:02.265Z" } wheels = [ @@ -6526,8 +6529,8 @@ name = "pyobjc-framework-quartz" version = "12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, + 
{ name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/94/18/cc59f3d4355c9456fc945eae7fe8797003c4da99212dd531ad1b0de8a0c6/pyobjc_framework_quartz-12.1.tar.gz", hash = "sha256:27f782f3513ac88ec9b6c82d9767eef95a5cf4175ce88a1e5a65875fee799608", size = 3159099, upload-time = "2025-11-14T10:21:24.31Z" } wheels = [ @@ -6543,10 +6546,10 @@ name = "pyobjc-framework-vision" version = "12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, - { name = "pyobjc-framework-coreml" }, - { name = "pyobjc-framework-quartz" }, + { name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "pyobjc-framework-coreml", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, + { name = "pyobjc-framework-quartz", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/c2/5a/08bb3e278f870443d226c141af14205ff41c0274da1e053b72b11dfc9fb2/pyobjc_framework_vision-12.1.tar.gz", hash = "sha256:a30959100e85dcede3a786c544e621ad6eb65ff6abf85721f805822b8c5fe9b0", size = 59538, upload-time = "2025-11-14T10:23:21.979Z" } wheels = [ @@ -7916,6 +7919,7 @@ dependencies = [ { 
name = "notion-client" }, { name = "notion-markdown" }, { name = "numpy" }, + { name = "openpyxl" }, { name = "pgvector" }, { name = "playwright" }, { name = "psycopg", extra = ["binary", "pool"] }, @@ -7998,6 +8002,7 @@ requires-dist = [ { name = "notion-client", specifier = ">=2.3.0" }, { name = "notion-markdown", specifier = ">=0.7.0" }, { name = "numpy", specifier = ">=1.24.0" }, + { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pgvector", specifier = ">=0.3.6" }, { name = "playwright", specifier = ">=1.50.0" }, { name = "psycopg", extras = ["binary", "pool"], specifier = ">=3.3.2" }, @@ -8188,11 +8193,11 @@ name = "timm" version = "1.0.25" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "huggingface-hub" }, - { name = "pyyaml" }, - { name = "safetensors" }, - { name = "torch" }, - { name = "torchvision" }, + { name = "huggingface-hub", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "pyyaml", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "safetensors", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "torch", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "torchvision", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/d7/2c/593109822fe735e637382aca6640c1102c19797f7791f1fd1dab2d6c3cb1/timm-1.0.25.tar.gz", hash = "sha256:47f59fc2754725735cc81bb83bcbfce5bec4ebd5d4bb9e69da57daa92fcfa768", size = 2414743, upload-time = "2026-02-23T16:49:00.137Z" } wheels = [ @@ -8819,22 +8824,22 @@ name = "unstructured-inference" version = "1.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "accelerate" }, - { name = "huggingface-hub" }, - { name = "matplotlib" }, - { name = "numpy" }, - { name = "onnx" }, - { name = "onnxruntime" }, - { name = "opencv-python" }, - { name = "pandas" }, - { name = 
"pdfminer-six" }, - { name = "pypdfium2" }, - { name = "python-multipart" }, - { name = "rapidfuzz" }, - { name = "scipy" }, - { name = "timm" }, - { name = "torch" }, - { name = "transformers" }, + { name = "accelerate", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "huggingface-hub", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "matplotlib", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "onnx", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "onnxruntime", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "opencv-python", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "pandas", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "pdfminer-six", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "pypdfium2", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "python-multipart", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "rapidfuzz", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "scipy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "timm", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "torch", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, + { name = "transformers", marker = "python_full_version < '3.13' or sys_platform != 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/ce/10/8f3bccfa9f1e0101a402ae1f529e07876541c6b18004747f0e793ed41f9e/unstructured_inference-1.2.0.tar.gz", hash = "sha256:19ca28512f3649c70a759cf2a4e98663e942a1b83c1acdb9506b0445f4862f23", size = 45732, upload-time = 
"2026-01-30T20:57:58.019Z" } wheels = [