feat: add native Excel parsing and improve Google Drive content extraction

- Introduced a new utility for parsing .xlsx files into markdown format, enhancing the ability to process Excel documents natively.
- Updated the Google Drive content extractor to utilize the new Excel parsing functionality, allowing for better handling of spreadsheet files.
- Enhanced file type detection and export logic to support various document formats, improving overall content extraction accuracy.
- Added unit tests to ensure the correctness of the new Excel parsing feature and its integration with existing content extraction workflows.
This commit is contained in:
Anish Sarkar 2026-03-27 21:47:14 +05:30
parent 4e0749f907
commit 3da0ffd683
7 changed files with 390 additions and 61 deletions

View file

@ -14,8 +14,15 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Log
from app.services.task_logging_service import TaskLoggingService
from app.utils.office_parsers import EXCEL_EXTENSIONS
from .client import GoogleDriveClient
from .file_types import get_export_mime_type, is_google_workspace_file, should_skip_file
from .file_types import (
get_export_mime_type,
get_extension_from_mime,
is_google_workspace_file,
should_skip_file,
)
logger = logging.getLogger(__name__)
@ -58,29 +65,30 @@ async def download_and_extract_content(
if "md5Checksum" in file:
drive_metadata["md5_checksum"] = file["md5Checksum"]
if is_google_workspace_file(mime_type):
drive_metadata["exported_as"] = "pdf"
export_ext = get_extension_from_mime(get_export_mime_type(mime_type) or "")
drive_metadata["exported_as"] = export_ext.lstrip(".") if export_ext else "pdf"
drive_metadata["original_workspace_type"] = mime_type.split(".")[-1]
temp_file_path = None
try:
if is_google_workspace_file(mime_type):
# Workspace files (Docs/Sheets/Slides) use export -- returns bytes
# in one shot. These are typically small (a few MB as PDF/text).
export_mime = get_export_mime_type(mime_type)
if not export_mime:
return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}"
content_bytes, error = await client.export_google_file(file_id, export_mime)
if error:
return None, drive_metadata, error
extension = ".pdf" if export_mime == "application/pdf" else ".txt"
extension = get_extension_from_mime(export_mime) or ".pdf"
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
tmp.write(content_bytes)
temp_file_path = tmp.name
else:
# Binary files -- stream directly to disk in chunks to avoid
# loading the entire file into memory.
extension = Path(file_name).suffix or ".bin"
extension = (
Path(file_name).suffix
or get_extension_from_mime(mime_type)
or ".bin"
)
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
temp_file_path = tmp.name
@ -142,6 +150,11 @@ async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
raise ValueError("Transcription returned empty text")
return f"# Transcription of {filename}\n\n{text}"
if lower.endswith(EXCEL_EXTENSIONS):
from app.utils.office_parsers import parse_excel_to_markdown
return await parse_excel_to_markdown(file_path, filename)
# Document files -- use configured ETL service
from app.config import config as app_config
@ -236,14 +249,17 @@ async def download_and_process_file(
if error:
return None, error
extension = ".pdf" if export_mime == "application/pdf" else ".txt"
extension = get_extension_from_mime(export_mime) or ".pdf"
else:
content_bytes, error = await client.download_file(file_id)
if error:
return None, error
# Preserve original file extension
extension = Path(file_name).suffix or ".bin"
extension = (
Path(file_name).suffix
or get_extension_from_mime(mime_type)
or ".bin"
)
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
tmp_file.write(content_bytes)
@ -281,7 +297,12 @@ async def download_and_process_file(
connector_info["metadata"]["md5_checksum"] = file["md5Checksum"]
if is_google_workspace_file(mime_type):
connector_info["metadata"]["exported_as"] = "pdf"
export_ext = get_extension_from_mime(
get_export_mime_type(mime_type) or ""
)
connector_info["metadata"]["exported_as"] = (
export_ext.lstrip(".") if export_ext else "pdf"
)
connector_info["metadata"]["original_workspace_type"] = mime_type.split(
"."
)[-1]

View file

@ -8,10 +8,33 @@ GOOGLE_SHORTCUT = "application/vnd.google-apps.shortcut"
EXPORT_FORMATS = {
GOOGLE_DOC: "application/pdf",
GOOGLE_SHEET: "application/pdf",
GOOGLE_SHEET: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
GOOGLE_SLIDE: "application/pdf",
}
MIME_TO_EXTENSION: dict[str, str] = {
"application/pdf": ".pdf",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
"application/vnd.ms-excel": ".xls",
"application/msword": ".doc",
"application/vnd.ms-powerpoint": ".ppt",
"text/plain": ".txt",
"text/csv": ".csv",
"text/html": ".html",
"text/markdown": ".md",
"application/json": ".json",
"application/xml": ".xml",
"image/png": ".png",
"image/jpeg": ".jpg",
}
def get_extension_from_mime(mime_type: str) -> str | None:
"""Return a file extension (with leading dot) for a MIME type, or None."""
return MIME_TO_EXTENSION.get(mime_type)
def is_google_workspace_file(mime_type: str) -> bool:
"""Check if file is a Google Workspace file that needs export."""

View file

@ -1134,6 +1134,59 @@ async def process_file_in_background(
)
return None
elif filename.lower().endswith((".xlsx",)):
from app.utils.office_parsers import parse_excel_to_markdown
if notification:
await (
NotificationService.document_processing.notify_processing_progress(
session,
notification,
stage="parsing",
stage_message="Parsing spreadsheet",
)
)
await task_logger.log_task_progress(
log_entry,
f"Processing Excel file natively: {filename}",
{"file_type": "excel", "processing_stage": "native_parse"},
)
excel_markdown = await parse_excel_to_markdown(file_path, filename)
try:
os.unlink(file_path)
except Exception as e:
print("Error deleting temp file", e)
result = await add_received_markdown_file_document(
session, filename, excel_markdown, search_space_id, user_id, connector
)
if connector:
await _update_document_from_connector(result, connector, session)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully parsed and processed Excel file: {filename}",
{
"document_id": result.id,
"content_hash": result.content_hash,
"file_type": "excel",
"etl_service": "NATIVE_EXCEL",
},
)
return result
else:
await task_logger.log_task_success(
log_entry,
f"Excel file already exists (duplicate): {filename}",
{"duplicate_detected": True, "file_type": "excel"},
)
return None
else:
# Import page limit service
from app.services.page_limit_service import (
@ -1797,6 +1850,31 @@ async def process_file_in_background_with_document(
with contextlib.suppress(Exception):
os.unlink(file_path)
elif filename.lower().endswith((".xlsx",)):
from app.utils.office_parsers import parse_excel_to_markdown
if notification:
await (
NotificationService.document_processing.notify_processing_progress(
session,
notification,
stage="parsing",
stage_message="Parsing spreadsheet",
)
)
await task_logger.log_task_progress(
log_entry,
f"Processing Excel file natively: {filename}",
{"file_type": "excel", "processing_stage": "native_parse"},
)
markdown_content = await parse_excel_to_markdown(file_path, filename)
etl_service = "NATIVE_EXCEL"
with contextlib.suppress(Exception):
os.unlink(file_path)
else:
# Document files - use ETL service
from app.services.page_limit_service import (

View file

@ -0,0 +1,72 @@
"""Native parsers for Office file formats."""
import asyncio
import logging
import threading
import time
from pathlib import Path
logger = logging.getLogger(__name__)
EXCEL_EXTENSIONS = (".xlsx",)
def _parse_excel_sync(file_path: str) -> str:
    """Parse an .xlsx file into markdown tables (synchronous).

    Each worksheet that has at least one non-empty cell is rendered as a
    ``## <sheet title>`` heading followed by a markdown table. The first
    non-empty row becomes the table header, rows whose cells are all ``None``
    are dropped, and short rows are padded with empty cells so every row has
    the same number of columns. Chart-only sheets are ignored.

    Args:
        file_path: Path of the workbook on disk.

    Returns:
        Markdown for all non-empty worksheets joined with newlines, or an
        empty string when the workbook contains no data.
    """
    from openpyxl import load_workbook

    def _cell_text(value: object) -> str:
        # A raw "|" or a line break inside a cell would split the markdown
        # row, so pipes are escaped and line breaks collapsed to spaces.
        if value is None:
            return ""
        return " ".join(str(value).splitlines()).replace("|", "\\|")

    def _table_row(values, width: int) -> str:
        cells = [_cell_text(v) for v in values]
        cells.extend([""] * (width - len(cells)))
        return "| " + " | ".join(cells) + " |"

    wb = load_workbook(file_path, read_only=True, data_only=True)
    markdown_parts: list[str] = []
    try:
        # ``worksheets`` excludes chartsheets, which have no cell rows.
        for ws in wb.worksheets:
            # Filter while streaming so empty rows are never kept in memory.
            data_rows = [
                row
                for row in ws.iter_rows(values_only=True)
                if any(cell is not None for cell in row)
            ]
            if not data_rows:
                continue

            max_cols = max(len(row) for row in data_rows)
            markdown_parts.append(f"## {ws.title}\n")
            markdown_parts.append(_table_row(data_rows[0], max_cols))
            markdown_parts.append("| " + " | ".join(["---"] * max_cols) + " |")
            markdown_parts.extend(_table_row(row, max_cols) for row in data_rows[1:])
            markdown_parts.append("")
    finally:
        # Read-only workbooks keep the file handle open until closed, so
        # release it even when parsing fails part-way through.
        wb.close()

    return "\n".join(markdown_parts)
async def parse_excel_to_markdown(file_path: str, filename: str = "") -> str:
    """Convert an .xlsx workbook to markdown without blocking the event loop.

    The synchronous parser is run in a worker thread via
    ``asyncio.to_thread``. The start and the end of the parse (with elapsed
    seconds) are logged. When ``filename`` is non-empty, a ``# <filename>``
    heading is placed in front of the generated tables.

    Raises ``ValueError`` if no data is found in the workbook.
    """
    started = time.monotonic()
    calling_thread = threading.current_thread().name
    logger.info("[excel-parse] START file=%s thread=%s", filename, calling_thread)

    tables = await asyncio.to_thread(_parse_excel_sync, file_path)

    elapsed = time.monotonic() - started
    logger.info("[excel-parse] END file=%s elapsed=%.2fs", filename, elapsed)

    if tables.strip():
        heading = f"# {filename}\n\n" if filename else ""
        return heading + tables
    raise ValueError(f"No data found in Excel file: {filename or file_path}")

View file

@ -73,6 +73,7 @@ dependencies = [
"langchain-daytona>=0.0.2",
"pypandoc>=1.16.2",
"notion-markdown>=0.7.0",
"openpyxl>=3.1.5",
]
[dependency-groups]

View file

@ -0,0 +1,129 @@
"""Unit tests for native Office file parsers (no DB, no external services)."""
import tempfile
import pytest
from openpyxl import Workbook
pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _create_xlsx(sheets: dict[str, list[list]]) -> str:
    """Write a real .xlsx workbook to a temp file and return the file's path.

    ``sheets`` maps each sheet name to its rows; a row is a list of cell
    values. The first entry reuses the workbook's default sheet (renamed),
    every later entry gets a newly created sheet. The file is created with
    ``delete=False``, so it stays on disk after this function returns.
    """
    workbook = Workbook()
    for position, (title, rows) in enumerate(sheets.items()):
        if position == 0:
            # Reuse the sheet every new Workbook starts with.
            sheet = workbook.active
            sheet.title = title
        else:
            sheet = workbook.create_sheet(title=title)
        for values in rows:
            sheet.append(values)

    handle = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
    target = handle.name
    workbook.save(target)
    workbook.close()
    handle.close()
    return target
# ---------------------------------------------------------------------------
# Tracer bullet: cell values appear in markdown
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_parse_excel_produces_markdown_with_cell_values():
    """A single-sheet .xlsx with known data produces markdown containing those values."""
    import os

    from app.utils.office_parsers import parse_excel_to_markdown

    path = _create_xlsx(
        {"Sales": [["Product", "Revenue"], ["Widget", 1500], ["Gadget", 3200]]}
    )
    try:
        md = await parse_excel_to_markdown(path, filename="report.xlsx")
    finally:
        # The helper creates the file with delete=False; remove it so test
        # runs do not pile up temp workbooks.
        os.unlink(path)

    assert "Product" in md
    assert "Revenue" in md
    assert "Widget" in md
    assert "1500" in md
    assert "Gadget" in md
    assert "3200" in md
    assert "report.xlsx" in md
    assert "|" in md
# ---------------------------------------------------------------------------
# Multi-sheet workbooks include all sheets
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_parse_excel_includes_all_sheets():
    """Both sheet names and their data appear in the output."""
    import os

    from app.utils.office_parsers import parse_excel_to_markdown

    path = _create_xlsx(
        {
            "Inventory": [["Item", "Qty"], ["Bolts", 200]],
            "Pricing": [["Item", "Price"], ["Bolts", 4.50]],
        }
    )
    try:
        md = await parse_excel_to_markdown(path, filename="multi.xlsx")
    finally:
        # The helper creates the file with delete=False; remove it so test
        # runs do not pile up temp workbooks.
        os.unlink(path)

    assert "Inventory" in md
    assert "Pricing" in md
    assert "Bolts" in md
    assert "200" in md
    assert "4.5" in md
# ---------------------------------------------------------------------------
# Empty spreadsheet raises ValueError
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_parse_excel_raises_on_empty_file():
    """An .xlsx with no data raises ValueError."""
    import os

    from app.utils.office_parsers import parse_excel_to_markdown

    wb = Workbook()
    tmp = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
    try:
        wb.save(tmp.name)
        wb.close()
        tmp.close()

        with pytest.raises(ValueError, match="No data found"):
            await parse_excel_to_markdown(tmp.name, filename="empty.xlsx")
    finally:
        # Closing twice is harmless; this makes sure the handle is released
        # before the delete=False file is removed from disk.
        tmp.close()
        os.unlink(tmp.name)
# ---------------------------------------------------------------------------
# _parse_file_to_markdown routes .xlsx natively (no ETL call)
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_parse_file_to_markdown_routes_xlsx_natively():
    """content_extractor._parse_file_to_markdown uses native parser for .xlsx."""
    import os

    from app.connectors.google_drive.content_extractor import _parse_file_to_markdown

    path = _create_xlsx(
        {"Data": [["Name", "Score"], ["Alice", 95], ["Bob", 82]]}
    )
    try:
        md = await _parse_file_to_markdown(path, "grades.xlsx")
    finally:
        # The helper creates the file with delete=False; remove it so test
        # runs do not pile up temp workbooks.
        os.unlink(path)

    assert "Alice" in md
    assert "95" in md
    assert "Bob" in md
    assert "82" in md

View file

@ -1171,7 +1171,7 @@ name = "contourpy"
version = "1.3.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy" },
{ name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/58/01/1253e6698a07380cd31a736d248a3f2a50a7c88779a1813da27503cadc2a/contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174, upload-time = "2025-07-26T12:03:12.549Z" }
wheels = [
@ -2596,6 +2596,7 @@ dependencies = [
{ name = "griffecli" },
{ name = "griffelib" },
]
sdist = { url = "https://files.pythonhosted.org/packages/04/56/28a0accac339c164b52a92c6cfc45a903acc0c174caa5c1713803467b533/griffe-2.0.0.tar.gz", hash = "sha256:c68979cd8395422083a51ea7cf02f9c119d889646d99b7b656ee43725de1b80f", size = 293906, upload-time = "2026-03-23T21:06:53.402Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8b/94/ee21d41e7eb4f823b94603b9d40f86d3c7fde80eacc2c3c71845476dddaa/griffe-2.0.0-py3-none-any.whl", hash = "sha256:5418081135a391c3e6e757a7f3f156f1a1a746cc7b4023868ff7d5e2f9a980aa", size = 5214, upload-time = "2026-02-09T19:09:44.105Z" },
]
@ -2608,6 +2609,7 @@ dependencies = [
{ name = "colorama" },
{ name = "griffelib" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a4/f8/2e129fd4a86e52e58eefe664de05e7d502decf766e7316cc9e70fdec3e18/griffecli-2.0.0.tar.gz", hash = "sha256:312fa5ebb4ce6afc786356e2d0ce85b06c1c20d45abc42d74f0cda65e159f6ef", size = 56213, upload-time = "2026-03-23T21:06:54.8Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e6/ed/d93f7a447bbf7a935d8868e9617cbe1cadf9ee9ee6bd275d3040fbf93d60/griffecli-2.0.0-py3-none-any.whl", hash = "sha256:9f7cd9ee9b21d55e91689358978d2385ae65c22f307a63fb3269acf3f21e643d", size = 9345, upload-time = "2026-02-09T19:09:42.554Z" },
]
@ -2616,6 +2618,7 @@ wheels = [
name = "griffelib"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ad/06/eccbd311c9e2b3ca45dbc063b93134c57a1ccc7607c5e545264ad092c4a9/griffelib-2.0.0.tar.gz", hash = "sha256:e504d637a089f5cab9b5daf18f7645970509bf4f53eda8d79ed71cce8bd97934", size = 166312, upload-time = "2026-03-23T21:06:55.954Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4d/51/c936033e16d12b627ea334aaaaf42229c37620d0f15593456ab69ab48161/griffelib-2.0.0-py3-none-any.whl", hash = "sha256:01284878c966508b6d6f1dbff9b6fa607bc062d8261c5c7253cb285b06422a7f", size = 142004, upload-time = "2026-02-09T19:09:40.561Z" },
]
@ -4082,15 +4085,15 @@ name = "matplotlib"
version = "3.10.8"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "contourpy" },
{ name = "cycler" },
{ name = "fonttools" },
{ name = "kiwisolver" },
{ name = "numpy" },
{ name = "packaging" },
{ name = "pillow" },
{ name = "pyparsing" },
{ name = "python-dateutil" },
{ name = "contourpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "cycler", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "fonttools", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "kiwisolver", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "packaging", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "pillow", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "pyparsing", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "python-dateutil", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/8a/76/d3c6e3a13fe484ebe7718d14e269c9569c4eb0020a968a327acb3b9a8fe6/matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3", size = 34806269, upload-time = "2025-12-10T22:56:51.155Z" }
wheels = [
@ -4201,7 +4204,7 @@ name = "ml-dtypes"
version = "0.5.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy" },
{ name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314, upload-time = "2025-11-17T22:32:31.031Z" }
wheels = [
@ -4967,9 +4970,9 @@ name = "ocrmac"
version = "1.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "pillow" },
{ name = "pyobjc-framework-vision" },
{ name = "click", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pillow", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-vision", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/5e/07/3e15ab404f75875c5e48c47163300eb90b7409044d8711fc3aaf52503f2e/ocrmac-1.0.1.tar.gz", hash = "sha256:507fe5e4cbd67b2d03f6729a52bbc11f9d0b58241134eb958a5daafd4b9d93d9", size = 1454317, upload-time = "2026-01-08T16:44:26.412Z" }
wheels = [
@ -5003,10 +5006,10 @@ name = "onnx"
version = "1.20.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "ml-dtypes" },
{ name = "numpy" },
{ name = "protobuf" },
{ name = "typing-extensions" },
{ name = "ml-dtypes", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "protobuf", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "typing-extensions", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3b/8a/335c03a8683a88a32f9a6bb98899ea6df241a41df64b37b9696772414794/onnx-1.20.1.tar.gz", hash = "sha256:ded16de1df563d51fbc1ad885f2a426f814039d8b5f4feb77febe09c0295ad67", size = 12048980, upload-time = "2026-01-10T01:40:03.043Z" }
wheels = [
@ -6493,7 +6496,7 @@ name = "pyobjc-framework-cocoa"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/02/a3/16ca9a15e77c061a9250afbae2eae26f2e1579eb8ca9462ae2d2c71e1169/pyobjc_framework_cocoa-12.1.tar.gz", hash = "sha256:5556c87db95711b985d5efdaaf01c917ddd41d148b1e52a0c66b1a2e2c5c1640", size = 2772191, upload-time = "2025-11-14T10:13:02.069Z" }
wheels = [
@ -6509,8 +6512,8 @@ name = "pyobjc-framework-coreml"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
{ name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/30/2d/baa9ea02cbb1c200683cb7273b69b4bee5070e86f2060b77e6a27c2a9d7e/pyobjc_framework_coreml-12.1.tar.gz", hash = "sha256:0d1a4216891a18775c9e0170d908714c18e4f53f9dc79fb0f5263b2aa81609ba", size = 40465, upload-time = "2025-11-14T10:14:02.265Z" }
wheels = [
@ -6526,8 +6529,8 @@ name = "pyobjc-framework-quartz"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
{ name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/94/18/cc59f3d4355c9456fc945eae7fe8797003c4da99212dd531ad1b0de8a0c6/pyobjc_framework_quartz-12.1.tar.gz", hash = "sha256:27f782f3513ac88ec9b6c82d9767eef95a5cf4175ce88a1e5a65875fee799608", size = 3159099, upload-time = "2025-11-14T10:21:24.31Z" }
wheels = [
@ -6543,10 +6546,10 @@ name = "pyobjc-framework-vision"
version = "12.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pyobjc-core" },
{ name = "pyobjc-framework-cocoa" },
{ name = "pyobjc-framework-coreml" },
{ name = "pyobjc-framework-quartz" },
{ name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-coreml", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
{ name = "pyobjc-framework-quartz", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c2/5a/08bb3e278f870443d226c141af14205ff41c0274da1e053b72b11dfc9fb2/pyobjc_framework_vision-12.1.tar.gz", hash = "sha256:a30959100e85dcede3a786c544e621ad6eb65ff6abf85721f805822b8c5fe9b0", size = 59538, upload-time = "2025-11-14T10:23:21.979Z" }
wheels = [
@ -7916,6 +7919,7 @@ dependencies = [
{ name = "notion-client" },
{ name = "notion-markdown" },
{ name = "numpy" },
{ name = "openpyxl" },
{ name = "pgvector" },
{ name = "playwright" },
{ name = "psycopg", extra = ["binary", "pool"] },
@ -7998,6 +8002,7 @@ requires-dist = [
{ name = "notion-client", specifier = ">=2.3.0" },
{ name = "notion-markdown", specifier = ">=0.7.0" },
{ name = "numpy", specifier = ">=1.24.0" },
{ name = "openpyxl", specifier = ">=3.1.5" },
{ name = "pgvector", specifier = ">=0.3.6" },
{ name = "playwright", specifier = ">=1.50.0" },
{ name = "psycopg", extras = ["binary", "pool"], specifier = ">=3.3.2" },
@ -8188,11 +8193,11 @@ name = "timm"
version = "1.0.25"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "huggingface-hub" },
{ name = "pyyaml" },
{ name = "safetensors" },
{ name = "torch" },
{ name = "torchvision" },
{ name = "huggingface-hub", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "pyyaml", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "safetensors", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "torch", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "torchvision", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/d7/2c/593109822fe735e637382aca6640c1102c19797f7791f1fd1dab2d6c3cb1/timm-1.0.25.tar.gz", hash = "sha256:47f59fc2754725735cc81bb83bcbfce5bec4ebd5d4bb9e69da57daa92fcfa768", size = 2414743, upload-time = "2026-02-23T16:49:00.137Z" }
wheels = [
@ -8819,22 +8824,22 @@ name = "unstructured-inference"
version = "1.2.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "accelerate" },
{ name = "huggingface-hub" },
{ name = "matplotlib" },
{ name = "numpy" },
{ name = "onnx" },
{ name = "onnxruntime" },
{ name = "opencv-python" },
{ name = "pandas" },
{ name = "pdfminer-six" },
{ name = "pypdfium2" },
{ name = "python-multipart" },
{ name = "rapidfuzz" },
{ name = "scipy" },
{ name = "timm" },
{ name = "torch" },
{ name = "transformers" },
{ name = "accelerate", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "huggingface-hub", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "matplotlib", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "onnx", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "onnxruntime", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "opencv-python", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "pandas", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "pdfminer-six", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "pypdfium2", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "python-multipart", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "rapidfuzz", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "scipy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "timm", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "torch", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
{ name = "transformers", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ce/10/8f3bccfa9f1e0101a402ae1f529e07876541c6b18004747f0e793ed41f9e/unstructured_inference-1.2.0.tar.gz", hash = "sha256:19ca28512f3649c70a759cf2a4e98663e942a1b83c1acdb9506b0445f4862f23", size = 45732, upload-time = "2026-01-30T20:57:58.019Z" }
wheels = [