mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-10 20:35:17 +02:00
feat: add native Excel parsing and improve Google Drive content extraction
- Introduced a new utility for parsing .xlsx files into markdown format, enhancing the ability to process Excel documents natively. - Updated the Google Drive content extractor to utilize the new Excel parsing functionality, allowing for better handling of spreadsheet files. - Enhanced file type detection and export logic to support various document formats, improving overall content extraction accuracy. - Added unit tests to ensure the correctness of the new Excel parsing feature and its integration with existing content extraction workflows.
This commit is contained in:
parent
4e0749f907
commit
3da0ffd683
7 changed files with 390 additions and 61 deletions
|
|
@ -14,8 +14,15 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
from app.db import Log
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
|
||||
from app.utils.office_parsers import EXCEL_EXTENSIONS
|
||||
|
||||
from .client import GoogleDriveClient
|
||||
from .file_types import get_export_mime_type, is_google_workspace_file, should_skip_file
|
||||
from .file_types import (
|
||||
get_export_mime_type,
|
||||
get_extension_from_mime,
|
||||
is_google_workspace_file,
|
||||
should_skip_file,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -58,29 +65,30 @@ async def download_and_extract_content(
|
|||
if "md5Checksum" in file:
|
||||
drive_metadata["md5_checksum"] = file["md5Checksum"]
|
||||
if is_google_workspace_file(mime_type):
|
||||
drive_metadata["exported_as"] = "pdf"
|
||||
export_ext = get_extension_from_mime(get_export_mime_type(mime_type) or "")
|
||||
drive_metadata["exported_as"] = export_ext.lstrip(".") if export_ext else "pdf"
|
||||
drive_metadata["original_workspace_type"] = mime_type.split(".")[-1]
|
||||
|
||||
temp_file_path = None
|
||||
try:
|
||||
if is_google_workspace_file(mime_type):
|
||||
# Workspace files (Docs/Sheets/Slides) use export -- returns bytes
|
||||
# in one shot. These are typically small (a few MB as PDF/text).
|
||||
export_mime = get_export_mime_type(mime_type)
|
||||
if not export_mime:
|
||||
return None, drive_metadata, f"Cannot export Google Workspace type: {mime_type}"
|
||||
content_bytes, error = await client.export_google_file(file_id, export_mime)
|
||||
if error:
|
||||
return None, drive_metadata, error
|
||||
extension = ".pdf" if export_mime == "application/pdf" else ".txt"
|
||||
extension = get_extension_from_mime(export_mime) or ".pdf"
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
|
||||
tmp.write(content_bytes)
|
||||
temp_file_path = tmp.name
|
||||
else:
|
||||
# Binary files -- stream directly to disk in chunks to avoid
|
||||
# loading the entire file into memory.
|
||||
extension = Path(file_name).suffix or ".bin"
|
||||
extension = (
|
||||
Path(file_name).suffix
|
||||
or get_extension_from_mime(mime_type)
|
||||
or ".bin"
|
||||
)
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp:
|
||||
temp_file_path = tmp.name
|
||||
|
||||
|
|
@ -142,6 +150,11 @@ async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
|
|||
raise ValueError("Transcription returned empty text")
|
||||
return f"# Transcription of {filename}\n\n{text}"
|
||||
|
||||
if lower.endswith(EXCEL_EXTENSIONS):
|
||||
from app.utils.office_parsers import parse_excel_to_markdown
|
||||
|
||||
return await parse_excel_to_markdown(file_path, filename)
|
||||
|
||||
# Document files -- use configured ETL service
|
||||
from app.config import config as app_config
|
||||
|
||||
|
|
@ -236,14 +249,17 @@ async def download_and_process_file(
|
|||
if error:
|
||||
return None, error
|
||||
|
||||
extension = ".pdf" if export_mime == "application/pdf" else ".txt"
|
||||
extension = get_extension_from_mime(export_mime) or ".pdf"
|
||||
else:
|
||||
content_bytes, error = await client.download_file(file_id)
|
||||
if error:
|
||||
return None, error
|
||||
|
||||
# Preserve original file extension
|
||||
extension = Path(file_name).suffix or ".bin"
|
||||
extension = (
|
||||
Path(file_name).suffix
|
||||
or get_extension_from_mime(mime_type)
|
||||
or ".bin"
|
||||
)
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
|
||||
tmp_file.write(content_bytes)
|
||||
|
|
@ -281,7 +297,12 @@ async def download_and_process_file(
|
|||
connector_info["metadata"]["md5_checksum"] = file["md5Checksum"]
|
||||
|
||||
if is_google_workspace_file(mime_type):
|
||||
connector_info["metadata"]["exported_as"] = "pdf"
|
||||
export_ext = get_extension_from_mime(
|
||||
get_export_mime_type(mime_type) or ""
|
||||
)
|
||||
connector_info["metadata"]["exported_as"] = (
|
||||
export_ext.lstrip(".") if export_ext else "pdf"
|
||||
)
|
||||
connector_info["metadata"]["original_workspace_type"] = mime_type.split(
|
||||
"."
|
||||
)[-1]
|
||||
|
|
|
|||
|
|
@ -8,10 +8,33 @@ GOOGLE_SHORTCUT = "application/vnd.google-apps.shortcut"
|
|||
|
||||
EXPORT_FORMATS = {
|
||||
GOOGLE_DOC: "application/pdf",
|
||||
GOOGLE_SHEET: "application/pdf",
|
||||
GOOGLE_SHEET: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
GOOGLE_SLIDE: "application/pdf",
|
||||
}
|
||||
|
||||
MIME_TO_EXTENSION: dict[str, str] = {
|
||||
"application/pdf": ".pdf",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
||||
"application/vnd.ms-excel": ".xls",
|
||||
"application/msword": ".doc",
|
||||
"application/vnd.ms-powerpoint": ".ppt",
|
||||
"text/plain": ".txt",
|
||||
"text/csv": ".csv",
|
||||
"text/html": ".html",
|
||||
"text/markdown": ".md",
|
||||
"application/json": ".json",
|
||||
"application/xml": ".xml",
|
||||
"image/png": ".png",
|
||||
"image/jpeg": ".jpg",
|
||||
}
|
||||
|
||||
|
||||
def get_extension_from_mime(mime_type: str) -> str | None:
|
||||
"""Return a file extension (with leading dot) for a MIME type, or None."""
|
||||
return MIME_TO_EXTENSION.get(mime_type)
|
||||
|
||||
|
||||
def is_google_workspace_file(mime_type: str) -> bool:
|
||||
"""Check if file is a Google Workspace file that needs export."""
|
||||
|
|
|
|||
|
|
@ -1134,6 +1134,59 @@ async def process_file_in_background(
|
|||
)
|
||||
return None
|
||||
|
||||
elif filename.lower().endswith((".xlsx",)):
|
||||
from app.utils.office_parsers import parse_excel_to_markdown
|
||||
|
||||
if notification:
|
||||
await (
|
||||
NotificationService.document_processing.notify_processing_progress(
|
||||
session,
|
||||
notification,
|
||||
stage="parsing",
|
||||
stage_message="Parsing spreadsheet",
|
||||
)
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Processing Excel file natively: {filename}",
|
||||
{"file_type": "excel", "processing_stage": "native_parse"},
|
||||
)
|
||||
|
||||
excel_markdown = await parse_excel_to_markdown(file_path, filename)
|
||||
|
||||
try:
|
||||
os.unlink(file_path)
|
||||
except Exception as e:
|
||||
print("Error deleting temp file", e)
|
||||
|
||||
result = await add_received_markdown_file_document(
|
||||
session, filename, excel_markdown, search_space_id, user_id, connector
|
||||
)
|
||||
|
||||
if connector:
|
||||
await _update_document_from_connector(result, connector, session)
|
||||
|
||||
if result:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Successfully parsed and processed Excel file: {filename}",
|
||||
{
|
||||
"document_id": result.id,
|
||||
"content_hash": result.content_hash,
|
||||
"file_type": "excel",
|
||||
"etl_service": "NATIVE_EXCEL",
|
||||
},
|
||||
)
|
||||
return result
|
||||
else:
|
||||
await task_logger.log_task_success(
|
||||
log_entry,
|
||||
f"Excel file already exists (duplicate): {filename}",
|
||||
{"duplicate_detected": True, "file_type": "excel"},
|
||||
)
|
||||
return None
|
||||
|
||||
else:
|
||||
# Import page limit service
|
||||
from app.services.page_limit_service import (
|
||||
|
|
@ -1797,6 +1850,31 @@ async def process_file_in_background_with_document(
|
|||
with contextlib.suppress(Exception):
|
||||
os.unlink(file_path)
|
||||
|
||||
elif filename.lower().endswith((".xlsx",)):
|
||||
from app.utils.office_parsers import parse_excel_to_markdown
|
||||
|
||||
if notification:
|
||||
await (
|
||||
NotificationService.document_processing.notify_processing_progress(
|
||||
session,
|
||||
notification,
|
||||
stage="parsing",
|
||||
stage_message="Parsing spreadsheet",
|
||||
)
|
||||
)
|
||||
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Processing Excel file natively: {filename}",
|
||||
{"file_type": "excel", "processing_stage": "native_parse"},
|
||||
)
|
||||
|
||||
markdown_content = await parse_excel_to_markdown(file_path, filename)
|
||||
etl_service = "NATIVE_EXCEL"
|
||||
|
||||
with contextlib.suppress(Exception):
|
||||
os.unlink(file_path)
|
||||
|
||||
else:
|
||||
# Document files - use ETL service
|
||||
from app.services.page_limit_service import (
|
||||
|
|
|
|||
72
surfsense_backend/app/utils/office_parsers.py
Normal file
72
surfsense_backend/app/utils/office_parsers.py
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
"""Native parsers for Office file formats."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
EXCEL_EXTENSIONS = (".xlsx",)
|
||||
|
||||
|
||||
def _format_cell(value: object) -> str:
    """Render one cell value as text that cannot break a markdown table row.

    ``None`` (an empty cell) becomes an empty string; everything else goes
    through ``str()``.
    """
    if value is None:
        return ""
    text = str(value)
    # A raw "|" would open an extra column and a raw line break would end the
    # table row early, so neutralise both.
    text = text.replace("|", "\\|")
    return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")


def _sheet_to_markdown(sheet_name: str, rows) -> list[str]:
    """Build the markdown lines (heading plus table) for one worksheet.

    Args:
        sheet_name: Title used for the ``## `` heading.
        rows: Iterable of cell-value sequences, one per spreadsheet row.

    Returns:
        The lines for this sheet, ending with an empty string that acts as a
        separator, or an empty list when no row contains a non-``None`` cell.
    """
    # Rows whose cells are all None carry no information and are dropped.
    non_empty_rows = [r for r in rows if any(c is not None for c in r)]
    if not non_empty_rows:
        return []

    max_cols = max(len(row) for row in non_empty_rows)
    lines = [f"## {sheet_name}\n"]

    for index, row in enumerate(non_empty_rows):
        cells = [_format_cell(c) for c in row]
        # Pad short rows so every table row has the same number of columns.
        cells.extend([""] * (max_cols - len(cells)))
        lines.append("| " + " | ".join(cells) + " |")
        if index == 0:
            # The first non-empty row is treated as the header row.
            lines.append("| " + " | ".join(["---"] * max_cols) + " |")

    lines.append("")
    return lines


def _parse_excel_sync(file_path: str) -> str:
    """Parse an .xlsx file into markdown tables (synchronous).

    Each sheet that has data becomes a ``## <sheet name>`` heading followed by
    one markdown table whose header is the sheet's first non-empty row. Sheets
    without data are skipped.

    Args:
        file_path: Path of the workbook on disk.

    Returns:
        The concatenated markdown for all sheets; an empty string when no
        sheet contains data.
    """
    from openpyxl import load_workbook

    wb = load_workbook(file_path, read_only=True, data_only=True)
    markdown_parts: list[str] = []
    try:
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            markdown_parts.extend(
                _sheet_to_markdown(sheet_name, ws.iter_rows(values_only=True))
            )
    finally:
        # Release the workbook's file handle even when a sheet fails to parse.
        wb.close()

    return "\n".join(markdown_parts)
|
||||
|
||||
|
||||
async def parse_excel_to_markdown(file_path: str, filename: str = "") -> str:
    """Convert an .xlsx workbook to markdown without blocking the event loop.

    The synchronous parser is run through ``asyncio.to_thread``; the start of
    the parse and its elapsed time are logged.

    Args:
        file_path: Path of the workbook on disk.
        filename: Display name used in the log lines, in the ``# `` title line
            and in the error message. When empty, no title line is emitted.

    Returns:
        The markdown tables, prefixed with ``# <filename>`` when a filename
        was given.

    Raises:
        ValueError: If the parser produced no (non-whitespace) output.
    """
    started_at = time.monotonic()
    # Name of the thread issuing the call, recorded for diagnostics.
    caller_thread = threading.current_thread().name
    logger.info("[excel-parse] START file=%s thread=%s", filename, caller_thread)

    tables_md = await asyncio.to_thread(_parse_excel_sync, file_path)

    logger.info(
        "[excel-parse] END file=%s elapsed=%.2fs",
        filename,
        time.monotonic() - started_at,
    )

    if tables_md.strip():
        if filename:
            return f"# {filename}\n\n" + tables_md
        return tables_md

    raise ValueError(f"No data found in Excel file: {filename or file_path}")
|
||||
|
|
@ -73,6 +73,7 @@ dependencies = [
|
|||
"langchain-daytona>=0.0.2",
|
||||
"pypandoc>=1.16.2",
|
||||
"notion-markdown>=0.7.0",
|
||||
"openpyxl>=3.1.5",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
|
|
|||
129
surfsense_backend/tests/unit/test_office_parsers.py
Normal file
129
surfsense_backend/tests/unit/test_office_parsers.py
Normal file
|
|
@ -0,0 +1,129 @@
|
|||
"""Unit tests for native Office file parsers (no DB, no external services)."""
|
||||
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
from openpyxl import Workbook
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _create_xlsx(sheets: dict[str, list[list]]) -> str:
    """Create a real .xlsx file on disk and return its path.

    ``sheets`` maps sheet name -> list of rows, where each row is a list of
    cell values. The first entry reuses the workbook's default sheet; every
    further entry gets a new sheet.

    The file is created with ``delete=False``, so the caller is responsible
    for removing it.
    """
    wb = Workbook()
    try:
        for index, (name, rows) in enumerate(sheets.items()):
            if index == 0:
                ws = wb.active
                ws.title = name
            else:
                ws = wb.create_sheet(title=name)
            for row in rows:
                ws.append(row)

        # Reserve a unique path and close the handle BEFORE saving: a named
        # temporary file that is still open cannot be reopened by name on
        # Windows, which is what wb.save() has to do.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
            path = tmp.name
        wb.save(path)
    finally:
        wb.close()
    return path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tracer bullet: cell values appear in markdown
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_excel_produces_markdown_with_cell_values():
    """A single-sheet .xlsx with known data produces markdown containing those values."""
    import os

    from app.utils.office_parsers import parse_excel_to_markdown

    path = _create_xlsx(
        {"Sales": [["Product", "Revenue"], ["Widget", 1500], ["Gadget", 3200]]}
    )
    try:
        md = await parse_excel_to_markdown(path, filename="report.xlsx")
    finally:
        # The fixture file is created with delete=False; remove it so test
        # runs do not accumulate temp files.
        os.unlink(path)

    assert "Product" in md
    assert "Revenue" in md
    assert "Widget" in md
    assert "1500" in md
    assert "Gadget" in md
    assert "3200" in md
    assert "report.xlsx" in md
    assert "|" in md
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Multi-sheet workbooks include all sheets
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_excel_includes_all_sheets():
    """Both sheet names and their data appear in the output."""
    import os

    from app.utils.office_parsers import parse_excel_to_markdown

    path = _create_xlsx(
        {
            "Inventory": [["Item", "Qty"], ["Bolts", 200]],
            "Pricing": [["Item", "Price"], ["Bolts", 4.50]],
        }
    )
    try:
        md = await parse_excel_to_markdown(path, filename="multi.xlsx")
    finally:
        # The fixture file is created with delete=False; remove it so test
        # runs do not accumulate temp files.
        os.unlink(path)

    assert "Inventory" in md
    assert "Pricing" in md
    assert "Bolts" in md
    assert "200" in md
    assert "4.5" in md
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Empty spreadsheet raises ValueError
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_excel_raises_on_empty_file():
    """An .xlsx with no data raises ValueError."""
    import os

    from app.utils.office_parsers import parse_excel_to_markdown

    wb = Workbook()
    # Reserve the path and close the handle before saving: an open named
    # temporary file cannot be reopened by name on Windows.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        path = tmp.name
    try:
        try:
            wb.save(path)
        finally:
            wb.close()

        with pytest.raises(ValueError, match="No data found"):
            await parse_excel_to_markdown(path, filename="empty.xlsx")
    finally:
        # delete=False means nothing else removes the file.
        os.unlink(path)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _parse_file_to_markdown routes .xlsx natively (no ETL call)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_file_to_markdown_routes_xlsx_natively():
    """content_extractor._parse_file_to_markdown uses native parser for .xlsx."""
    import os

    from app.connectors.google_drive.content_extractor import _parse_file_to_markdown

    path = _create_xlsx(
        {"Data": [["Name", "Score"], ["Alice", 95], ["Bob", 82]]}
    )
    try:
        md = await _parse_file_to_markdown(path, "grades.xlsx")
    finally:
        # The fixture file is created with delete=False; remove it so test
        # runs do not accumulate temp files.
        os.unlink(path)

    assert "Alice" in md
    assert "95" in md
    assert "Bob" in md
    assert "82" in md
|
||||
101
surfsense_backend/uv.lock
generated
101
surfsense_backend/uv.lock
generated
|
|
@ -1171,7 +1171,7 @@ name = "contourpy"
|
|||
version = "1.3.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "numpy" },
|
||||
{ name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/58/01/1253e6698a07380cd31a736d248a3f2a50a7c88779a1813da27503cadc2a/contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174, upload-time = "2025-07-26T12:03:12.549Z" }
|
||||
wheels = [
|
||||
|
|
@ -2596,6 +2596,7 @@ dependencies = [
|
|||
{ name = "griffecli" },
|
||||
{ name = "griffelib" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/04/56/28a0accac339c164b52a92c6cfc45a903acc0c174caa5c1713803467b533/griffe-2.0.0.tar.gz", hash = "sha256:c68979cd8395422083a51ea7cf02f9c119d889646d99b7b656ee43725de1b80f", size = 293906, upload-time = "2026-03-23T21:06:53.402Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/8b/94/ee21d41e7eb4f823b94603b9d40f86d3c7fde80eacc2c3c71845476dddaa/griffe-2.0.0-py3-none-any.whl", hash = "sha256:5418081135a391c3e6e757a7f3f156f1a1a746cc7b4023868ff7d5e2f9a980aa", size = 5214, upload-time = "2026-02-09T19:09:44.105Z" },
|
||||
]
|
||||
|
|
@ -2608,6 +2609,7 @@ dependencies = [
|
|||
{ name = "colorama" },
|
||||
{ name = "griffelib" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a4/f8/2e129fd4a86e52e58eefe664de05e7d502decf766e7316cc9e70fdec3e18/griffecli-2.0.0.tar.gz", hash = "sha256:312fa5ebb4ce6afc786356e2d0ce85b06c1c20d45abc42d74f0cda65e159f6ef", size = 56213, upload-time = "2026-03-23T21:06:54.8Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e6/ed/d93f7a447bbf7a935d8868e9617cbe1cadf9ee9ee6bd275d3040fbf93d60/griffecli-2.0.0-py3-none-any.whl", hash = "sha256:9f7cd9ee9b21d55e91689358978d2385ae65c22f307a63fb3269acf3f21e643d", size = 9345, upload-time = "2026-02-09T19:09:42.554Z" },
|
||||
]
|
||||
|
|
@ -2616,6 +2618,7 @@ wheels = [
|
|||
name = "griffelib"
|
||||
version = "2.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/ad/06/eccbd311c9e2b3ca45dbc063b93134c57a1ccc7607c5e545264ad092c4a9/griffelib-2.0.0.tar.gz", hash = "sha256:e504d637a089f5cab9b5daf18f7645970509bf4f53eda8d79ed71cce8bd97934", size = 166312, upload-time = "2026-03-23T21:06:55.954Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/4d/51/c936033e16d12b627ea334aaaaf42229c37620d0f15593456ab69ab48161/griffelib-2.0.0-py3-none-any.whl", hash = "sha256:01284878c966508b6d6f1dbff9b6fa607bc062d8261c5c7253cb285b06422a7f", size = 142004, upload-time = "2026-02-09T19:09:40.561Z" },
|
||||
]
|
||||
|
|
@ -4082,15 +4085,15 @@ name = "matplotlib"
|
|||
version = "3.10.8"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "contourpy" },
|
||||
{ name = "cycler" },
|
||||
{ name = "fonttools" },
|
||||
{ name = "kiwisolver" },
|
||||
{ name = "numpy" },
|
||||
{ name = "packaging" },
|
||||
{ name = "pillow" },
|
||||
{ name = "pyparsing" },
|
||||
{ name = "python-dateutil" },
|
||||
{ name = "contourpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "cycler", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "fonttools", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "kiwisolver", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "packaging", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "pillow", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "pyparsing", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "python-dateutil", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/8a/76/d3c6e3a13fe484ebe7718d14e269c9569c4eb0020a968a327acb3b9a8fe6/matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3", size = 34806269, upload-time = "2025-12-10T22:56:51.155Z" }
|
||||
wheels = [
|
||||
|
|
@ -4201,7 +4204,7 @@ name = "ml-dtypes"
|
|||
version = "0.5.4"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "numpy" },
|
||||
{ name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314, upload-time = "2025-11-17T22:32:31.031Z" }
|
||||
wheels = [
|
||||
|
|
@ -4967,9 +4970,9 @@ name = "ocrmac"
|
|||
version = "1.0.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "click" },
|
||||
{ name = "pillow" },
|
||||
{ name = "pyobjc-framework-vision" },
|
||||
{ name = "click", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
{ name = "pillow", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
{ name = "pyobjc-framework-vision", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/5e/07/3e15ab404f75875c5e48c47163300eb90b7409044d8711fc3aaf52503f2e/ocrmac-1.0.1.tar.gz", hash = "sha256:507fe5e4cbd67b2d03f6729a52bbc11f9d0b58241134eb958a5daafd4b9d93d9", size = 1454317, upload-time = "2026-01-08T16:44:26.412Z" }
|
||||
wheels = [
|
||||
|
|
@ -5003,10 +5006,10 @@ name = "onnx"
|
|||
version = "1.20.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "ml-dtypes" },
|
||||
{ name = "numpy" },
|
||||
{ name = "protobuf" },
|
||||
{ name = "typing-extensions" },
|
||||
{ name = "ml-dtypes", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "protobuf", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "typing-extensions", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/3b/8a/335c03a8683a88a32f9a6bb98899ea6df241a41df64b37b9696772414794/onnx-1.20.1.tar.gz", hash = "sha256:ded16de1df563d51fbc1ad885f2a426f814039d8b5f4feb77febe09c0295ad67", size = 12048980, upload-time = "2026-01-10T01:40:03.043Z" }
|
||||
wheels = [
|
||||
|
|
@ -6493,7 +6496,7 @@ name = "pyobjc-framework-cocoa"
|
|||
version = "12.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pyobjc-core" },
|
||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/02/a3/16ca9a15e77c061a9250afbae2eae26f2e1579eb8ca9462ae2d2c71e1169/pyobjc_framework_cocoa-12.1.tar.gz", hash = "sha256:5556c87db95711b985d5efdaaf01c917ddd41d148b1e52a0c66b1a2e2c5c1640", size = 2772191, upload-time = "2025-11-14T10:13:02.069Z" }
|
||||
wheels = [
|
||||
|
|
@ -6509,8 +6512,8 @@ name = "pyobjc-framework-coreml"
|
|||
version = "12.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pyobjc-core" },
|
||||
{ name = "pyobjc-framework-cocoa" },
|
||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/30/2d/baa9ea02cbb1c200683cb7273b69b4bee5070e86f2060b77e6a27c2a9d7e/pyobjc_framework_coreml-12.1.tar.gz", hash = "sha256:0d1a4216891a18775c9e0170d908714c18e4f53f9dc79fb0f5263b2aa81609ba", size = 40465, upload-time = "2025-11-14T10:14:02.265Z" }
|
||||
wheels = [
|
||||
|
|
@ -6526,8 +6529,8 @@ name = "pyobjc-framework-quartz"
|
|||
version = "12.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pyobjc-core" },
|
||||
{ name = "pyobjc-framework-cocoa" },
|
||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/94/18/cc59f3d4355c9456fc945eae7fe8797003c4da99212dd531ad1b0de8a0c6/pyobjc_framework_quartz-12.1.tar.gz", hash = "sha256:27f782f3513ac88ec9b6c82d9767eef95a5cf4175ce88a1e5a65875fee799608", size = 3159099, upload-time = "2025-11-14T10:21:24.31Z" }
|
||||
wheels = [
|
||||
|
|
@ -6543,10 +6546,10 @@ name = "pyobjc-framework-vision"
|
|||
version = "12.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pyobjc-core" },
|
||||
{ name = "pyobjc-framework-cocoa" },
|
||||
{ name = "pyobjc-framework-coreml" },
|
||||
{ name = "pyobjc-framework-quartz" },
|
||||
{ name = "pyobjc-core", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
{ name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
{ name = "pyobjc-framework-coreml", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
{ name = "pyobjc-framework-quartz", marker = "(python_full_version < '3.13' and sys_platform == 'emscripten') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/c2/5a/08bb3e278f870443d226c141af14205ff41c0274da1e053b72b11dfc9fb2/pyobjc_framework_vision-12.1.tar.gz", hash = "sha256:a30959100e85dcede3a786c544e621ad6eb65ff6abf85721f805822b8c5fe9b0", size = 59538, upload-time = "2025-11-14T10:23:21.979Z" }
|
||||
wheels = [
|
||||
|
|
@ -7916,6 +7919,7 @@ dependencies = [
|
|||
{ name = "notion-client" },
|
||||
{ name = "notion-markdown" },
|
||||
{ name = "numpy" },
|
||||
{ name = "openpyxl" },
|
||||
{ name = "pgvector" },
|
||||
{ name = "playwright" },
|
||||
{ name = "psycopg", extra = ["binary", "pool"] },
|
||||
|
|
@ -7998,6 +8002,7 @@ requires-dist = [
|
|||
{ name = "notion-client", specifier = ">=2.3.0" },
|
||||
{ name = "notion-markdown", specifier = ">=0.7.0" },
|
||||
{ name = "numpy", specifier = ">=1.24.0" },
|
||||
{ name = "openpyxl", specifier = ">=3.1.5" },
|
||||
{ name = "pgvector", specifier = ">=0.3.6" },
|
||||
{ name = "playwright", specifier = ">=1.50.0" },
|
||||
{ name = "psycopg", extras = ["binary", "pool"], specifier = ">=3.3.2" },
|
||||
|
|
@ -8188,11 +8193,11 @@ name = "timm"
|
|||
version = "1.0.25"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "huggingface-hub" },
|
||||
{ name = "pyyaml" },
|
||||
{ name = "safetensors" },
|
||||
{ name = "torch" },
|
||||
{ name = "torchvision" },
|
||||
{ name = "huggingface-hub", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "pyyaml", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "safetensors", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "torch", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "torchvision", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d7/2c/593109822fe735e637382aca6640c1102c19797f7791f1fd1dab2d6c3cb1/timm-1.0.25.tar.gz", hash = "sha256:47f59fc2754725735cc81bb83bcbfce5bec4ebd5d4bb9e69da57daa92fcfa768", size = 2414743, upload-time = "2026-02-23T16:49:00.137Z" }
|
||||
wheels = [
|
||||
|
|
@ -8819,22 +8824,22 @@ name = "unstructured-inference"
|
|||
version = "1.2.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "accelerate" },
|
||||
{ name = "huggingface-hub" },
|
||||
{ name = "matplotlib" },
|
||||
{ name = "numpy" },
|
||||
{ name = "onnx" },
|
||||
{ name = "onnxruntime" },
|
||||
{ name = "opencv-python" },
|
||||
{ name = "pandas" },
|
||||
{ name = "pdfminer-six" },
|
||||
{ name = "pypdfium2" },
|
||||
{ name = "python-multipart" },
|
||||
{ name = "rapidfuzz" },
|
||||
{ name = "scipy" },
|
||||
{ name = "timm" },
|
||||
{ name = "torch" },
|
||||
{ name = "transformers" },
|
||||
{ name = "accelerate", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "huggingface-hub", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "matplotlib", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "numpy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "onnx", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "onnxruntime", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "opencv-python", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "pandas", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "pdfminer-six", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "pypdfium2", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "python-multipart", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "rapidfuzz", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "scipy", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "timm", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "torch", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
{ name = "transformers", marker = "python_full_version < '3.13' or sys_platform != 'win32'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/ce/10/8f3bccfa9f1e0101a402ae1f529e07876541c6b18004747f0e793ed41f9e/unstructured_inference-1.2.0.tar.gz", hash = "sha256:19ca28512f3649c70a759cf2a4e98663e942a1b83c1acdb9506b0445f4862f23", size = 45732, upload-time = "2026-01-30T20:57:58.019Z" }
|
||||
wheels = [
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue