2026-03-13 14:27:42 +00:00
|
|
|
|
"""
|
|
|
|
|
|
Tests for librarian chunked upload operations:
|
|
|
|
|
|
begin_upload, upload_chunk, complete_upload, abort_upload, get_upload_status,
|
|
|
|
|
|
list_uploads, and stream_document.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import base64
|
|
|
|
|
|
import json
|
|
|
|
|
|
import math
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
|
|
|
|
|
|
|
|
from trustgraph.librarian.librarian import Librarian, DEFAULT_CHUNK_SIZE
|
|
|
|
|
|
from trustgraph.exceptions import RequestError
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Helpers
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def _make_librarian(min_chunk_size=1):
    """Create a Librarian with mocked blob_store and table_store.

    The instance is built with ``Librarian.__new__`` so ``__init__`` is not
    run; only the attributes assigned below are set by this helper.

    Args:
        min_chunk_size: Value stored on ``lib.min_chunk_size``.

    Returns:
        The ``Librarian`` instance with ``blob_store`` (``MagicMock``),
        ``table_store`` (``AsyncMock``) and ``load_document`` (``AsyncMock``)
        attached.
    """
    lib = Librarian.__new__(Librarian)
    lib.blob_store = MagicMock()
    lib.table_store = AsyncMock()
    lib.load_document = AsyncMock()
    lib.min_chunk_size = min_chunk_size
    return lib
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _make_doc_metadata(
    doc_id="doc-1", kind="application/pdf", user="alice", title="Test Doc"
):
    """Build a ``MagicMock`` standing in for a document metadata object.

    Args:
        doc_id: Value for ``meta.id``.
        kind: Value for ``meta.kind`` (the tests pass MIME type strings).
        user: Value for ``meta.user``.
        title: Value for ``meta.title``.

    Returns:
        A ``MagicMock`` with ``id``, ``kind``, ``user``, ``title``, ``time``,
        ``comments`` and ``tags`` set; ``time``, ``comments`` and ``tags``
        are fixed values.
    """
    meta = MagicMock()
    meta.id = doc_id
    meta.kind = kind
    meta.user = user
    meta.title = title
    meta.time = 1700000000
    meta.comments = ""
    meta.tags = []
    return meta
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _make_begin_request(
    doc_id="doc-1", kind="application/pdf", user="alice",
    total_size=10_000_000, chunk_size=0
):
    """Build a ``MagicMock`` request for ``begin_upload``.

    Args:
        doc_id: Document id placed in the nested document metadata.
        kind: Document kind placed in the nested document metadata.
        user: User placed in the nested document metadata.
        total_size: Value for ``req.total_size``.
        chunk_size: Value for ``req.chunk_size``.

    Returns:
        A ``MagicMock`` with ``document_metadata``, ``total_size`` and
        ``chunk_size`` set.
    """
    req = MagicMock()
    req.document_metadata = _make_doc_metadata(doc_id=doc_id, kind=kind, user=user)
    req.total_size = total_size
    req.chunk_size = chunk_size
    return req
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _make_upload_chunk_request(upload_id="up-1", chunk_index=0, user="alice", content=b"data"):
    """Build a ``MagicMock`` request for ``upload_chunk``.

    Args:
        upload_id: Value for ``req.upload_id``.
        chunk_index: Value for ``req.chunk_index``.
        user: Value for ``req.user``.
        content: Raw bytes; stored base64-encoded on ``req.content``.

    Returns:
        A ``MagicMock`` carrying the four attributes above.
    """
    req = MagicMock()
    req.upload_id = upload_id
    req.chunk_index = chunk_index
    req.user = user
    # The request carries the payload base64-encoded, not raw.
    req.content = base64.b64encode(content)
    return req
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _make_session(
    user="alice", total_chunks=5, chunk_size=2_000_000,
    total_size=10_000_000, chunks_received=None, object_id="obj-1",
    s3_upload_id="s3-up-1", document_metadata=None, document_id="doc-1",
):
    """Build an upload-session dict as returned by the mocked table store.

    Args:
        user: Value for the ``"user"`` key.
        total_chunks: Value for the ``"total_chunks"`` key.
        chunk_size: Value for the ``"chunk_size"`` key.
        total_size: Value for the ``"total_size"`` key.
        chunks_received: Mapping stored under ``"chunks_received"``;
            a fresh empty dict is used when ``None``.
        object_id: Value for the ``"object_id"`` key.
        s3_upload_id: Value for the ``"s3_upload_id"`` key.
        document_metadata: JSON string stored under ``"document_metadata"``;
            when ``None`` a default document is serialised from
            ``document_id`` and ``user``.
        document_id: Value for the ``"document_id"`` key.

    Returns:
        A plain ``dict`` describing the session.
    """
    # None sentinels avoid sharing a mutable default between calls.
    if chunks_received is None:
        chunks_received = {}
    if document_metadata is None:
        document_metadata = json.dumps({
            "id": document_id, "kind": "application/pdf",
            "user": user, "title": "Test", "time": 1700000000,
            "comments": "", "tags": [],
        })
    return {
        "user": user,
        "total_chunks": total_chunks,
        "chunk_size": chunk_size,
        "total_size": total_size,
        "chunks_received": chunks_received,
        "object_id": object_id,
        "s3_upload_id": s3_upload_id,
        "document_metadata": document_metadata,
        "document_id": document_id,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# begin_upload
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
class TestBeginUpload:
    """Tests for ``Librarian.begin_upload``."""

    @pytest.mark.asyncio
    async def test_creates_session(self):
        """A request with chunk_size=0 gets DEFAULT_CHUNK_SIZE and an upload id."""
        lib = _make_librarian()
        lib.table_store.document_exists.return_value = False
        lib.blob_store.create_multipart_upload.return_value = "s3-upload-id"

        req = _make_begin_request(total_size=10_000_000)
        resp = await lib.begin_upload(req)

        assert resp.error is None
        assert resp.upload_id is not None
        assert resp.total_chunks == math.ceil(10_000_000 / DEFAULT_CHUNK_SIZE)
        assert resp.chunk_size == DEFAULT_CHUNK_SIZE

    @pytest.mark.asyncio
    async def test_custom_chunk_size(self):
        """An explicit chunk size is echoed back and drives total_chunks."""
        lib = _make_librarian()
        lib.table_store.document_exists.return_value = False
        lib.blob_store.create_multipart_upload.return_value = "s3-id"

        req = _make_begin_request(total_size=10_000, chunk_size=3000)
        resp = await lib.begin_upload(req)

        assert resp.chunk_size == 3000
        assert resp.total_chunks == math.ceil(10_000 / 3000)

    @pytest.mark.asyncio
    async def test_rejects_empty_kind(self):
        """An empty document kind raises RequestError."""
        lib = _make_librarian()

        req = _make_begin_request(kind="")

        with pytest.raises(RequestError, match="MIME type.*required"):
            await lib.begin_upload(req)

    @pytest.mark.asyncio
    async def test_rejects_duplicate_document(self):
        """A document the table store reports as existing is rejected."""
        lib = _make_librarian()
        lib.table_store.document_exists.return_value = True

        req = _make_begin_request()
        with pytest.raises(RequestError, match="already exists"):
            await lib.begin_upload(req)

    @pytest.mark.asyncio
    async def test_rejects_zero_size(self):
        """A total_size of zero is rejected."""
        lib = _make_librarian()
        lib.table_store.document_exists.return_value = False

        req = _make_begin_request(total_size=0)
        with pytest.raises(RequestError, match="positive"):
            await lib.begin_upload(req)

    @pytest.mark.asyncio
    async def test_rejects_chunk_below_minimum(self):
        """A chunk size under the librarian's min_chunk_size is rejected."""
        lib = _make_librarian(min_chunk_size=1024)
        lib.table_store.document_exists.return_value = False

        req = _make_begin_request(total_size=10_000, chunk_size=512)
        with pytest.raises(RequestError, match="below minimum"):
            await lib.begin_upload(req)

    @pytest.mark.asyncio
    async def test_calls_s3_create_multipart(self):
        """The blob store's multipart upload is created with the document kind."""
        lib = _make_librarian()
        lib.table_store.document_exists.return_value = False
        lib.blob_store.create_multipart_upload.return_value = "s3-id"

        req = _make_begin_request(kind="application/pdf")
        await lib.begin_upload(req)

        lib.blob_store.create_multipart_upload.assert_called_once()
        # create_multipart_upload(object_id, kind) — positional args
        args = lib.blob_store.create_multipart_upload.call_args[0]
        assert args[1] == "application/pdf"

    @pytest.mark.asyncio
    async def test_stores_session_in_cassandra(self):
        """The session is persisted with the values returned in the response."""
        lib = _make_librarian()
        lib.table_store.document_exists.return_value = False
        lib.blob_store.create_multipart_upload.return_value = "s3-id"

        req = _make_begin_request(total_size=5_000_000)
        resp = await lib.begin_upload(req)

        lib.table_store.create_upload_session.assert_called_once()
        kwargs = lib.table_store.create_upload_session.call_args[1]
        assert kwargs["upload_id"] == resp.upload_id
        assert kwargs["total_size"] == 5_000_000
        assert kwargs["total_chunks"] == resp.total_chunks

    @pytest.mark.asyncio
    async def test_accepts_text_plain(self):
        """A text/plain document is accepted without error."""
        lib = _make_librarian()
        lib.table_store.document_exists.return_value = False
        lib.blob_store.create_multipart_upload.return_value = "s3-id"

        req = _make_begin_request(kind="text/plain", total_size=1000)
        resp = await lib.begin_upload(req)
        assert resp.error is None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# upload_chunk
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
class TestUploadChunk:
    """Tests for ``Librarian.upload_chunk``."""

    @pytest.mark.asyncio
    async def test_successful_chunk_upload(self):
        """Uploading the first chunk of an empty session reports progress."""
        lib = _make_librarian()
        session = _make_session(total_chunks=5, chunks_received={})
        lib.table_store.get_upload_session.return_value = session
        lib.blob_store.upload_part.return_value = "etag-1"

        req = _make_upload_chunk_request(chunk_index=0, content=b"chunk data")
        resp = await lib.upload_chunk(req)

        assert resp.error is None
        assert resp.chunk_index == 0
        assert resp.total_chunks == 5
        # The chunk is added to the dict (len=1), then +1 applied => 2
        assert resp.chunks_received == 2

    @pytest.mark.asyncio
    async def test_s3_part_number_is_1_indexed(self):
        """Chunk index 0 is sent to the blob store as part number 1."""
        lib = _make_librarian()
        session = _make_session()
        lib.table_store.get_upload_session.return_value = session
        lib.blob_store.upload_part.return_value = "etag"

        req = _make_upload_chunk_request(chunk_index=0)
        await lib.upload_chunk(req)

        kwargs = lib.blob_store.upload_part.call_args[1]
        assert kwargs["part_number"] == 1  # 0-indexed chunk → 1-indexed part

    @pytest.mark.asyncio
    async def test_chunk_index_3_becomes_part_4(self):
        """Chunk index 3 is sent to the blob store as part number 4."""
        lib = _make_librarian()
        session = _make_session()
        lib.table_store.get_upload_session.return_value = session
        lib.blob_store.upload_part.return_value = "etag"

        req = _make_upload_chunk_request(chunk_index=3)
        await lib.upload_chunk(req)

        kwargs = lib.blob_store.upload_part.call_args[1]
        assert kwargs["part_number"] == 4

    @pytest.mark.asyncio
    async def test_rejects_expired_session(self):
        """A missing session (table store returns None) raises RequestError."""
        lib = _make_librarian()
        lib.table_store.get_upload_session.return_value = None

        req = _make_upload_chunk_request()
        with pytest.raises(RequestError, match="not found"):
            await lib.upload_chunk(req)

    @pytest.mark.asyncio
    async def test_rejects_wrong_user(self):
        """A request from a user other than the session owner is rejected."""
        lib = _make_librarian()
        session = _make_session(user="alice")
        lib.table_store.get_upload_session.return_value = session

        req = _make_upload_chunk_request(user="bob")
        with pytest.raises(RequestError, match="Not authorized"):
            await lib.upload_chunk(req)

    @pytest.mark.asyncio
    async def test_rejects_negative_chunk_index(self):
        """A negative chunk index is rejected."""
        lib = _make_librarian()
        session = _make_session(total_chunks=5)
        lib.table_store.get_upload_session.return_value = session

        req = _make_upload_chunk_request(chunk_index=-1)
        with pytest.raises(RequestError, match="Invalid chunk index"):
            await lib.upload_chunk(req)

    @pytest.mark.asyncio
    async def test_rejects_out_of_range_chunk_index(self):
        """A chunk index equal to total_chunks is rejected."""
        lib = _make_librarian()
        session = _make_session(total_chunks=5)
        lib.table_store.get_upload_session.return_value = session

        req = _make_upload_chunk_request(chunk_index=5)
        with pytest.raises(RequestError, match="Invalid chunk index"):
            await lib.upload_chunk(req)

    @pytest.mark.asyncio
    async def test_progress_tracking(self):
        """Progress fields reflect previously received chunks plus the new one."""
        lib = _make_librarian()
        session = _make_session(
            total_chunks=4, chunk_size=1000, total_size=3500,
            chunks_received={0: "e1", 1: "e2"},
        )
        lib.table_store.get_upload_session.return_value = session
        lib.blob_store.upload_part.return_value = "e3"

        req = _make_upload_chunk_request(chunk_index=2)
        resp = await lib.upload_chunk(req)

        # Dict gets chunk 2 added (len=3), then +1 => 4
        assert resp.chunks_received == 4
        assert resp.total_chunks == 4
        assert resp.total_bytes == 3500

    @pytest.mark.asyncio
    async def test_bytes_capped_at_total_size(self):
        """bytes_received should not exceed total_size for the final chunk."""
        lib = _make_librarian()
        session = _make_session(
            total_chunks=2, chunk_size=3000, total_size=5000,
            chunks_received={0: "e1"},
        )
        lib.table_store.get_upload_session.return_value = session
        lib.blob_store.upload_part.return_value = "e2"

        req = _make_upload_chunk_request(chunk_index=1)
        resp = await lib.upload_chunk(req)

        # 3 chunks × 3000 = 9000 > 5000, so capped
        assert resp.bytes_received <= 5000

    @pytest.mark.asyncio
    async def test_base64_decodes_content(self):
        """The blob store receives the decoded bytes, not the base64 text."""
        lib = _make_librarian()
        session = _make_session()
        lib.table_store.get_upload_session.return_value = session
        lib.blob_store.upload_part.return_value = "etag"

        raw = b"hello world binary data"
        req = _make_upload_chunk_request(content=raw)
        await lib.upload_chunk(req)

        kwargs = lib.blob_store.upload_part.call_args[1]
        assert kwargs["data"] == raw
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# complete_upload
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
class TestCompleteUpload:
    """Tests for ``Librarian.complete_upload``."""

    @pytest.mark.asyncio
    async def test_successful_completion(self):
        """With all chunks present the upload is finalised and the session removed."""
        lib = _make_librarian()
        session = _make_session(
            total_chunks=3,
            chunks_received={0: "e1", 1: "e2", 2: "e3"},
        )
        lib.table_store.get_upload_session.return_value = session

        req = MagicMock()
        req.upload_id = "up-1"
        req.user = "alice"

        resp = await lib.complete_upload(req)

        assert resp.error is None
        assert resp.document_id == "doc-1"
        lib.blob_store.complete_multipart_upload.assert_called_once()
        lib.table_store.add_document.assert_called_once()
        lib.table_store.delete_upload_session.assert_called_once_with("up-1")

    @pytest.mark.asyncio
    async def test_parts_sorted_by_index(self):
        """Parts are passed to the blob store in ascending, 1-indexed order."""
        lib = _make_librarian()
        # Chunks received out of order
        session = _make_session(
            total_chunks=3,
            chunks_received={2: "e3", 0: "e1", 1: "e2"},
        )
        lib.table_store.get_upload_session.return_value = session

        req = MagicMock()
        req.upload_id = "up-1"
        req.user = "alice"

        await lib.complete_upload(req)

        parts = lib.blob_store.complete_multipart_upload.call_args[1]["parts"]
        part_numbers = [p[0] for p in parts]
        assert part_numbers == [1, 2, 3]  # Sorted, 1-indexed

    @pytest.mark.asyncio
    async def test_rejects_missing_chunks(self):
        """Completion fails when a chunk index is absent from the session."""
        lib = _make_librarian()
        session = _make_session(
            total_chunks=3,
            chunks_received={0: "e1", 2: "e3"},  # chunk 1 missing
        )
        lib.table_store.get_upload_session.return_value = session

        req = MagicMock()
        req.upload_id = "up-1"
        req.user = "alice"

        with pytest.raises(RequestError, match="Missing chunks"):
            await lib.complete_upload(req)

    @pytest.mark.asyncio
    async def test_rejects_expired_session(self):
        """A missing session (table store returns None) raises RequestError."""
        lib = _make_librarian()
        lib.table_store.get_upload_session.return_value = None

        req = MagicMock()
        req.upload_id = "up-gone"
        req.user = "alice"

        with pytest.raises(RequestError, match="not found"):
            await lib.complete_upload(req)

    @pytest.mark.asyncio
    async def test_rejects_wrong_user(self):
        """A request from a user other than the session owner is rejected."""
        lib = _make_librarian()
        session = _make_session(user="alice")
        lib.table_store.get_upload_session.return_value = session

        req = MagicMock()
        req.upload_id = "up-1"
        req.user = "bob"

        with pytest.raises(RequestError, match="Not authorized"):
            await lib.complete_upload(req)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# abort_upload
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
class TestAbortUpload:
    """Tests for ``Librarian.abort_upload``."""

    @pytest.mark.asyncio
    async def test_aborts_and_cleans_up(self):
        """Aborting cancels the multipart upload and deletes the session."""
        lib = _make_librarian()
        session = _make_session()
        lib.table_store.get_upload_session.return_value = session

        req = MagicMock()
        req.upload_id = "up-1"
        req.user = "alice"

        resp = await lib.abort_upload(req)

        assert resp.error is None
        lib.blob_store.abort_multipart_upload.assert_called_once_with(
            object_id="obj-1", upload_id="s3-up-1"
        )
        lib.table_store.delete_upload_session.assert_called_once_with("up-1")

    @pytest.mark.asyncio
    async def test_rejects_expired_session(self):
        """A missing session (table store returns None) raises RequestError."""
        lib = _make_librarian()
        lib.table_store.get_upload_session.return_value = None

        req = MagicMock()
        req.upload_id = "up-gone"
        req.user = "alice"

        with pytest.raises(RequestError, match="not found"):
            await lib.abort_upload(req)

    @pytest.mark.asyncio
    async def test_rejects_wrong_user(self):
        """A request from a user other than the session owner is rejected."""
        lib = _make_librarian()
        session = _make_session(user="alice")
        lib.table_store.get_upload_session.return_value = session

        req = MagicMock()
        req.upload_id = "up-1"
        req.user = "bob"

        with pytest.raises(RequestError, match="Not authorized"):
            await lib.abort_upload(req)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# get_upload_status
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
class TestGetUploadStatus:
    """Tests for ``Librarian.get_upload_status``."""

    @pytest.mark.asyncio
    async def test_in_progress_status(self):
        """A partially uploaded session reports received and missing chunks."""
        lib = _make_librarian()
        session = _make_session(
            total_chunks=5, chunk_size=2000, total_size=10_000,
            chunks_received={0: "e1", 2: "e3", 4: "e5"},
        )
        lib.table_store.get_upload_session.return_value = session

        req = MagicMock()
        req.upload_id = "up-1"
        req.user = "alice"

        resp = await lib.get_upload_status(req)

        assert resp.upload_state == "in-progress"
        assert resp.chunks_received == 3
        assert resp.total_chunks == 5
        assert sorted(resp.received_chunks) == [0, 2, 4]
        assert sorted(resp.missing_chunks) == [1, 3]
        assert resp.total_bytes == 10_000

    @pytest.mark.asyncio
    async def test_expired_session(self):
        """A missing session is reported as state "expired" rather than raising."""
        lib = _make_librarian()
        lib.table_store.get_upload_session.return_value = None

        req = MagicMock()
        req.upload_id = "up-expired"
        req.user = "alice"

        resp = await lib.get_upload_status(req)

        assert resp.upload_state == "expired"

    @pytest.mark.asyncio
    async def test_all_chunks_received(self):
        """With every chunk present nothing is missing and bytes are capped."""
        lib = _make_librarian()
        session = _make_session(
            total_chunks=3, chunk_size=1000, total_size=2500,
            chunks_received={0: "e1", 1: "e2", 2: "e3"},
        )
        lib.table_store.get_upload_session.return_value = session

        req = MagicMock()
        req.upload_id = "up-1"
        req.user = "alice"

        resp = await lib.get_upload_status(req)

        assert resp.missing_chunks == []
        assert resp.chunks_received == 3
        # 3 * 1000 = 3000 > 2500, so capped
        assert resp.bytes_received <= 2500

    @pytest.mark.asyncio
    async def test_rejects_wrong_user(self):
        """A request from a user other than the session owner is rejected."""
        lib = _make_librarian()
        session = _make_session(user="alice")
        lib.table_store.get_upload_session.return_value = session

        req = MagicMock()
        req.upload_id = "up-1"
        req.user = "bob"

        with pytest.raises(RequestError, match="Not authorized"):
            await lib.get_upload_status(req)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# stream_document
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
class TestStreamDocument:
    """Tests for the ``Librarian.stream_document`` async generator."""

    @pytest.mark.asyncio
    async def test_streams_chunks_with_progress(self):
        """A 5000-byte object streamed at 2000 bytes yields three chunks."""
        lib = _make_librarian()
        lib.table_store.get_document_object_id.return_value = "obj-1"
        lib.blob_store.get_size = AsyncMock(return_value=5000)
        lib.blob_store.get_range = AsyncMock(return_value=b"x" * 2000)

        req = MagicMock()
        req.user = "alice"
        req.document_id = "doc-1"
        req.chunk_size = 2000

        chunks = [resp async for resp in lib.stream_document(req)]

        assert len(chunks) == 3  # ceil(5000/2000)
        assert chunks[0].chunk_index == 0
        assert chunks[0].total_chunks == 3
        assert chunks[0].is_final is False
        assert chunks[-1].is_final is True
        assert chunks[-1].chunk_index == 2

    @pytest.mark.asyncio
    async def test_single_chunk_document(self):
        """An object smaller than the chunk size yields one final chunk."""
        lib = _make_librarian()
        lib.table_store.get_document_object_id.return_value = "obj-1"
        lib.blob_store.get_size = AsyncMock(return_value=500)
        lib.blob_store.get_range = AsyncMock(return_value=b"x" * 500)

        req = MagicMock()
        req.user = "alice"
        req.document_id = "doc-1"
        req.chunk_size = 2000

        chunks = [resp async for resp in lib.stream_document(req)]

        assert len(chunks) == 1
        assert chunks[0].is_final is True
        assert chunks[0].bytes_received == 500
        assert chunks[0].total_bytes == 500

    @pytest.mark.asyncio
    async def test_byte_ranges_correct(self):
        """get_range is called with (object_id, offset, length) per chunk."""
        lib = _make_librarian()
        lib.table_store.get_document_object_id.return_value = "obj-1"
        lib.blob_store.get_size = AsyncMock(return_value=5000)
        lib.blob_store.get_range = AsyncMock(return_value=b"x" * 100)

        req = MagicMock()
        req.user = "alice"
        req.document_id = "doc-1"
        req.chunk_size = 2000

        # Drain the generator; only the recorded get_range calls matter here.
        chunks = [resp async for resp in lib.stream_document(req)]

        # Verify the byte ranges passed to get_range
        calls = lib.blob_store.get_range.call_args_list
        assert calls[0][0] == ("obj-1", 0, 2000)
        assert calls[1][0] == ("obj-1", 2000, 2000)
        assert calls[2][0] == ("obj-1", 4000, 1000)  # Last chunk: 5000-4000

    @pytest.mark.asyncio
    async def test_default_chunk_size(self):
        """A chunk_size of 0 falls back to the default chunk size."""
        lib = _make_librarian()
        lib.table_store.get_document_object_id.return_value = "obj-1"
        lib.blob_store.get_size = AsyncMock(return_value=2_000_000)
        lib.blob_store.get_range = AsyncMock(return_value=b"x")

        req = MagicMock()
        req.user = "alice"
        req.document_id = "doc-1"
        req.chunk_size = 0  # Should use default 1MB

        chunks = [resp async for resp in lib.stream_document(req)]

        assert len(chunks) == 2  # ceil(2MB / 1MB)

    @pytest.mark.asyncio
    async def test_content_is_base64_encoded(self):
        """Chunk content is the base64 encoding of the bytes from get_range."""
        lib = _make_librarian()
        lib.table_store.get_document_object_id.return_value = "obj-1"
        lib.blob_store.get_size = AsyncMock(return_value=100)
        raw = b"hello world"
        lib.blob_store.get_range = AsyncMock(return_value=raw)

        req = MagicMock()
        req.user = "alice"
        req.document_id = "doc-1"
        req.chunk_size = 1000

        chunks = [resp async for resp in lib.stream_document(req)]

        assert chunks[0].content == base64.b64encode(raw)

    @pytest.mark.asyncio
    async def test_rejects_chunk_below_minimum(self):
        """A chunk size under the librarian's min_chunk_size is rejected."""
        lib = _make_librarian(min_chunk_size=1024)
        lib.table_store.get_document_object_id.return_value = "obj-1"
        lib.blob_store.get_size = AsyncMock(return_value=5000)

        req = MagicMock()
        req.user = "alice"
        req.document_id = "doc-1"
        req.chunk_size = 512

        # The generator must be iterated for its body (and the check) to run.
        with pytest.raises(RequestError, match="below minimum"):
            async for _ in lib.stream_document(req):
                pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# list_uploads
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
class TestListUploads:
    """Tests for ``Librarian.list_uploads``."""

    @pytest.mark.asyncio
    async def test_returns_sessions(self):
        """Sessions from the table store are returned as upload session entries."""
        lib = _make_librarian()
        lib.table_store.list_upload_sessions.return_value = [
            {
                "upload_id": "up-1",
                "document_id": "doc-1",
                "document_metadata": '{"id":"doc-1"}',
                "total_size": 10000,
                "chunk_size": 2000,
                "total_chunks": 5,
                "chunks_received": {0: "e1", 1: "e2"},
                "created_at": "2024-01-01",
            },
        ]

        req = MagicMock()
        req.user = "alice"

        resp = await lib.list_uploads(req)

        assert resp.error is None
        assert len(resp.upload_sessions) == 1
        assert resp.upload_sessions[0].upload_id == "up-1"
        assert resp.upload_sessions[0].total_chunks == 5

    @pytest.mark.asyncio
    async def test_empty_uploads(self):
        """No stored sessions yields an empty upload_sessions list."""
        lib = _make_librarian()
        lib.table_store.list_upload_sessions.return_value = []

        req = MagicMock()
        req.user = "alice"

        resp = await lib.list_uploads(req)

        assert resp.upload_sessions == []
|