trustgraph/tests/unit/test_librarian/test_chunked_upload.py
cybermaggedon 29b4300808
Updated test suite for explainability & provenance (#696)
* Provenance tests

* Embeddings tests

* Test librarian

* Test triples stream

* Test concurrency

* Entity centric graph writes

* Agent tool service tests

* Structured data tests

* RDF tests

* Additional LLM tests

* Reliability tests
2026-03-13 14:27:42 +00:00

716 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Tests for librarian chunked upload operations:
begin_upload, upload_chunk, complete_upload, abort_upload, get_upload_status,
list_uploads, and stream_document.
"""
import base64
import json
import math
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from trustgraph.librarian.librarian import Librarian, DEFAULT_CHUNK_SIZE
from trustgraph.exceptions import RequestError
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_librarian(min_chunk_size=1):
    """Build a Librarian without running its constructor.

    The instance is allocated with ``Librarian.__new__`` and then given a
    ``MagicMock`` blob store, an ``AsyncMock`` table store, an ``AsyncMock``
    ``load_document`` and the requested ``min_chunk_size``.
    """
    librarian = Librarian.__new__(Librarian)
    stubs = {
        "blob_store": MagicMock(),
        "table_store": AsyncMock(),
        "load_document": AsyncMock(),
        "min_chunk_size": min_chunk_size,
    }
    for attr_name, stub in stubs.items():
        setattr(librarian, attr_name, stub)
    return librarian
def _make_doc_metadata(
    doc_id="doc-1", kind="application/pdf", user="alice", title="Test Doc",
    time=1700000000, comments="", tags=None,
):
    """Return a ``MagicMock`` standing in for a document-metadata object.

    Args:
        doc_id: Value exposed as ``meta.id``.
        kind: Value exposed as ``meta.kind``.
        user: Value exposed as ``meta.user``.
        title: Value exposed as ``meta.title``.
        time: Value exposed as ``meta.time``.
        comments: Value exposed as ``meta.comments``.
        tags: Iterable copied into a fresh list for ``meta.tags``; ``None``
            (the default) yields an empty list.

    Returns:
        A ``MagicMock`` with the attributes above set explicitly.
    """
    meta = MagicMock()
    meta.id = doc_id
    meta.kind = kind
    meta.user = user
    meta.title = title
    meta.time = time
    meta.comments = comments
    # Always hand out a new list so separate mocks never share tag state.
    meta.tags = [] if tags is None else list(tags)
    return meta
def _make_begin_request(
    doc_id="doc-1", kind="application/pdf", user="alice",
    total_size=10_000_000, chunk_size=0
):
    """Return a mocked begin-upload request.

    The request carries ``document_metadata`` (built by
    ``_make_doc_metadata`` from *doc_id*, *kind* and *user*), ``total_size``
    and ``chunk_size``.
    """
    metadata = _make_doc_metadata(doc_id=doc_id, kind=kind, user=user)
    return MagicMock(
        document_metadata=metadata,
        total_size=total_size,
        chunk_size=chunk_size,
    )
def _make_upload_chunk_request(upload_id="up-1", chunk_index=0, user="alice", content=b"data"):
    """Return a mocked upload-chunk request.

    *content* is raw bytes; the request exposes it base64-encoded (as
    bytes) on ``req.content``, alongside ``upload_id``, ``chunk_index`` and
    ``user``.
    """
    encoded = base64.b64encode(content)
    return MagicMock(
        upload_id=upload_id,
        chunk_index=chunk_index,
        user=user,
        content=encoded,
    )
def _make_session(
    user="alice", total_chunks=5, chunk_size=2_000_000,
    total_size=10_000_000, chunks_received=None, object_id="obj-1",
    s3_upload_id="s3-up-1", document_metadata=None, document_id="doc-1",
):
    """Return an upload-session dict of the shape these tests feed the librarian.

    ``chunks_received`` defaults to a new empty dict; a dict passed in is
    used as-is (not copied).  ``document_metadata`` defaults to a JSON string
    describing a PDF owned by *user* with id *document_id*.
    """
    received = {} if chunks_received is None else chunks_received

    if document_metadata is None:
        default_metadata = dict(
            id=document_id,
            kind="application/pdf",
            user=user,
            title="Test",
            time=1700000000,
            comments="",
            tags=[],
        )
        document_metadata = json.dumps(default_metadata)

    return dict(
        user=user,
        total_chunks=total_chunks,
        chunk_size=chunk_size,
        total_size=total_size,
        chunks_received=received,
        object_id=object_id,
        s3_upload_id=s3_upload_id,
        document_metadata=document_metadata,
        document_id=document_id,
    )
# ---------------------------------------------------------------------------
# begin_upload
# ---------------------------------------------------------------------------
class TestBeginUpload:
    """Tests for ``Librarian.begin_upload``: validation and session creation."""

    @staticmethod
    def _librarian_for_new_document(s3_upload_id=None, min_chunk_size=1):
        """Return a mocked librarian whose table store reports no existing document.

        When *s3_upload_id* is given, it becomes the return value of the
        blob store's ``create_multipart_upload``.
        """
        librarian = _make_librarian(min_chunk_size=min_chunk_size)
        librarian.table_store.document_exists.return_value = False
        if s3_upload_id is not None:
            librarian.blob_store.create_multipart_upload.return_value = s3_upload_id
        return librarian

    @pytest.mark.asyncio
    async def test_creates_session(self):
        """With chunk_size=0 the response uses DEFAULT_CHUNK_SIZE."""
        librarian = self._librarian_for_new_document("s3-upload-id")
        size = 10_000_000
        request = _make_begin_request(total_size=size)

        response = await librarian.begin_upload(request)

        assert response.error is None
        assert response.upload_id is not None
        assert response.total_chunks == math.ceil(size / DEFAULT_CHUNK_SIZE)
        assert response.chunk_size == DEFAULT_CHUNK_SIZE

    @pytest.mark.asyncio
    async def test_custom_chunk_size(self):
        """A caller-supplied chunk size is echoed back and drives the chunk count."""
        librarian = self._librarian_for_new_document("s3-id")
        size, chunk = 10_000, 3000
        request = _make_begin_request(total_size=size, chunk_size=chunk)

        response = await librarian.begin_upload(request)

        assert response.chunk_size == chunk
        assert response.total_chunks == math.ceil(size / chunk)

    @pytest.mark.asyncio
    async def test_rejects_invalid_kind(self):
        """A kind of image/png is refused."""
        librarian = _make_librarian()
        request = _make_begin_request(kind="image/png")

        with pytest.raises(RequestError, match="Invalid document kind"):
            await librarian.begin_upload(request)

    @pytest.mark.asyncio
    async def test_rejects_duplicate_document(self):
        """An upload for a document the table store already knows is refused."""
        librarian = _make_librarian()
        librarian.table_store.document_exists.return_value = True
        request = _make_begin_request()

        with pytest.raises(RequestError, match="already exists"):
            await librarian.begin_upload(request)

    @pytest.mark.asyncio
    async def test_rejects_zero_size(self):
        """total_size=0 is refused."""
        librarian = self._librarian_for_new_document()
        request = _make_begin_request(total_size=0)

        with pytest.raises(RequestError, match="positive"):
            await librarian.begin_upload(request)

    @pytest.mark.asyncio
    async def test_rejects_chunk_below_minimum(self):
        """A chunk size under the librarian's min_chunk_size is refused."""
        librarian = self._librarian_for_new_document(min_chunk_size=1024)
        request = _make_begin_request(total_size=10_000, chunk_size=512)

        with pytest.raises(RequestError, match="below minimum"):
            await librarian.begin_upload(request)

    @pytest.mark.asyncio
    async def test_calls_s3_create_multipart(self):
        """The blob store is asked exactly once to open a multipart upload."""
        librarian = self._librarian_for_new_document("s3-id")
        request = _make_begin_request(kind="application/pdf")

        await librarian.begin_upload(request)

        create = librarian.blob_store.create_multipart_upload
        create.assert_called_once()
        # The document kind is expected as the second positional argument.
        positional = create.call_args.args
        assert positional[1] == "application/pdf"

    @pytest.mark.asyncio
    async def test_stores_session_in_cassandra(self):
        """The session row is written with the values returned to the caller."""
        librarian = self._librarian_for_new_document("s3-id")
        request = _make_begin_request(total_size=5_000_000)

        response = await librarian.begin_upload(request)

        create_session = librarian.table_store.create_upload_session
        create_session.assert_called_once()
        stored = create_session.call_args.kwargs
        assert stored["upload_id"] == response.upload_id
        assert stored["total_size"] == 5_000_000
        assert stored["total_chunks"] == response.total_chunks

    @pytest.mark.asyncio
    async def test_accepts_text_plain(self):
        """text/plain is an accepted document kind."""
        librarian = self._librarian_for_new_document("s3-id")
        request = _make_begin_request(kind="text/plain", total_size=1000)

        response = await librarian.begin_upload(request)

        assert response.error is None
# ---------------------------------------------------------------------------
# upload_chunk
# ---------------------------------------------------------------------------
class TestUploadChunk:
    """Tests for ``Librarian.upload_chunk``."""

    @staticmethod
    def _librarian_with_session(session, etag=None):
        """Return a mocked librarian whose table store yields *session*.

        When *etag* is given, it becomes the return value of the blob
        store's ``upload_part``.
        """
        librarian = _make_librarian()
        librarian.table_store.get_upload_session.return_value = session
        if etag is not None:
            librarian.blob_store.upload_part.return_value = etag
        return librarian

    @pytest.mark.asyncio
    async def test_successful_chunk_upload(self):
        """Uploading the first chunk of five reports progress without error."""
        session = _make_session(total_chunks=5, chunks_received={})
        librarian = self._librarian_with_session(session, etag="etag-1")
        request = _make_upload_chunk_request(chunk_index=0, content=b"chunk data")

        response = await librarian.upload_chunk(request)

        assert response.error is None
        assert response.chunk_index == 0
        assert response.total_chunks == 5
        # NOTE(review): the value pinned here is 2 after a single chunk; the
        # original test explains it as "dict length 1, then +1". Confirm this
        # is intended librarian behaviour rather than an off-by-one.
        assert response.chunks_received == 2

    @pytest.mark.asyncio
    async def test_s3_part_number_is_1_indexed(self):
        """Chunk index 0 is sent to the blob store as part number 1."""
        librarian = self._librarian_with_session(_make_session(), etag="etag")
        request = _make_upload_chunk_request(chunk_index=0)

        await librarian.upload_chunk(request)

        sent = librarian.blob_store.upload_part.call_args.kwargs
        assert sent["part_number"] == 1

    @pytest.mark.asyncio
    async def test_chunk_index_3_becomes_part_4(self):
        """Chunk index 3 is sent to the blob store as part number 4."""
        librarian = self._librarian_with_session(_make_session(), etag="etag")
        request = _make_upload_chunk_request(chunk_index=3)

        await librarian.upload_chunk(request)

        sent = librarian.blob_store.upload_part.call_args.kwargs
        assert sent["part_number"] == 4

    @pytest.mark.asyncio
    async def test_rejects_expired_session(self):
        """A missing session (table store returns None) is an error."""
        librarian = self._librarian_with_session(None)
        request = _make_upload_chunk_request()

        with pytest.raises(RequestError, match="not found"):
            await librarian.upload_chunk(request)

    @pytest.mark.asyncio
    async def test_rejects_wrong_user(self):
        """A chunk from a user other than the session owner is refused."""
        librarian = self._librarian_with_session(_make_session(user="alice"))
        request = _make_upload_chunk_request(user="bob")

        with pytest.raises(RequestError, match="Not authorized"):
            await librarian.upload_chunk(request)

    @pytest.mark.asyncio
    async def test_rejects_negative_chunk_index(self):
        """chunk_index=-1 is refused."""
        librarian = self._librarian_with_session(_make_session(total_chunks=5))
        request = _make_upload_chunk_request(chunk_index=-1)

        with pytest.raises(RequestError, match="Invalid chunk index"):
            await librarian.upload_chunk(request)

    @pytest.mark.asyncio
    async def test_rejects_out_of_range_chunk_index(self):
        """chunk_index equal to total_chunks is refused."""
        librarian = self._librarian_with_session(_make_session(total_chunks=5))
        request = _make_upload_chunk_request(chunk_index=5)

        with pytest.raises(RequestError, match="Invalid chunk index"):
            await librarian.upload_chunk(request)

    @pytest.mark.asyncio
    async def test_progress_tracking(self):
        """Progress fields reflect the session after a third chunk arrives."""
        session = _make_session(
            total_chunks=4,
            chunk_size=1000,
            total_size=3500,
            chunks_received={0: "e1", 1: "e2"},
        )
        librarian = self._librarian_with_session(session, etag="e3")
        request = _make_upload_chunk_request(chunk_index=2)

        response = await librarian.upload_chunk(request)

        # NOTE(review): two prior chunks plus this one is pinned as 4, per
        # the original note "len=3, then +1" - confirm the +1 is intended.
        assert response.chunks_received == 4
        assert response.total_chunks == 4
        assert response.total_bytes == 3500

    @pytest.mark.asyncio
    async def test_bytes_capped_at_total_size(self):
        """bytes_received should not exceed total_size for the final chunk."""
        session = _make_session(
            total_chunks=2,
            chunk_size=3000,
            total_size=5000,
            chunks_received={0: "e1"},
        )
        librarian = self._librarian_with_session(session, etag="e2")
        request = _make_upload_chunk_request(chunk_index=1)

        response = await librarian.upload_chunk(request)

        # Any chunk count times the 3000-byte chunk size would overshoot
        # 5000 here, so the reported figure must be capped.
        assert response.bytes_received <= 5000

    @pytest.mark.asyncio
    async def test_base64_decodes_content(self):
        """The blob store receives the raw bytes, not the base64 text."""
        payload = b"hello world binary data"
        librarian = self._librarian_with_session(_make_session(), etag="etag")
        request = _make_upload_chunk_request(content=payload)

        await librarian.upload_chunk(request)

        sent = librarian.blob_store.upload_part.call_args.kwargs
        assert sent["data"] == payload
# ---------------------------------------------------------------------------
# complete_upload
# ---------------------------------------------------------------------------
class TestCompleteUpload:
    """Tests for ``Librarian.complete_upload``."""

    @staticmethod
    def _request(upload_id="up-1", user="alice"):
        """Return a mocked complete-upload request."""
        request = MagicMock()
        request.upload_id = upload_id
        request.user = user
        return request

    @staticmethod
    def _librarian_with_session(session):
        """Return a mocked librarian whose table store yields *session*."""
        librarian = _make_librarian()
        librarian.table_store.get_upload_session.return_value = session
        return librarian

    @pytest.mark.asyncio
    async def test_successful_completion(self):
        """With every chunk present the upload is finalised and the session removed."""
        session = _make_session(
            total_chunks=3,
            chunks_received={0: "e1", 1: "e2", 2: "e3"},
        )
        librarian = self._librarian_with_session(session)

        response = await librarian.complete_upload(self._request())

        assert response.error is None
        assert response.document_id == "doc-1"
        librarian.blob_store.complete_multipart_upload.assert_called_once()
        librarian.table_store.add_document.assert_called_once()
        librarian.table_store.delete_upload_session.assert_called_once_with("up-1")

    @pytest.mark.asyncio
    async def test_parts_sorted_by_index(self):
        """Parts reach the blob store in ascending, 1-based order."""
        # The dict is deliberately populated out of order.
        session = _make_session(
            total_chunks=3,
            chunks_received={2: "e3", 0: "e1", 1: "e2"},
        )
        librarian = self._librarian_with_session(session)

        await librarian.complete_upload(self._request())

        complete = librarian.blob_store.complete_multipart_upload
        submitted_parts = complete.call_args.kwargs["parts"]
        assert [part[0] for part in submitted_parts] == [1, 2, 3]

    @pytest.mark.asyncio
    async def test_rejects_missing_chunks(self):
        """Completion is refused while chunk 1 of 3 is absent."""
        session = _make_session(
            total_chunks=3,
            chunks_received={0: "e1", 2: "e3"},
        )
        librarian = self._librarian_with_session(session)
        request = self._request()

        with pytest.raises(RequestError, match="Missing chunks"):
            await librarian.complete_upload(request)

    @pytest.mark.asyncio
    async def test_rejects_expired_session(self):
        """A missing session (table store returns None) is an error."""
        librarian = self._librarian_with_session(None)
        request = self._request(upload_id="up-gone")

        with pytest.raises(RequestError, match="not found"):
            await librarian.complete_upload(request)

    @pytest.mark.asyncio
    async def test_rejects_wrong_user(self):
        """Only the session owner may complete the upload."""
        librarian = self._librarian_with_session(_make_session(user="alice"))
        request = self._request(user="bob")

        with pytest.raises(RequestError, match="Not authorized"):
            await librarian.complete_upload(request)
# ---------------------------------------------------------------------------
# abort_upload
# ---------------------------------------------------------------------------
class TestAbortUpload:
    """Tests for ``Librarian.abort_upload``."""

    @staticmethod
    def _request(upload_id="up-1", user="alice"):
        """Return a mocked abort-upload request."""
        request = MagicMock()
        request.upload_id = upload_id
        request.user = user
        return request

    @staticmethod
    def _librarian_with_session(session):
        """Return a mocked librarian whose table store yields *session*."""
        librarian = _make_librarian()
        librarian.table_store.get_upload_session.return_value = session
        return librarian

    @pytest.mark.asyncio
    async def test_aborts_and_cleans_up(self):
        """Aborting cancels the multipart upload and deletes the session."""
        librarian = self._librarian_with_session(_make_session())

        response = await librarian.abort_upload(self._request())

        assert response.error is None
        librarian.blob_store.abort_multipart_upload.assert_called_once_with(
            object_id="obj-1", upload_id="s3-up-1"
        )
        librarian.table_store.delete_upload_session.assert_called_once_with("up-1")

    @pytest.mark.asyncio
    async def test_rejects_expired_session(self):
        """A missing session (table store returns None) is an error."""
        librarian = self._librarian_with_session(None)
        request = self._request(upload_id="up-gone")

        with pytest.raises(RequestError, match="not found"):
            await librarian.abort_upload(request)

    @pytest.mark.asyncio
    async def test_rejects_wrong_user(self):
        """Only the session owner may abort the upload."""
        librarian = self._librarian_with_session(_make_session(user="alice"))
        request = self._request(user="bob")

        with pytest.raises(RequestError, match="Not authorized"):
            await librarian.abort_upload(request)
# ---------------------------------------------------------------------------
# get_upload_status
# ---------------------------------------------------------------------------
class TestGetUploadStatus:
    """Tests for ``Librarian.get_upload_status``."""

    @staticmethod
    def _request(upload_id="up-1", user="alice"):
        """Return a mocked upload-status request."""
        request = MagicMock()
        request.upload_id = upload_id
        request.user = user
        return request

    @staticmethod
    def _librarian_with_session(session):
        """Return a mocked librarian whose table store yields *session*."""
        librarian = _make_librarian()
        librarian.table_store.get_upload_session.return_value = session
        return librarian

    @pytest.mark.asyncio
    async def test_in_progress_status(self):
        """A partially uploaded session lists received and missing chunks."""
        session = _make_session(
            total_chunks=5,
            chunk_size=2000,
            total_size=10_000,
            chunks_received={0: "e1", 2: "e3", 4: "e5"},
        )
        librarian = self._librarian_with_session(session)

        status = await librarian.get_upload_status(self._request())

        assert status.upload_state == "in-progress"
        assert status.chunks_received == 3
        assert status.total_chunks == 5
        assert sorted(status.received_chunks) == [0, 2, 4]
        assert sorted(status.missing_chunks) == [1, 3]
        assert status.total_bytes == 10_000

    @pytest.mark.asyncio
    async def test_expired_session(self):
        """A missing session is reported as state "expired", not raised."""
        librarian = self._librarian_with_session(None)

        status = await librarian.get_upload_status(
            self._request(upload_id="up-expired")
        )

        assert status.upload_state == "expired"

    @pytest.mark.asyncio
    async def test_all_chunks_received(self):
        """With every chunk present nothing is missing and bytes are capped."""
        session = _make_session(
            total_chunks=3,
            chunk_size=1000,
            total_size=2500,
            chunks_received={0: "e1", 1: "e2", 2: "e3"},
        )
        librarian = self._librarian_with_session(session)

        status = await librarian.get_upload_status(self._request())

        assert status.missing_chunks == []
        assert status.chunks_received == 3
        # Three 1000-byte chunks would total 3000, above the 2500 total size.
        assert status.bytes_received <= 2500

    @pytest.mark.asyncio
    async def test_rejects_wrong_user(self):
        """Only the session owner may query the status."""
        librarian = self._librarian_with_session(_make_session(user="alice"))
        request = self._request(user="bob")

        with pytest.raises(RequestError, match="Not authorized"):
            await librarian.get_upload_status(request)
# ---------------------------------------------------------------------------
# stream_document
# ---------------------------------------------------------------------------
class TestStreamDocument:
    """Tests for the ``Librarian.stream_document`` async generator."""

    @staticmethod
    def _librarian(size, payload=None, min_chunk_size=1):
        """Return a mocked librarian serving object "obj-1" of *size* bytes.

        When *payload* is given, every ``get_range`` call returns it.
        """
        librarian = _make_librarian(min_chunk_size=min_chunk_size)
        librarian.table_store.get_document_object_id.return_value = "obj-1"
        librarian.blob_store.get_size = AsyncMock(return_value=size)
        if payload is not None:
            librarian.blob_store.get_range = AsyncMock(return_value=payload)
        return librarian

    @staticmethod
    def _request(chunk_size):
        """Return a mocked stream request for alice's "doc-1"."""
        request = MagicMock()
        request.user = "alice"
        request.document_id = "doc-1"
        request.chunk_size = chunk_size
        return request

    @staticmethod
    async def _drain(librarian, request):
        """Collect every response yielded by ``stream_document``."""
        return [response async for response in librarian.stream_document(request)]

    @pytest.mark.asyncio
    async def test_streams_chunks_with_progress(self):
        """5000 bytes at 2000 per chunk yields three indexed responses."""
        librarian = self._librarian(5000, payload=b"x" * 2000)

        responses = await self._drain(librarian, self._request(2000))

        assert len(responses) == 3
        first, last = responses[0], responses[-1]
        assert first.chunk_index == 0
        assert first.total_chunks == 3
        assert first.is_final is False
        assert last.is_final is True
        assert last.chunk_index == 2

    @pytest.mark.asyncio
    async def test_single_chunk_document(self):
        """A document smaller than the chunk size arrives as one final chunk."""
        librarian = self._librarian(500, payload=b"x" * 500)

        responses = await self._drain(librarian, self._request(2000))

        assert len(responses) == 1
        only = responses[0]
        assert only.is_final is True
        assert only.bytes_received == 500
        assert only.total_bytes == 500

    @pytest.mark.asyncio
    async def test_byte_ranges_correct(self):
        """get_range is called with (object id, offset, length) per chunk."""
        librarian = self._librarian(5000, payload=b"x" * 100)

        await self._drain(librarian, self._request(2000))

        range_calls = librarian.blob_store.get_range.call_args_list
        first, second, third = (call.args for call in range_calls[:3])
        assert first == ("obj-1", 0, 2000)
        assert second == ("obj-1", 2000, 2000)
        # The final range covers only the remaining 5000 - 4000 bytes.
        assert third == ("obj-1", 4000, 1000)

    @pytest.mark.asyncio
    async def test_default_chunk_size(self):
        """chunk_size=0 falls back to a default that splits 2,000,000 bytes in two."""
        librarian = self._librarian(2_000_000, payload=b"x")

        # The original test notes the expected default as 1MB.
        responses = await self._drain(librarian, self._request(0))

        assert len(responses) == 2

    @pytest.mark.asyncio
    async def test_content_is_base64_encoded(self):
        """Response content is the base64 encoding of the bytes read."""
        payload = b"hello world"
        librarian = self._librarian(100, payload=payload)

        responses = await self._drain(librarian, self._request(1000))

        assert responses[0].content == base64.b64encode(payload)

    @pytest.mark.asyncio
    async def test_rejects_chunk_below_minimum(self):
        """A chunk size under min_chunk_size raises once iteration starts."""
        librarian = self._librarian(5000, min_chunk_size=1024)
        request = self._request(512)

        with pytest.raises(RequestError, match="below minimum"):
            async for _ in librarian.stream_document(request):
                pass
# ---------------------------------------------------------------------------
# list_uploads
# ---------------------------------------------------------------------------
class TestListUploads:
    """Tests for ``Librarian.list_uploads``."""

    @staticmethod
    def _request(user="alice"):
        """Return a mocked list-uploads request for *user*."""
        request = MagicMock()
        request.user = user
        return request

    @pytest.mark.asyncio
    async def test_returns_sessions(self):
        """A stored session row is surfaced in the response."""
        stored_row = {
            "upload_id": "up-1",
            "document_id": "doc-1",
            "document_metadata": '{"id":"doc-1"}',
            "total_size": 10000,
            "chunk_size": 2000,
            "total_chunks": 5,
            "chunks_received": {0: "e1", 1: "e2"},
            "created_at": "2024-01-01",
        }
        librarian = _make_librarian()
        librarian.table_store.list_upload_sessions.return_value = [stored_row]

        response = await librarian.list_uploads(self._request())

        assert response.error is None
        assert len(response.upload_sessions) == 1
        listed = response.upload_sessions[0]
        assert listed.upload_id == "up-1"
        assert listed.total_chunks == 5

    @pytest.mark.asyncio
    async def test_empty_uploads(self):
        """No stored sessions yields an empty list."""
        librarian = _make_librarian()
        librarian.table_store.list_upload_sessions.return_value = []

        response = await librarian.list_uploads(self._request())

        assert response.upload_sessions == []