fix: Improve document upload tests by adding assertions for document IDs and handling transient transport errors

This commit is contained in:
Anish Sarkar 2026-02-26 01:50:57 +05:30
parent 25df3dff64
commit d6e442b466
2 changed files with 38 additions and 26 deletions

View file

@ -353,16 +353,16 @@ class TestDuplicateContentDetection:
assert resp2.status_code == 200
second_ids = resp2.json()["document_ids"]
cleanup_doc_ids.extend(second_ids)
assert second_ids, (
"Expected at least one document id for renamed duplicate content upload"
)
if second_ids:
statuses = await poll_document_status(
client, headers, second_ids, search_space_id=search_space_id
)
for did in second_ids:
assert statuses[did]["status"]["state"] == "failed"
assert "duplicate" in (
statuses[did]["status"].get("reason", "").lower()
)
statuses = await poll_document_status(
client, headers, second_ids, search_space_id=search_space_id
)
for did in second_ids:
assert statuses[did]["status"]["state"] == "failed"
assert "duplicate" in statuses[did]["status"].get("reason", "").lower()
# ---------------------------------------------------------------------------
@ -387,16 +387,16 @@ class TestEmptyFileUpload:
doc_ids = resp.json()["document_ids"]
cleanup_doc_ids.extend(doc_ids)
assert doc_ids, "Expected at least one document id for empty PDF upload"
if doc_ids:
statuses = await poll_document_status(
client, headers, doc_ids, search_space_id=search_space_id, timeout=120.0
statuses = await poll_document_status(
client, headers, doc_ids, search_space_id=search_space_id, timeout=120.0
)
for did in doc_ids:
assert statuses[did]["status"]["state"] == "failed"
assert statuses[did]["status"].get("reason"), (
"Failed document should include a reason"
)
for did in doc_ids:
assert statuses[did]["status"]["state"] == "failed"
assert statuses[did]["status"].get("reason"), (
"Failed document should include a reason"
)
# ---------------------------------------------------------------------------

View file

@ -124,20 +124,31 @@ async def poll_document_status(
terminal state (``ready`` or ``failed``) or *timeout* seconds elapse.
Returns a mapping of ``{document_id: status_item_dict}``.
Retries on transient transport errors until timeout.
"""
ids_param = ",".join(str(d) for d in document_ids)
terminal_states = {"ready", "failed"}
elapsed = 0.0
items: dict[int, dict] = {}
last_transport_error: Exception | None = None
while elapsed < timeout:
resp = await client.get(
"/api/v1/documents/status",
headers=headers,
params={
"search_space_id": search_space_id,
"document_ids": ids_param,
},
)
try:
resp = await client.get(
"/api/v1/documents/status",
headers=headers,
params={
"search_space_id": search_space_id,
"document_ids": ids_param,
},
)
except (httpx.ReadError, httpx.ConnectError, httpx.TimeoutException) as exc:
last_transport_error = exc
await asyncio.sleep(interval)
elapsed += interval
continue
assert resp.status_code == 200, (
f"Status poll failed ({resp.status_code}): {resp.text}"
)
@ -154,7 +165,8 @@ async def poll_document_status(
raise TimeoutError(
f"Documents {document_ids} did not reach terminal state within {timeout}s. "
f"Last status: {items}"
f"Last status: {items}. "
f"Last transport error: {last_transport_error!r}"
)