test(google-drive): assert stuck pending/processing docs retry

This commit is contained in:
CREDO23 2026-06-10 00:11:00 +02:00
parent ba687813c1
commit 77544ab768

View file

@ -177,3 +177,75 @@ async def test_should_skip_file_skips_failed_document(
assert should_skip, "FAILED documents must be skipped during automatic sync"
assert "failed" in msg.lower()
@pytest.mark.parametrize("stuck_state", ["pending", "processing"])
async def test_should_skip_file_retries_stuck_document(
    db_session,
    db_search_space,
    db_user,
    stuck_state,
):
    """Verify that a document left in pending/processing is re-indexed, not skipped.

    Such a document represents an indexing run that never finished (e.g. the
    worker died mid-index). Even when the incoming Drive file carries the same
    id and md5 checksum as the stored document, ``_should_skip_file`` must
    report that the file should NOT be skipped.
    """
    import importlib
    import sys
    import types

    package_name = "app.tasks.connector_indexers"
    needs_placeholder = package_name not in sys.modules

    # Register an empty stand-in for the package so that importing the
    # submodule below does not execute the real package ``__init__``
    # (presumably to avoid its import-time dependencies -- confirm).
    # NOTE(review): ``__path__`` is a relative path, so submodule lookup
    # depends on the directory the test run is started from.
    if needs_placeholder:
        placeholder = types.ModuleType(package_name)
        placeholder.__path__ = ["app/tasks/connector_indexers"]
        placeholder.__package__ = package_name
        sys.modules[package_name] = placeholder
    try:
        indexer_module = importlib.import_module(
            "app.tasks.connector_indexers.google_drive_indexer"
        )
        _should_skip_file = indexer_module._should_skip_file
    finally:
        # Only remove the entry this test inserted; a package that was
        # already imported is left untouched.
        if needs_placeholder:
            sys.modules.pop(package_name, None)

    search_space_id = db_search_space.id
    drive_file_id = f"file-{stuck_state}-drive"
    checksum = "stuck123checksum"

    # The incoming file matches the stored document's id and checksum, so
    # the stuck status is the only thing that can force a retry.
    drive_file = {
        "id": drive_file_id,
        "name": "Stuck File.pdf",
        "mimeType": "application/pdf",
        "md5Checksum": checksum,
    }

    identifier_hash = compute_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE.value, drive_file_id, search_space_id
    )

    if stuck_state == "pending":
        stuck_status = DocumentStatus.pending()
    else:
        stuck_status = DocumentStatus.processing()

    metadata = {
        "google_drive_file_id": drive_file_id,
        "google_drive_file_name": "Stuck File.pdf",
        "md5_checksum": checksum,
    }
    existing_document = Document(
        title="Stuck File.pdf",
        document_type=DocumentType.GOOGLE_DRIVE_FILE,
        content="Pending...",
        content_hash=f"ch-{identifier_hash[:12]}",
        unique_identifier_hash=identifier_hash,
        source_markdown="",
        search_space_id=search_space_id,
        created_by_id=str(db_user.id),
        status=stuck_status,
        document_metadata=metadata,
    )
    db_session.add(existing_document)
    await db_session.flush()

    should_skip, _reason = await _should_skip_file(
        db_session, drive_file, search_space_id
    )

    assert not should_skip, f"{stuck_state} documents must re-index, not be skipped"