mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 09:16:22 +02:00
fix: update file skipping logic in Google Drive indexer
- Modified the `_should_skip_file` function so that documents with a FAILED status are no longer skipped, ensuring they are reprocessed even if their content is unchanged.
- Added a new integration test to verify that FAILED documents are not skipped during indexing.
This commit is contained in:
parent
8c41fd91ba
commit
c3d5c865fd
2 changed files with 60 additions and 1 deletions
|
|
@@ -108,3 +108,62 @@ async def test_drive_legacy_doc_migrated(
|
|||
DocumentType.GOOGLE_DRIVE_FILE.value, file_id, space_id
|
||||
)
|
||||
assert row.unique_identifier_hash == native_hash
|
||||
|
||||
|
||||
async def test_should_skip_file_does_not_skip_failed_document(
    db_session,
    db_search_space,
    db_user,
):
    """Check that a FAILED Drive document is not skipped when its md5 matches.

    A document row is stored with a failed status and an ``md5_checksum``
    equal to the checksum of the incoming Drive file. ``_should_skip_file``
    must still report that the file should be processed, so the failed
    document gets another indexing attempt.
    """
    import importlib
    import sys
    import types

    # If the parent package has not been imported yet, register a bare
    # placeholder for it while loading the indexer submodule, and remove the
    # placeholder again afterwards.
    # NOTE(review): presumably this avoids executing the package's
    # ``__init__`` during the test — confirm.
    package_name = "app.tasks.connector_indexers"
    needs_placeholder = package_name not in sys.modules
    if needs_placeholder:
        placeholder = types.ModuleType(package_name)
        placeholder.__path__ = ["app/tasks/connector_indexers"]
        placeholder.__package__ = package_name
        sys.modules[package_name] = placeholder

    try:
        indexer_module = importlib.import_module(
            "app.tasks.connector_indexers.google_drive_indexer"
        )
        _should_skip_file = indexer_module._should_skip_file
    finally:
        if needs_placeholder:
            sys.modules.pop(package_name, None)

    search_space_id = db_search_space.id
    drive_file_id = "file-failed-drive"
    checksum = "abc123deadbeef"
    file_name = "Failed File.pdf"
    failure_reason = "LLM rate limit exceeded"

    identifier_hash = compute_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE.value, drive_file_id, search_space_id
    )

    # Stored metadata carries the same md5 as the incoming file, i.e. the
    # content is unchanged since the failed attempt.
    stored_metadata = {
        "google_drive_file_id": drive_file_id,
        "google_drive_file_name": file_name,
        "md5_checksum": checksum,
    }
    db_session.add(
        Document(
            title=file_name,
            document_type=DocumentType.GOOGLE_DRIVE_FILE,
            content=failure_reason,
            content_hash=f"ch-{identifier_hash[:12]}",
            unique_identifier_hash=identifier_hash,
            source_markdown="## Real content",
            search_space_id=search_space_id,
            created_by_id=str(db_user.id),
            embedding=[0.1] * _EMBEDDING_DIM,
            status=DocumentStatus.failed(failure_reason),
            document_metadata=stored_metadata,
        )
    )
    await db_session.flush()

    drive_file = {
        "id": drive_file_id,
        "name": file_name,
        "mimeType": "application/pdf",
        "md5Checksum": checksum,
    }

    skip, _reason = await _should_skip_file(
        db_session, drive_file, search_space_id
    )

    assert (
        not skip
    ), "FAILED documents must not be skipped even when content is unchanged"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue