fix(google-drive): sanitize ETL reason and retry stuck pending/processing files

This commit is contained in:
CREDO23 2026-06-10 00:10:25 +02:00
parent 8699befaa0
commit c0c5f3414e

View file

@@ -35,6 +35,7 @@ from app.connectors.google_drive.file_types import (
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import compute_identifier_hash
from app.indexing_pipeline.exceptions import safe_exception_message
from app.indexing_pipeline.indexing_pipeline_service import (
IndexingPipelineService,
PlaceholderInfo,
@@ -368,7 +369,12 @@ async def _should_skip_file(
logger.info(f"Rename-only update: '{old_name}''{file_name}'")
return True, f"File renamed: '{old_name}''{file_name}'"
if not DocumentStatus.is_state(existing.status, DocumentStatus.READY):
state = DocumentStatus.get_state(existing.status)
if state in (DocumentStatus.PENDING, DocumentStatus.PROCESSING):
# Stuck placeholder/in-progress doc (e.g. worker died mid-index): re-index
# instead of skipping, otherwise it never recovers.
return False, None
if state != DocumentStatus.READY:
return True, "skipped (previously failed)"
return True, "unchanged"
@@ -510,7 +516,7 @@ async def _download_files_parallel(
continue
file_id = file.get("id")
if isinstance(outcome, Exception):
reason = f"Download/ETL error: {outcome}"
reason = f"Download/ETL error: {safe_exception_message(outcome)}"
logger.warning(
"Download/ETL exception for %s: %s",
file.get("name", "Unknown"),