feat: unify handling of native and legacy document types for Google connectors

- Introduced a mapping of native Google document types to their legacy Composio equivalents, ensuring seamless search and indexing for both types.
- Updated relevant components to utilize the new mapping, enhancing the consistency of document type handling across the application.
- Improved search functionality to transparently include legacy types, maintaining accessibility for older documents until re-indexed.
This commit is contained in:
Anish Sarkar 2026-03-20 03:41:32 +05:30
parent aaf34800e6
commit d21593ee71
7 changed files with 104 additions and 34 deletions

View file

@@ -170,7 +170,34 @@ async def handle_existing_document_update(
logging.info(f"Document for file {filename} unchanged. Skipping.")
return True, existing_document
else:
# Content has changed - need to re-process
# Content has changed — guard against content_hash collision before
# expensive ETL processing. A collision means the exact same content
# already lives in a *different* document (e.g. a manual upload of the
# same file). Proceeding would trigger a unique-constraint violation
# on ix_documents_content_hash.
collision_doc = await check_duplicate_document(session, content_hash)
if collision_doc and collision_doc.id != existing_document.id:
logging.warning(
"Content-hash collision for %s: identical content exists in "
"document #%s (%s). Skipping re-processing.",
filename,
collision_doc.id,
collision_doc.document_type,
)
if DocumentStatus.is_state(
existing_document.status, DocumentStatus.PENDING
) or DocumentStatus.is_state(
existing_document.status, DocumentStatus.PROCESSING
):
# Pending/processing doc has no real content yet — remove it
# so the UI doesn't show a contentless entry.
await session.delete(existing_document)
await session.commit()
return True, None
# Document already has valid content — keep it as-is.
return True, existing_document
logging.info(f"Content changed for file {filename}. Updating document.")
return False, None
@@ -537,6 +564,12 @@ async def add_received_file_document_using_unstructured(
return document
except SQLAlchemyError as db_error:
await session.rollback()
if "ix_documents_content_hash" in str(db_error):
logging.warning(
"content_hash collision during commit for %s (Unstructured). Skipping.",
file_name,
)
return None
raise db_error
except Exception as e:
await session.rollback()
@@ -673,6 +706,12 @@ async def add_received_file_document_using_llamacloud(
return document
except SQLAlchemyError as db_error:
await session.rollback()
if "ix_documents_content_hash" in str(db_error):
logging.warning(
"content_hash collision during commit for %s (LlamaCloud). Skipping.",
file_name,
)
return None
raise db_error
except Exception as e:
await session.rollback()
@@ -828,6 +867,12 @@ async def add_received_file_document_using_docling(
return document
except SQLAlchemyError as db_error:
await session.rollback()
if "ix_documents_content_hash" in str(db_error):
logging.warning(
"content_hash collision during commit for %s (Docling). Skipping.",
file_name,
)
return None
raise db_error
except Exception as e:
await session.rollback()

View file

@@ -158,6 +158,28 @@ async def _handle_existing_document_update(
logging.info(f"Document for markdown file {filename} unchanged. Skipping.")
return True, existing_document
else:
# Content has changed — guard against content_hash collision (same
# content already lives in a different document).
collision_doc = await check_duplicate_document(session, content_hash)
if collision_doc and collision_doc.id != existing_document.id:
logging.warning(
"Content-hash collision for markdown %s: identical content "
"exists in document #%s (%s). Skipping re-processing.",
filename,
collision_doc.id,
collision_doc.document_type,
)
if DocumentStatus.is_state(
existing_document.status, DocumentStatus.PENDING
) or DocumentStatus.is_state(
existing_document.status, DocumentStatus.PROCESSING
):
await session.delete(existing_document)
await session.commit()
return True, None
return True, existing_document
logging.info(
f"Content changed for markdown file {filename}. Updating document."
)
@@ -312,6 +334,12 @@ async def add_received_markdown_file_document(
return document
except SQLAlchemyError as db_error:
await session.rollback()
if "ix_documents_content_hash" in str(db_error):
logging.warning(
"content_hash collision during commit for %s (markdown). Skipping.",
file_name,
)
return None
await task_logger.log_task_failure(
log_entry,
f"Database error processing markdown file: {file_name}",