feat(connectors): add connector parameter to file processor for source tracking

- Add optional 'connector' parameter with 'type' and 'metadata' fields
- Create helper function _update_document_from_connector
- Use document_metadata column (not metadata) for JSON field
- Merge connector metadata into the existing document metadata using dict unpacking (`{**existing, **new}`); connector values win on key conflicts
- Google Drive documents now marked as GOOGLE_DRIVE_CONNECTOR
- Backward compatible - no changes to existing logic
- Simple and clean implementation
This commit is contained in:
CREDO23 2025-12-28 18:01:39 +02:00
parent 8da58be9e0
commit a5935bc677
3 changed files with 60 additions and 71 deletions

View file

@ -447,6 +447,24 @@ async def add_received_file_document_using_docling(
) from e
async def _update_document_from_connector(
    document: Document | None, connector: dict | None, session: AsyncSession
) -> None:
    """Apply connector-supplied type and metadata to a document, then commit.

    Does nothing (and does not commit) when either ``document`` or
    ``connector`` is missing/falsy.

    Args:
        document: The document to update, or ``None`` if no document was
            produced.
        connector: Optional mapping with optional ``"type"`` and
            ``"metadata"`` keys. ``"type"`` replaces
            ``document.document_type``; ``"metadata"`` is merged into
            ``document.document_metadata`` with connector values winning on
            key conflicts.
        session: Session whose ``commit()`` is awaited after the update.
    """
    if not document or not connector:
        return

    if "type" in connector:
        document.document_type = connector["type"]

    # A present-but-None "metadata" value is treated as "nothing to merge";
    # unpacking None would raise TypeError and assigning it would wipe the
    # existing metadata.
    connector_metadata = connector.get("metadata")
    if connector_metadata is not None:
        # Always assign a freshly built dict (document_metadata is the actual
        # column name) so the document never aliases the caller's dict and the
        # attribute is rebound rather than mutated in place.
        document.document_metadata = {
            **(document.document_metadata or {}),
            **connector_metadata,
        }

    await session.commit()
async def process_file_in_background(
file_path: str,
filename: str,
@ -455,6 +473,7 @@ async def process_file_in_background(
session: AsyncSession,
task_logger: TaskLoggingService,
log_entry: Log,
connector: dict | None = None, # Optional: {"type": "GOOGLE_DRIVE_CONNECTOR", "metadata": {...}}
):
try:
# Check if the file is a markdown or text file
@ -492,6 +511,9 @@ async def process_file_in_background(
session, filename, markdown_content, search_space_id, user_id
)
# Update from connector if provided
await _update_document_from_connector(result, connector, session)
if result:
await task_logger.log_task_success(
log_entry,
@ -608,6 +630,9 @@ async def process_file_in_background(
session, filename, transcribed_text, search_space_id, user_id
)
# Update from connector if provided
await _update_document_from_connector(result, connector, session)
if result:
await task_logger.log_task_success(
log_entry,
@ -753,6 +778,9 @@ async def process_file_in_background(
session, filename, docs, search_space_id, user_id
)
# Update from connector if provided
await _update_document_from_connector(result, connector, session)
if result:
# Update page usage after successful processing
# allow_exceed=True because document was already created after passing initial check
@ -897,6 +925,9 @@ async def process_file_in_background(
user_id, final_page_count, allow_exceed=True
)
# Update from connector if provided
await _update_document_from_connector(last_created_doc, connector, session)
await task_logger.log_task_success(
log_entry,
f"Successfully processed file with LlamaCloud: {filename}",
@ -1021,6 +1052,9 @@ async def process_file_in_background(
user_id, final_page_count, allow_exceed=True
)
# Update from connector if provided
await _update_document_from_connector(doc_result, connector, session)
await task_logger.log_task_success(
log_entry,
f"Successfully processed file with Docling: {filename}",