mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-17 18:35:19 +02:00
feat: enhance Google Drive document handling and UI integration
- Implemented support for both new file_id-based and legacy filename-based hash schemes in document processing. - Added functions to generate unique identifier hashes and find existing documents with migration support. - Improved existing document update logic to handle content changes and metadata updates, particularly for Google Drive files. - Enhanced UI components to display appropriate file icons based on file types in the Google Drive connector. - Updated document processing functions to accommodate the new connector structure and ensure seamless integration.
This commit is contained in:
parent
7af3d1bc1a
commit
6550c378b2
5 changed files with 397 additions and 104 deletions
|
|
@ -596,7 +596,14 @@ async def _process_single_file(
|
||||||
|
|
||||||
|
|
||||||
async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
|
async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
|
||||||
"""Remove a document that was deleted in Drive."""
|
"""Remove a document that was deleted in Drive.
|
||||||
|
|
||||||
|
Handles both new (file_id-based) and legacy (filename-based) hash schemes.
|
||||||
|
"""
|
||||||
|
from sqlalchemy import select
|
||||||
|
from app.db import Document
|
||||||
|
|
||||||
|
# First try with file_id-based hash (new method)
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
unique_identifier_hash = generate_unique_identifier_hash(
|
||||||
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||||
)
|
)
|
||||||
|
|
@ -605,6 +612,19 @@ async def _remove_document(session: AsyncSession, file_id: str, search_space_id:
|
||||||
session, unique_identifier_hash
|
session, unique_identifier_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# If not found, search by metadata (for legacy documents with filename-based hash)
|
||||||
|
if not existing_document:
|
||||||
|
result = await session.execute(
|
||||||
|
select(Document).where(
|
||||||
|
Document.search_space_id == search_space_id,
|
||||||
|
Document.document_type == DocumentType.GOOGLE_DRIVE_FILE,
|
||||||
|
Document.document_metadata["google_drive_file_id"].astext == file_id
|
||||||
|
)
|
||||||
|
)
|
||||||
|
existing_document = result.scalar_one_or_none()
|
||||||
|
if existing_document:
|
||||||
|
logger.info(f"Found legacy document by metadata for file_id: {file_id}")
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
await session.delete(existing_document)
|
await session.delete(existing_document)
|
||||||
logger.info(f"Removed deleted file document: {file_id}")
|
logger.info(f"Removed deleted file document: {file_id}")
|
||||||
|
|
|
||||||
|
|
@ -49,6 +49,131 @@ LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_google_drive_unique_identifier(
|
||||||
|
connector: dict | None,
|
||||||
|
filename: str,
|
||||||
|
search_space_id: int,
|
||||||
|
) -> tuple[str, str | None]:
|
||||||
|
"""
|
||||||
|
Get unique identifier hash for a file, with special handling for Google Drive.
|
||||||
|
|
||||||
|
For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
|
||||||
|
For other files, uses filename.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
connector: Optional connector info dict with type and metadata
|
||||||
|
filename: The filename (used for non-Google Drive files or as fallback)
|
||||||
|
search_space_id: The search space ID
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (primary_hash, legacy_hash or None)
|
||||||
|
- For Google Drive: (file_id_based_hash, filename_based_hash for migration)
|
||||||
|
- For other sources: (filename_based_hash, None)
|
||||||
|
"""
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
metadata = connector.get("metadata", {})
|
||||||
|
file_id = metadata.get("google_drive_file_id")
|
||||||
|
|
||||||
|
if file_id:
|
||||||
|
# New method: use file_id as unique identifier (doesn't change on rename)
|
||||||
|
primary_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||||
|
)
|
||||||
|
# Legacy method: for backward compatibility with existing documents
|
||||||
|
# that were indexed with filename-based hash
|
||||||
|
legacy_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
|
||||||
|
)
|
||||||
|
return primary_hash, legacy_hash
|
||||||
|
|
||||||
|
# For non-Google Drive files, use filename as before
|
||||||
|
primary_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.FILE, filename, search_space_id
|
||||||
|
)
|
||||||
|
return primary_hash, None
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_existing_document_update(
|
||||||
|
session: AsyncSession,
|
||||||
|
existing_document: Document,
|
||||||
|
content_hash: str,
|
||||||
|
connector: dict | None,
|
||||||
|
filename: str,
|
||||||
|
primary_hash: str,
|
||||||
|
) -> tuple[bool, Document | None]:
|
||||||
|
"""
|
||||||
|
Handle update logic for an existing document.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
existing_document: The existing document found in database
|
||||||
|
content_hash: Hash of the new content
|
||||||
|
connector: Optional connector info
|
||||||
|
filename: Current filename
|
||||||
|
primary_hash: The primary hash (file_id based for Google Drive)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (should_skip_processing, document_to_return)
|
||||||
|
- (True, document): Content unchanged, just return existing document
|
||||||
|
- (False, None): Content changed, need to re-process
|
||||||
|
"""
|
||||||
|
# Check if this document needs hash migration (found via legacy hash)
|
||||||
|
if existing_document.unique_identifier_hash != primary_hash:
|
||||||
|
existing_document.unique_identifier_hash = primary_hash
|
||||||
|
logging.info(f"Migrated document to file_id-based identifier: {filename}")
|
||||||
|
|
||||||
|
# Check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
# Content unchanged - check if we need to update metadata (e.g., filename changed)
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
metadata = connector.get("metadata", {})
|
||||||
|
new_name = metadata.get("google_drive_file_name")
|
||||||
|
old_name = (existing_document.document_metadata or {}).get("google_drive_file_name")
|
||||||
|
|
||||||
|
if new_name and old_name != new_name:
|
||||||
|
# File was renamed - update metadata only, skip expensive processing
|
||||||
|
if not existing_document.document_metadata:
|
||||||
|
existing_document.document_metadata = {}
|
||||||
|
existing_document.document_metadata["google_drive_file_name"] = new_name
|
||||||
|
await session.commit()
|
||||||
|
logging.info(f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)")
|
||||||
|
|
||||||
|
logging.info(f"Document for file {filename} unchanged. Skipping.")
|
||||||
|
return True, existing_document
|
||||||
|
else:
|
||||||
|
# Content has changed - need to re-process
|
||||||
|
logging.info(f"Content changed for file {filename}. Updating document.")
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
|
||||||
|
async def find_existing_document_with_migration(
|
||||||
|
session: AsyncSession,
|
||||||
|
primary_hash: str,
|
||||||
|
legacy_hash: str | None,
|
||||||
|
) -> Document | None:
|
||||||
|
"""
|
||||||
|
Find existing document, checking both new hash and legacy hash for migration.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
session: Database session
|
||||||
|
primary_hash: The primary hash (file_id based for Google Drive)
|
||||||
|
legacy_hash: The legacy hash (filename based) for migration, or None
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Existing document if found, None otherwise
|
||||||
|
"""
|
||||||
|
# First check with primary hash (new method)
|
||||||
|
existing_document = await check_document_by_unique_identifier(session, primary_hash)
|
||||||
|
|
||||||
|
# If not found and we have a legacy hash, check with that (migration path)
|
||||||
|
if not existing_document and legacy_hash:
|
||||||
|
existing_document = await check_document_by_unique_identifier(session, legacy_hash)
|
||||||
|
if existing_document:
|
||||||
|
logging.info("Found legacy document (filename-based hash), will migrate to file_id-based hash")
|
||||||
|
|
||||||
|
return existing_document
|
||||||
|
|
||||||
|
|
||||||
async def parse_with_llamacloud_retry(
|
async def parse_with_llamacloud_retry(
|
||||||
file_path: str,
|
file_path: str,
|
||||||
estimated_pages: int,
|
estimated_pages: int,
|
||||||
|
|
@ -158,6 +283,7 @@ async def add_received_file_document_using_unstructured(
|
||||||
unstructured_processed_elements: list[LangChainDocument],
|
unstructured_processed_elements: list[LangChainDocument],
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
|
connector: dict | None = None,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
"""
|
"""
|
||||||
Process and store a file document using Unstructured service.
|
Process and store a file document using Unstructured service.
|
||||||
|
|
@ -168,6 +294,7 @@ async def add_received_file_document_using_unstructured(
|
||||||
unstructured_processed_elements: Processed elements from Unstructured
|
unstructured_processed_elements: Processed elements from Unstructured
|
||||||
search_space_id: ID of the search space
|
search_space_id: ID of the search space
|
||||||
user_id: ID of the user
|
user_id: ID of the user
|
||||||
|
connector: Optional connector info for Google Drive files
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Document object if successful, None if failed
|
Document object if successful, None if failed
|
||||||
|
|
@ -177,29 +304,27 @@ async def add_received_file_document_using_unstructured(
|
||||||
unstructured_processed_elements
|
unstructured_processed_elements
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate unique identifier hash for this file
|
# Generate unique identifier hash (uses file_id for Google Drive, filename for others)
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
primary_hash, legacy_hash = get_google_drive_unique_identifier(
|
||||||
DocumentType.FILE, file_name, search_space_id
|
connector, file_name, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this unique identifier already exists
|
# Check if document exists (with migration support for Google Drive)
|
||||||
existing_document = await check_document_by_unique_identifier(
|
existing_document = await find_existing_document_with_migration(
|
||||||
session, unique_identifier_hash
|
session, primary_hash, legacy_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
# Document exists - check if content has changed
|
# Handle existing document (rename detection, content change check)
|
||||||
if existing_document.content_hash == content_hash:
|
should_skip, doc = await handle_existing_document_update(
|
||||||
logging.info(f"Document for file {file_name} unchanged. Skipping.")
|
session, existing_document, content_hash, connector, file_name, primary_hash
|
||||||
return existing_document
|
)
|
||||||
else:
|
if should_skip:
|
||||||
# Content has changed - update the existing document
|
return doc
|
||||||
logging.info(
|
# Content changed - continue to update
|
||||||
f"Content changed for file {file_name}. Updating document."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get user's long context LLM (needed for both create and update)
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
|
|
@ -251,10 +376,15 @@ async def add_received_file_document_using_unstructured(
|
||||||
document = existing_document
|
document = existing_document
|
||||||
else:
|
else:
|
||||||
# Create new document
|
# Create new document
|
||||||
|
# Determine document type based on connector
|
||||||
|
doc_type = DocumentType.FILE
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||||
|
|
||||||
document = Document(
|
document = Document(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
title=file_name,
|
title=file_name,
|
||||||
document_type=DocumentType.FILE,
|
document_type=doc_type,
|
||||||
document_metadata={
|
document_metadata={
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
"ETL_SERVICE": "UNSTRUCTURED",
|
"ETL_SERVICE": "UNSTRUCTURED",
|
||||||
|
|
@ -263,7 +393,7 @@ async def add_received_file_document_using_unstructured(
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
unique_identifier_hash=primary_hash,
|
||||||
blocknote_document=blocknote_json,
|
blocknote_document=blocknote_json,
|
||||||
content_needs_reindexing=False,
|
content_needs_reindexing=False,
|
||||||
updated_at=get_current_timestamp(),
|
updated_at=get_current_timestamp(),
|
||||||
|
|
@ -288,6 +418,7 @@ async def add_received_file_document_using_llamacloud(
|
||||||
llamacloud_markdown_document: str,
|
llamacloud_markdown_document: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
|
connector: dict | None = None,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
"""
|
"""
|
||||||
Process and store document content parsed by LlamaCloud.
|
Process and store document content parsed by LlamaCloud.
|
||||||
|
|
@ -298,6 +429,7 @@ async def add_received_file_document_using_llamacloud(
|
||||||
llamacloud_markdown_document: Markdown content from LlamaCloud parsing
|
llamacloud_markdown_document: Markdown content from LlamaCloud parsing
|
||||||
search_space_id: ID of the search space
|
search_space_id: ID of the search space
|
||||||
user_id: ID of the user
|
user_id: ID of the user
|
||||||
|
connector: Optional connector info for Google Drive files
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Document object if successful, None if failed
|
Document object if successful, None if failed
|
||||||
|
|
@ -306,29 +438,27 @@ async def add_received_file_document_using_llamacloud(
|
||||||
# Combine all markdown documents into one
|
# Combine all markdown documents into one
|
||||||
file_in_markdown = llamacloud_markdown_document
|
file_in_markdown = llamacloud_markdown_document
|
||||||
|
|
||||||
# Generate unique identifier hash for this file
|
# Generate unique identifier hash (uses file_id for Google Drive, filename for others)
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
primary_hash, legacy_hash = get_google_drive_unique_identifier(
|
||||||
DocumentType.FILE, file_name, search_space_id
|
connector, file_name, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this unique identifier already exists
|
# Check if document exists (with migration support for Google Drive)
|
||||||
existing_document = await check_document_by_unique_identifier(
|
existing_document = await find_existing_document_with_migration(
|
||||||
session, unique_identifier_hash
|
session, primary_hash, legacy_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
# Document exists - check if content has changed
|
# Handle existing document (rename detection, content change check)
|
||||||
if existing_document.content_hash == content_hash:
|
should_skip, doc = await handle_existing_document_update(
|
||||||
logging.info(f"Document for file {file_name} unchanged. Skipping.")
|
session, existing_document, content_hash, connector, file_name, primary_hash
|
||||||
return existing_document
|
)
|
||||||
else:
|
if should_skip:
|
||||||
# Content has changed - update the existing document
|
return doc
|
||||||
logging.info(
|
# Content changed - continue to update
|
||||||
f"Content changed for file {file_name}. Updating document."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get user's long context LLM (needed for both create and update)
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
|
|
@ -380,10 +510,15 @@ async def add_received_file_document_using_llamacloud(
|
||||||
document = existing_document
|
document = existing_document
|
||||||
else:
|
else:
|
||||||
# Create new document
|
# Create new document
|
||||||
|
# Determine document type based on connector
|
||||||
|
doc_type = DocumentType.FILE
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||||
|
|
||||||
document = Document(
|
document = Document(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
title=file_name,
|
title=file_name,
|
||||||
document_type=DocumentType.FILE,
|
document_type=doc_type,
|
||||||
document_metadata={
|
document_metadata={
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
"ETL_SERVICE": "LLAMACLOUD",
|
"ETL_SERVICE": "LLAMACLOUD",
|
||||||
|
|
@ -392,7 +527,7 @@ async def add_received_file_document_using_llamacloud(
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
unique_identifier_hash=primary_hash,
|
||||||
blocknote_document=blocknote_json,
|
blocknote_document=blocknote_json,
|
||||||
content_needs_reindexing=False,
|
content_needs_reindexing=False,
|
||||||
updated_at=get_current_timestamp(),
|
updated_at=get_current_timestamp(),
|
||||||
|
|
@ -419,6 +554,7 @@ async def add_received_file_document_using_docling(
|
||||||
docling_markdown_document: str,
|
docling_markdown_document: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
|
connector: dict | None = None,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
"""
|
"""
|
||||||
Process and store document content parsed by Docling.
|
Process and store document content parsed by Docling.
|
||||||
|
|
@ -429,6 +565,7 @@ async def add_received_file_document_using_docling(
|
||||||
docling_markdown_document: Markdown content from Docling parsing
|
docling_markdown_document: Markdown content from Docling parsing
|
||||||
search_space_id: ID of the search space
|
search_space_id: ID of the search space
|
||||||
user_id: ID of the user
|
user_id: ID of the user
|
||||||
|
connector: Optional connector info for Google Drive files
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Document object if successful, None if failed
|
Document object if successful, None if failed
|
||||||
|
|
@ -436,29 +573,27 @@ async def add_received_file_document_using_docling(
|
||||||
try:
|
try:
|
||||||
file_in_markdown = docling_markdown_document
|
file_in_markdown = docling_markdown_document
|
||||||
|
|
||||||
# Generate unique identifier hash for this file
|
# Generate unique identifier hash (uses file_id for Google Drive, filename for others)
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
primary_hash, legacy_hash = get_google_drive_unique_identifier(
|
||||||
DocumentType.FILE, file_name, search_space_id
|
connector, file_name, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this unique identifier already exists
|
# Check if document exists (with migration support for Google Drive)
|
||||||
existing_document = await check_document_by_unique_identifier(
|
existing_document = await find_existing_document_with_migration(
|
||||||
session, unique_identifier_hash
|
session, primary_hash, legacy_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
# Document exists - check if content has changed
|
# Handle existing document (rename detection, content change check)
|
||||||
if existing_document.content_hash == content_hash:
|
should_skip, doc = await handle_existing_document_update(
|
||||||
logging.info(f"Document for file {file_name} unchanged. Skipping.")
|
session, existing_document, content_hash, connector, file_name, primary_hash
|
||||||
return existing_document
|
)
|
||||||
else:
|
if should_skip:
|
||||||
# Content has changed - update the existing document
|
return doc
|
||||||
logging.info(
|
# Content changed - continue to update
|
||||||
f"Content changed for file {file_name}. Updating document."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get user's long context LLM (needed for both create and update)
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
|
|
@ -534,10 +669,15 @@ async def add_received_file_document_using_docling(
|
||||||
document = existing_document
|
document = existing_document
|
||||||
else:
|
else:
|
||||||
# Create new document
|
# Create new document
|
||||||
|
# Determine document type based on connector
|
||||||
|
doc_type = DocumentType.FILE
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||||
|
|
||||||
document = Document(
|
document = Document(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
title=file_name,
|
title=file_name,
|
||||||
document_type=DocumentType.FILE,
|
document_type=doc_type,
|
||||||
document_metadata={
|
document_metadata={
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
"ETL_SERVICE": "DOCLING",
|
"ETL_SERVICE": "DOCLING",
|
||||||
|
|
@ -546,15 +686,15 @@ async def add_received_file_document_using_docling(
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
unique_identifier_hash=primary_hash,
|
||||||
blocknote_document=blocknote_json,
|
blocknote_document=blocknote_json,
|
||||||
content_needs_reindexing=False,
|
content_needs_reindexing=False,
|
||||||
updated_at=get_current_timestamp(),
|
updated_at=get_current_timestamp(),
|
||||||
)
|
)
|
||||||
|
|
||||||
session.add(document)
|
session.add(document)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
await session.refresh(document)
|
await session.refresh(document)
|
||||||
|
|
||||||
return document
|
return document
|
||||||
except SQLAlchemyError as db_error:
|
except SQLAlchemyError as db_error:
|
||||||
|
|
@ -650,7 +790,7 @@ async def process_file_in_background(
|
||||||
|
|
||||||
# Process markdown directly through specialized function
|
# Process markdown directly through specialized function
|
||||||
result = await add_received_markdown_file_document(
|
result = await add_received_markdown_file_document(
|
||||||
session, filename, markdown_content, search_space_id, user_id
|
session, filename, markdown_content, search_space_id, user_id, connector
|
||||||
)
|
)
|
||||||
|
|
||||||
if connector:
|
if connector:
|
||||||
|
|
@ -790,7 +930,7 @@ async def process_file_in_background(
|
||||||
|
|
||||||
# Process transcription as markdown document
|
# Process transcription as markdown document
|
||||||
result = await add_received_markdown_file_document(
|
result = await add_received_markdown_file_document(
|
||||||
session, filename, transcribed_text, search_space_id, user_id
|
session, filename, transcribed_text, search_space_id, user_id, connector
|
||||||
)
|
)
|
||||||
|
|
||||||
if connector:
|
if connector:
|
||||||
|
|
@ -955,7 +1095,7 @@ async def process_file_in_background(
|
||||||
|
|
||||||
# Pass the documents to the existing background task
|
# Pass the documents to the existing background task
|
||||||
result = await add_received_file_document_using_unstructured(
|
result = await add_received_file_document_using_unstructured(
|
||||||
session, filename, docs, search_space_id, user_id
|
session, filename, docs, search_space_id, user_id, connector
|
||||||
)
|
)
|
||||||
|
|
||||||
if connector:
|
if connector:
|
||||||
|
|
@ -1103,6 +1243,7 @@ async def process_file_in_background(
|
||||||
llamacloud_markdown_document=markdown_content,
|
llamacloud_markdown_document=markdown_content,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
|
connector=connector,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Track if this document was successfully created
|
# Track if this document was successfully created
|
||||||
|
|
@ -1256,6 +1397,7 @@ async def process_file_in_background(
|
||||||
docling_markdown_document=result["content"],
|
docling_markdown_document=result["content"],
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
|
connector=connector,
|
||||||
)
|
)
|
||||||
|
|
||||||
if doc_result:
|
if doc_result:
|
||||||
|
|
|
||||||
|
|
@ -23,12 +23,118 @@ from .base import (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_google_drive_unique_identifier(
|
||||||
|
connector: dict | None,
|
||||||
|
filename: str,
|
||||||
|
search_space_id: int,
|
||||||
|
) -> tuple[str, str | None]:
|
||||||
|
"""
|
||||||
|
Get unique identifier hash for a file, with special handling for Google Drive.
|
||||||
|
|
||||||
|
For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
|
||||||
|
For other files, uses filename.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
connector: Optional connector info dict with type and metadata
|
||||||
|
filename: The filename (used for non-Google Drive files or as fallback)
|
||||||
|
search_space_id: The search space ID
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (primary_hash, legacy_hash or None)
|
||||||
|
"""
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
metadata = connector.get("metadata", {})
|
||||||
|
file_id = metadata.get("google_drive_file_id")
|
||||||
|
|
||||||
|
if file_id:
|
||||||
|
primary_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||||
|
)
|
||||||
|
legacy_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
|
||||||
|
)
|
||||||
|
return primary_hash, legacy_hash
|
||||||
|
|
||||||
|
primary_hash = generate_unique_identifier_hash(
|
||||||
|
DocumentType.FILE, filename, search_space_id
|
||||||
|
)
|
||||||
|
return primary_hash, None
|
||||||
|
|
||||||
|
|
||||||
|
async def _find_existing_document_with_migration(
|
||||||
|
session: AsyncSession,
|
||||||
|
primary_hash: str,
|
||||||
|
legacy_hash: str | None,
|
||||||
|
) -> Document | None:
|
||||||
|
"""Find existing document, checking both new hash and legacy hash for migration."""
|
||||||
|
existing_document = await check_document_by_unique_identifier(session, primary_hash)
|
||||||
|
|
||||||
|
if not existing_document and legacy_hash:
|
||||||
|
existing_document = await check_document_by_unique_identifier(session, legacy_hash)
|
||||||
|
if existing_document:
|
||||||
|
logging.info("Found legacy document (filename-based hash), will migrate to file_id-based hash")
|
||||||
|
|
||||||
|
return existing_document
|
||||||
|
|
||||||
|
|
||||||
|
async def _handle_existing_document_update(
|
||||||
|
session: AsyncSession,
|
||||||
|
existing_document: Document,
|
||||||
|
content_hash: str,
|
||||||
|
connector: dict | None,
|
||||||
|
filename: str,
|
||||||
|
primary_hash: str,
|
||||||
|
task_logger: TaskLoggingService,
|
||||||
|
log_entry,
|
||||||
|
) -> tuple[bool, Document | None]:
|
||||||
|
"""
|
||||||
|
Handle update logic for an existing document.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (should_skip_processing, document_to_return)
|
||||||
|
"""
|
||||||
|
# Check if this document needs hash migration
|
||||||
|
if existing_document.unique_identifier_hash != primary_hash:
|
||||||
|
existing_document.unique_identifier_hash = primary_hash
|
||||||
|
logging.info(f"Migrated document to file_id-based identifier: {filename}")
|
||||||
|
|
||||||
|
# Check if content has changed
|
||||||
|
if existing_document.content_hash == content_hash:
|
||||||
|
# Content unchanged - check if we need to update metadata (e.g., filename changed)
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
metadata = connector.get("metadata", {})
|
||||||
|
new_name = metadata.get("google_drive_file_name")
|
||||||
|
old_name = (existing_document.document_metadata or {}).get("google_drive_file_name")
|
||||||
|
|
||||||
|
if new_name and old_name != new_name:
|
||||||
|
if not existing_document.document_metadata:
|
||||||
|
existing_document.document_metadata = {}
|
||||||
|
existing_document.document_metadata["google_drive_file_name"] = new_name
|
||||||
|
await session.commit()
|
||||||
|
logging.info(f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)")
|
||||||
|
|
||||||
|
await task_logger.log_task_success(
|
||||||
|
log_entry,
|
||||||
|
f"Markdown file document unchanged: {filename}",
|
||||||
|
{
|
||||||
|
"duplicate_detected": True,
|
||||||
|
"existing_document_id": existing_document.id,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
logging.info(f"Document for markdown file {filename} unchanged. Skipping.")
|
||||||
|
return True, existing_document
|
||||||
|
else:
|
||||||
|
logging.info(f"Content changed for markdown file {filename}. Updating document.")
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
|
||||||
async def add_received_markdown_file_document(
|
async def add_received_markdown_file_document(
|
||||||
session: AsyncSession,
|
session: AsyncSession,
|
||||||
file_name: str,
|
file_name: str,
|
||||||
file_in_markdown: str,
|
file_in_markdown: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
|
connector: dict | None = None,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
"""
|
"""
|
||||||
Process and store a markdown file document.
|
Process and store a markdown file document.
|
||||||
|
|
@ -39,6 +145,7 @@ async def add_received_markdown_file_document(
|
||||||
file_in_markdown: Content of the markdown file
|
file_in_markdown: Content of the markdown file
|
||||||
search_space_id: ID of the search space
|
search_space_id: ID of the search space
|
||||||
user_id: ID of the user
|
user_id: ID of the user
|
||||||
|
connector: Optional connector info for Google Drive files
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Document object if successful, None if failed
|
Document object if successful, None if failed
|
||||||
|
|
@ -58,39 +165,28 @@ async def add_received_markdown_file_document(
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Generate unique identifier hash for this markdown file
|
# Generate unique identifier hash (uses file_id for Google Drive, filename for others)
|
||||||
unique_identifier_hash = generate_unique_identifier_hash(
|
primary_hash, legacy_hash = _get_google_drive_unique_identifier(
|
||||||
DocumentType.FILE, file_name, search_space_id
|
connector, file_name, search_space_id
|
||||||
)
|
)
|
||||||
|
|
||||||
# Generate content hash
|
# Generate content hash
|
||||||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||||
|
|
||||||
# Check if document with this unique identifier already exists
|
# Check if document exists (with migration support for Google Drive)
|
||||||
existing_document = await check_document_by_unique_identifier(
|
existing_document = await _find_existing_document_with_migration(
|
||||||
session, unique_identifier_hash
|
session, primary_hash, legacy_hash
|
||||||
)
|
)
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
# Document exists - check if content has changed
|
# Handle existing document (rename detection, content change check)
|
||||||
if existing_document.content_hash == content_hash:
|
should_skip, doc = await _handle_existing_document_update(
|
||||||
await task_logger.log_task_success(
|
session, existing_document, content_hash, connector, file_name, primary_hash,
|
||||||
log_entry,
|
task_logger, log_entry
|
||||||
f"Markdown file document unchanged: {file_name}",
|
)
|
||||||
{
|
if should_skip:
|
||||||
"duplicate_detected": True,
|
return doc
|
||||||
"existing_document_id": existing_document.id,
|
# Content changed - continue to update
|
||||||
},
|
|
||||||
)
|
|
||||||
logging.info(
|
|
||||||
f"Document for markdown file {file_name} unchanged. Skipping."
|
|
||||||
)
|
|
||||||
return existing_document
|
|
||||||
else:
|
|
||||||
# Content has changed - update the existing document
|
|
||||||
logging.info(
|
|
||||||
f"Content changed for markdown file {file_name}. Updating document."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get user's long context LLM (needed for both create and update)
|
# Get user's long context LLM (needed for both create and update)
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
|
|
@ -139,10 +235,15 @@ async def add_received_markdown_file_document(
|
||||||
document = existing_document
|
document = existing_document
|
||||||
else:
|
else:
|
||||||
# Create new document
|
# Create new document
|
||||||
|
# Determine document type based on connector
|
||||||
|
doc_type = DocumentType.FILE
|
||||||
|
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||||
|
doc_type = DocumentType.GOOGLE_DRIVE_FILE
|
||||||
|
|
||||||
document = Document(
|
document = Document(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
title=file_name,
|
title=file_name,
|
||||||
document_type=DocumentType.FILE,
|
document_type=doc_type,
|
||||||
document_metadata={
|
document_metadata={
|
||||||
"FILE_NAME": file_name,
|
"FILE_NAME": file_name,
|
||||||
},
|
},
|
||||||
|
|
@ -150,7 +251,7 @@ async def add_received_markdown_file_document(
|
||||||
embedding=summary_embedding,
|
embedding=summary_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
unique_identifier_hash=unique_identifier_hash,
|
unique_identifier_hash=primary_hash,
|
||||||
blocknote_document=blocknote_json,
|
blocknote_document=blocknote_json,
|
||||||
updated_at=get_current_timestamp(),
|
updated_at=get_current_timestamp(),
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,9 @@
|
||||||
"use client";
|
"use client";
|
||||||
|
|
||||||
import { Info } from "lucide-react";
|
import { File, FileText, FileSpreadsheet, FolderClosed, Image, Presentation } from "lucide-react";
|
||||||
import type { FC } from "react";
|
import type { FC } from "react";
|
||||||
import { useEffect, useState } from "react";
|
import { useEffect, useState } from "react";
|
||||||
import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree";
|
import { GoogleDriveFolderTree } from "@/components/connectors/google-drive-folder-tree";
|
||||||
import { Alert, AlertDescription } from "@/components/ui/alert";
|
|
||||||
import { Button } from "@/components/ui/button";
|
import { Button } from "@/components/ui/button";
|
||||||
import { Label } from "@/components/ui/label";
|
import { Label } from "@/components/ui/label";
|
||||||
import {
|
import {
|
||||||
|
|
@ -34,6 +33,29 @@ const DEFAULT_INDEXING_OPTIONS: IndexingOptions = {
|
||||||
include_subfolders: true,
|
include_subfolders: true,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Helper to get appropriate icon for file type based on file name
|
||||||
|
function getFileIconFromName(fileName: string, className: string = "size-3.5 shrink-0") {
|
||||||
|
const lowerName = fileName.toLowerCase();
|
||||||
|
// Spreadsheets
|
||||||
|
if (lowerName.endsWith(".xlsx") || lowerName.endsWith(".xls") || lowerName.endsWith(".csv") || lowerName.includes("spreadsheet")) {
|
||||||
|
return <FileSpreadsheet className={`${className} text-green-500`} />;
|
||||||
|
}
|
||||||
|
// Presentations
|
||||||
|
if (lowerName.endsWith(".pptx") || lowerName.endsWith(".ppt") || lowerName.includes("presentation")) {
|
||||||
|
return <Presentation className={`${className} text-orange-500`} />;
|
||||||
|
}
|
||||||
|
// Documents (word, text only - not PDF)
|
||||||
|
if (lowerName.endsWith(".docx") || lowerName.endsWith(".doc") || lowerName.endsWith(".txt") || lowerName.includes("document") || lowerName.includes("word") || lowerName.includes("text")) {
|
||||||
|
return <FileText className={`${className} text-gray-500`} />;
|
||||||
|
}
|
||||||
|
// Images
|
||||||
|
if (lowerName.endsWith(".png") || lowerName.endsWith(".jpg") || lowerName.endsWith(".jpeg") || lowerName.endsWith(".gif") || lowerName.endsWith(".webp") || lowerName.endsWith(".svg")) {
|
||||||
|
return <Image className={`${className} text-purple-500`} />;
|
||||||
|
}
|
||||||
|
// Default (including PDF)
|
||||||
|
return <File className={`${className} text-gray-500`} />;
|
||||||
|
}
|
||||||
|
|
||||||
export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfigChange }) => {
|
export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfigChange }) => {
|
||||||
// Initialize with existing selected folders and files from connector config
|
// Initialize with existing selected folders and files from connector config
|
||||||
const existingFolders =
|
const existingFolders =
|
||||||
|
|
@ -103,29 +125,37 @@ export const GoogleDriveConfig: FC<ConnectorConfigProps> = ({ connector, onConfi
|
||||||
{totalSelected > 0 && (
|
{totalSelected > 0 && (
|
||||||
<div className="p-2 sm:p-3 bg-muted rounded-lg text-xs sm:text-sm space-y-1 sm:space-y-2">
|
<div className="p-2 sm:p-3 bg-muted rounded-lg text-xs sm:text-sm space-y-1 sm:space-y-2">
|
||||||
<p className="font-medium">
|
<p className="font-medium">
|
||||||
Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}:
|
Selected {totalSelected} item{totalSelected > 1 ? "s" : ""}:{" "}
|
||||||
{selectedFolders.length > 0 &&
|
{(() => {
|
||||||
` ${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}`}
|
const parts: string[] = [];
|
||||||
{selectedFiles.length > 0 &&
|
if (selectedFolders.length > 0) {
|
||||||
` ${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`}
|
parts.push(`${selectedFolders.length} folder${selectedFolders.length > 1 ? "s" : ""}`);
|
||||||
|
}
|
||||||
|
if (selectedFiles.length > 0) {
|
||||||
|
parts.push(`${selectedFiles.length} file${selectedFiles.length > 1 ? "s" : ""}`);
|
||||||
|
}
|
||||||
|
return parts.length > 0 ? `(${parts.join(" ")})` : "";
|
||||||
|
})()}
|
||||||
</p>
|
</p>
|
||||||
<div className="max-h-20 sm:max-h-24 overflow-y-auto space-y-1">
|
<div className="max-h-20 sm:max-h-24 overflow-y-auto space-y-1">
|
||||||
{selectedFolders.map((folder) => (
|
{selectedFolders.map((folder) => (
|
||||||
<p
|
<p
|
||||||
key={folder.id}
|
key={folder.id}
|
||||||
className="text-xs sm:text-sm text-muted-foreground truncate"
|
className="text-xs sm:text-sm text-muted-foreground truncate flex items-center gap-1.5"
|
||||||
title={folder.name}
|
title={folder.name}
|
||||||
>
|
>
|
||||||
📁 {folder.name}
|
<FolderClosed className="size-3.5 shrink-0 text-gray-500" />
|
||||||
|
{folder.name}
|
||||||
</p>
|
</p>
|
||||||
))}
|
))}
|
||||||
{selectedFiles.map((file) => (
|
{selectedFiles.map((file) => (
|
||||||
<p
|
<p
|
||||||
key={file.id}
|
key={file.id}
|
||||||
className="text-xs sm:text-sm text-muted-foreground truncate"
|
className="text-xs sm:text-sm text-muted-foreground truncate flex items-center gap-1.5"
|
||||||
title={file.name}
|
title={file.name}
|
||||||
>
|
>
|
||||||
📄 {file.name}
|
{getFileIconFromName(file.name)}
|
||||||
|
{file.name}
|
||||||
</p>
|
</p>
|
||||||
))}
|
))}
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
|
|
@ -5,13 +5,13 @@ import {
|
||||||
ChevronRight,
|
ChevronRight,
|
||||||
File,
|
File,
|
||||||
FileText,
|
FileText,
|
||||||
Folder,
|
FolderClosed,
|
||||||
FolderOpen,
|
FolderOpen,
|
||||||
HardDrive,
|
HardDrive,
|
||||||
Image,
|
Image,
|
||||||
Loader2,
|
Loader2,
|
||||||
Presentation,
|
Presentation,
|
||||||
Sheet,
|
FileSpreadsheet,
|
||||||
} from "lucide-react";
|
} from "lucide-react";
|
||||||
import { useState } from "react";
|
import { useState } from "react";
|
||||||
import { Checkbox } from "@/components/ui/checkbox";
|
import { Checkbox } from "@/components/ui/checkbox";
|
||||||
|
|
@ -53,16 +53,16 @@ interface GoogleDriveFolderTreeProps {
|
||||||
// Helper to get appropriate icon for file type
|
// Helper to get appropriate icon for file type
|
||||||
function getFileIcon(mimeType: string, className: string = "h-4 w-4") {
|
function getFileIcon(mimeType: string, className: string = "h-4 w-4") {
|
||||||
if (mimeType.includes("spreadsheet") || mimeType.includes("excel")) {
|
if (mimeType.includes("spreadsheet") || mimeType.includes("excel")) {
|
||||||
return <Sheet className={`${className} text-green-600`} />;
|
return <FileSpreadsheet className={`${className} text-green-500`} />;
|
||||||
}
|
}
|
||||||
if (mimeType.includes("presentation") || mimeType.includes("powerpoint")) {
|
if (mimeType.includes("presentation") || mimeType.includes("powerpoint")) {
|
||||||
return <Presentation className={`${className} text-orange-600`} />;
|
return <Presentation className={`${className} text-orange-500`} />;
|
||||||
}
|
}
|
||||||
if (mimeType.includes("document") || mimeType.includes("word") || mimeType.includes("text")) {
|
if (mimeType.includes("document") || mimeType.includes("word") || mimeType.includes("text")) {
|
||||||
return <FileText className={`${className} text-blue-600`} />;
|
return <FileText className={`${className} text-gray-500`} />;
|
||||||
}
|
}
|
||||||
if (mimeType.includes("image")) {
|
if (mimeType.includes("image")) {
|
||||||
return <Image className={`${className} text-purple-600`} />;
|
return <Image className={`${className} text-purple-500`} />;
|
||||||
}
|
}
|
||||||
return <File className={`${className} text-gray-500`} />;
|
return <File className={`${className} text-gray-500`} />;
|
||||||
}
|
}
|
||||||
|
|
@ -280,9 +280,9 @@ export function GoogleDriveFolderTree({
|
||||||
<div className="shrink-0">
|
<div className="shrink-0">
|
||||||
{isFolder ? (
|
{isFolder ? (
|
||||||
isExpanded ? (
|
isExpanded ? (
|
||||||
<FolderOpen className="h-3 w-3 sm:h-4 sm:w-4 text-blue-500" />
|
<FolderOpen className="h-3 w-3 sm:h-4 sm:w-4 text-gray-500" />
|
||||||
) : (
|
) : (
|
||||||
<Folder className="h-3 w-3 sm:h-4 sm:w-4 text-gray-500" />
|
<FolderClosed className="h-3 w-3 sm:h-4 sm:w-4 text-gray-500" />
|
||||||
)
|
)
|
||||||
) : (
|
) : (
|
||||||
getFileIcon(item.mimeType, "h-3 w-3 sm:h-4 sm:w-4")
|
getFileIcon(item.mimeType, "h-3 w-3 sm:h-4 sm:w-4")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue