diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
index 7b433cf62..5c4878a04 100644
--- a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py
@@ -178,6 +178,22 @@ def _content_hash(content: str, search_space_id: int) -> str:
     return hashlib.sha256(f"{search_space_id}:{content}".encode()).hexdigest()
 
 
+def _compute_raw_file_hash(file_path: str) -> str:
+    """SHA-256 hash of the raw file bytes.
+
+    Much cheaper than ETL/OCR extraction -- only performs sequential I/O.
+    Used as a pre-filter to skip expensive content extraction when the
+    underlying file hasn't changed at all.
+    """
+    import hashlib
+
+    h = hashlib.sha256()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(8192), b""):
+            h.update(chunk)
+    return h.hexdigest()
+
+
 async def _compute_file_content_hash(
     file_path: str,
     filename: str,
@@ -630,6 +646,24 @@ async def index_local_folder(
                         skipped_count += 1
                         continue
 
+                    raw_hash = await asyncio.to_thread(
+                        _compute_raw_file_hash, file_path_abs
+                    )
+
+                    stored_raw_hash = (
+                        existing_document.document_metadata or {}
+                    ).get("raw_file_hash")
+                    if stored_raw_hash and stored_raw_hash == raw_hash:
+                        meta = dict(existing_document.document_metadata or {})
+                        meta["mtime"] = current_mtime
+                        existing_document.document_metadata = meta
+                        if not DocumentStatus.is_state(
+                            existing_document.status, DocumentStatus.READY
+                        ):
+                            existing_document.status = DocumentStatus.ready()
+                        skipped_count += 1
+                        continue
+
                     try:
                         estimated_pages = await _check_page_limit_or_skip(
                             page_limit_service, user_id, file_path_abs
@@ -653,6 +687,7 @@ async def index_local_folder(
                         if existing_document.content_hash == content_hash:
                             meta = dict(existing_document.document_metadata or {})
                             meta["mtime"] = current_mtime
+                            meta["raw_file_hash"] = raw_hash
                             existing_document.document_metadata = meta
                             if not DocumentStatus.is_state(
                                 existing_document.status, DocumentStatus.READY
@@ -687,6 +722,10 @@ async def index_local_folder(
                         skipped_count += 1
                         continue
 
+                    raw_hash = await asyncio.to_thread(
+                        _compute_raw_file_hash, file_path_abs
+                    )
+
                     doc = _build_connector_doc(
                         title=file_info["name"],
                         content=content,
@@ -702,6 +741,7 @@ async def index_local_folder(
                         "mtime": file_info["modified_at"].timestamp(),
                         "estimated_pages": estimated_pages,
                         "content_length": len(content),
+                        "raw_file_hash": raw_hash,
                     }
 
                 except Exception as e:
@@ -795,6 +835,7 @@ async def index_local_folder(
 
                 doc_meta = dict(result.document_metadata or {})
                 doc_meta["mtime"] = mtime_info.get("mtime")
+                doc_meta["raw_file_hash"] = mtime_info.get("raw_file_hash")
                 result.document_metadata = doc_meta
 
                 est = mtime_info.get("estimated_pages", 1)
@@ -988,6 +1029,26 @@ async def _index_single_file(
         DocumentType.LOCAL_FOLDER_FILE.value, unique_id, search_space_id
     )
 
+    raw_hash = await asyncio.to_thread(_compute_raw_file_hash, str(full_path))
+
+    existing = await check_document_by_unique_identifier(session, uid_hash)
+
+    if existing:
+        stored_raw_hash = (existing.document_metadata or {}).get(
+            "raw_file_hash"
+        )
+        if stored_raw_hash and stored_raw_hash == raw_hash:
+            mtime = full_path.stat().st_mtime
+            meta = dict(existing.document_metadata or {})
+            meta["mtime"] = mtime
+            existing.document_metadata = meta
+            if not DocumentStatus.is_state(
+                existing.status, DocumentStatus.READY
+            ):
+                existing.status = DocumentStatus.ready()
+            await session.commit()
+            return 0, 0, None
+
     page_limit_service = PageLimitService(session)
     try:
         estimated_pages = await _check_page_limit_or_skip(
@@ -1006,13 +1067,12 @@ async def _index_single_file(
     if not content.strip():
         return 0, 1, None
 
-    existing = await check_document_by_unique_identifier(session, uid_hash)
-
     if existing:
         if existing.content_hash == content_hash:
             mtime = full_path.stat().st_mtime
             meta = dict(existing.document_metadata or {})
             meta["mtime"] = mtime
+            meta["raw_file_hash"] = raw_hash
             existing.document_metadata = meta
             await session.commit()
             return 0, 1, None
@@ -1055,6 +1115,7 @@ async def _index_single_file(
     await session.refresh(db_doc)
     doc_meta = dict(db_doc.document_metadata or {})
     doc_meta["mtime"] = mtime
+    doc_meta["raw_file_hash"] = raw_hash
     db_doc.document_metadata = doc_meta
     await session.commit()
 
@@ -1236,6 +1297,29 @@ async def index_uploaded_files(
                         search_space_id,
                     )
 
+                    raw_hash = await asyncio.to_thread(
+                        _compute_raw_file_hash, temp_path
+                    )
+
+                    existing = await check_document_by_unique_identifier(
+                        session, uid_hash
+                    )
+
+                    if existing:
+                        stored_raw_hash = (existing.document_metadata or {}).get(
+                            "raw_file_hash"
+                        )
+                        if stored_raw_hash and stored_raw_hash == raw_hash:
+                            meta = dict(existing.document_metadata or {})
+                            meta["mtime"] = datetime.now(UTC).timestamp()
+                            existing.document_metadata = meta
+                            if not DocumentStatus.is_state(
+                                existing.status, DocumentStatus.READY
+                            ):
+                                existing.status = DocumentStatus.ready()
+                            await session.commit()
+                            continue
+
                     try:
                         estimated_pages = await _check_page_limit_or_skip(
                             page_limit_service, user_id, temp_path
@@ -1259,14 +1343,11 @@ async def index_uploaded_files(
                         failed_count += 1
                         continue
 
-                    existing = await check_document_by_unique_identifier(
-                        session, uid_hash
-                    )
-
                     if existing:
                         if existing.content_hash == content_hash:
                             meta = dict(existing.document_metadata or {})
                             meta["mtime"] = datetime.now(UTC).timestamp()
+                            meta["raw_file_hash"] = raw_hash
                             existing.document_metadata = meta
                             if not DocumentStatus.is_state(
                                 existing.status, DocumentStatus.READY
@@ -1312,6 +1393,7 @@ async def index_uploaded_files(
                     await session.refresh(db_doc)
                     doc_meta = dict(db_doc.document_metadata or {})
                     doc_meta["mtime"] = datetime.now(UTC).timestamp()
+                    doc_meta["raw_file_hash"] = raw_hash
                     db_doc.document_metadata = doc_meta
                     await session.commit()
 