feat: implement raw file hash computation to optimize content extraction during local folder indexing

Anish Sarkar 2026-04-08 16:28:51 +05:30
parent b76a6babc1
commit 60eb1e4060


@@ -178,6 +178,22 @@ def _content_hash(content: str, search_space_id: int) -> str:
    return hashlib.sha256(f"{search_space_id}:{content}".encode()).hexdigest()


def _compute_raw_file_hash(file_path: str) -> str:
    """SHA-256 hash of the raw file bytes.

    Much cheaper than ETL/OCR extraction -- only performs sequential I/O.
    Used as a pre-filter to skip expensive content extraction when the
    underlying file hasn't changed at all.
    """
    import hashlib

    h = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()


async def _compute_file_content_hash(
    file_path: str,
    filename: str,
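
A note on the new helper: the 8 KiB chunk loop keeps memory use flat regardless of file size. As a point of comparison, here is a minimal standalone sketch of an equivalent helper (raw_file_sha256 is an illustrative name, not part of the commit); on Python 3.11+, the hand-rolled loop can be replaced with the stdlib's hashlib.file_digest, which streams the file internally:

import hashlib
import sys


def raw_file_sha256(path: str) -> str:
    """Hash raw file bytes without loading the whole file into memory."""
    with open(path, "rb") as f:
        if sys.version_info >= (3, 11):
            # hashlib.file_digest (Python 3.11+) streams the file in C code.
            return hashlib.file_digest(f, "sha256").hexdigest()
        h = hashlib.sha256()
        # iter() with a sentinel: call f.read(8192) until it returns b"".
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
        return h.hexdigest()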
@@ -630,6 +646,24 @@ async def index_local_folder(
                    skipped_count += 1
                    continue

                raw_hash = await asyncio.to_thread(
                    _compute_raw_file_hash, file_path_abs
                )
                stored_raw_hash = (
                    existing_document.document_metadata or {}
                ).get("raw_file_hash")
                if stored_raw_hash and stored_raw_hash == raw_hash:
                    meta = dict(existing_document.document_metadata or {})
                    meta["mtime"] = current_mtime
                    existing_document.document_metadata = meta
                    if not DocumentStatus.is_state(
                        existing_document.status, DocumentStatus.READY
                    ):
                        existing_document.status = DocumentStatus.ready()
                    skipped_count += 1
                    continue

            try:
                estimated_pages = await _check_page_limit_or_skip(
                    page_limit_service, user_id, file_path_abs
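
The helper itself is synchronous by design; the call sites wrap it in asyncio.to_thread so the sequential disk reads run on a worker thread instead of blocking the event loop. A self-contained sketch of that offloading pattern (the file name and function names are illustrative):

import asyncio
import hashlib
from pathlib import Path


def blocking_sha256(path: str) -> str:
    # Plain synchronous hashing; called directly from a coroutine, this
    # would stall the event loop for the duration of the disk reads.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()


async def main() -> None:
    Path("example.bin").write_bytes(b"payload " * 1024)  # sample input
    # asyncio.to_thread runs the blocking call on a worker thread and
    # awaits the result, so other coroutines keep making progress.
    digest = await asyncio.to_thread(blocking_sha256, "example.bin")
    print(digest)


asyncio.run(main())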
@@ -653,6 +687,7 @@ async def index_local_folder(
                if existing_document.content_hash == content_hash:
                    meta = dict(existing_document.document_metadata or {})
                    meta["mtime"] = current_mtime
                    meta["raw_file_hash"] = raw_hash
                    existing_document.document_metadata = meta
                    if not DocumentStatus.is_state(
                        existing_document.status, DocumentStatus.READY
@@ -687,6 +722,10 @@ async def index_local_folder(
                    skipped_count += 1
                    continue

                raw_hash = await asyncio.to_thread(
                    _compute_raw_file_hash, file_path_abs
                )
                doc = _build_connector_doc(
                    title=file_info["name"],
                    content=content,
@@ -702,6 +741,7 @@ async def index_local_folder(
                    "mtime": file_info["modified_at"].timestamp(),
                    "estimated_pages": estimated_pages,
                    "content_length": len(content),
                    "raw_file_hash": raw_hash,
                }

            except Exception as e:
@@ -795,6 +835,7 @@ async def index_local_folder(
            doc_meta = dict(result.document_metadata or {})
            doc_meta["mtime"] = mtime_info.get("mtime")
            doc_meta["raw_file_hash"] = mtime_info.get("raw_file_hash")
            result.document_metadata = doc_meta

            est = mtime_info.get("estimated_pages", 1)
@@ -988,6 +1029,26 @@ async def _index_single_file(
        DocumentType.LOCAL_FOLDER_FILE.value, unique_id, search_space_id
    )

    raw_hash = await asyncio.to_thread(_compute_raw_file_hash, str(full_path))

    existing = await check_document_by_unique_identifier(session, uid_hash)
    if existing:
        stored_raw_hash = (existing.document_metadata or {}).get(
            "raw_file_hash"
        )
        if stored_raw_hash and stored_raw_hash == raw_hash:
            mtime = full_path.stat().st_mtime
            meta = dict(existing.document_metadata or {})
            meta["mtime"] = mtime
            existing.document_metadata = meta
            if not DocumentStatus.is_state(
                existing.status, DocumentStatus.READY
            ):
                existing.status = DocumentStatus.ready()
            await session.commit()
            return 0, 0, None

    page_limit_service = PageLimitService(session)
    try:
        estimated_pages = await _check_page_limit_or_skip(
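
Throughout the diff, document_metadata is copied into a fresh dict and reassigned rather than mutated in place. Assuming the column is a plain SQLAlchemy JSON type without MutableDict tracking, an in-place mutation would not mark the attribute dirty and no UPDATE would be emitted on commit; the reassignment is what makes the change visible to the unit of work. A minimal sketch of the pattern under that assumption (model and function names are illustrative):

from sqlalchemy import JSON, Integer
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class Document(Base):
    __tablename__ = "documents"
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    document_metadata: Mapped[dict | None] = mapped_column(JSON)


def record_raw_hash(doc: Document, raw_hash: str, mtime: float) -> None:
    meta = dict(doc.document_metadata or {})  # copy; never mutate in place
    meta["raw_file_hash"] = raw_hash
    meta["mtime"] = mtime
    doc.document_metadata = meta  # reassignment marks the attribute dirty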
@@ -1006,13 +1067,12 @@ async def _index_single_file(
        if not content.strip():
            return 0, 1, None

        existing = await check_document_by_unique_identifier(session, uid_hash)
        if existing:
            if existing.content_hash == content_hash:
                mtime = full_path.stat().st_mtime
                meta = dict(existing.document_metadata or {})
                meta["mtime"] = mtime
                meta["raw_file_hash"] = raw_hash
                existing.document_metadata = meta
                await session.commit()
                return 0, 1, None
@@ -1055,6 +1115,7 @@ async def _index_single_file(
        await session.refresh(db_doc)
        doc_meta = dict(db_doc.document_metadata or {})
        doc_meta["mtime"] = mtime
        doc_meta["raw_file_hash"] = raw_hash
        db_doc.document_metadata = doc_meta
        await session.commit()
@@ -1236,6 +1297,29 @@ async def index_uploaded_files(
                search_space_id,
            )

            raw_hash = await asyncio.to_thread(
                _compute_raw_file_hash, temp_path
            )

            existing = await check_document_by_unique_identifier(
                session, uid_hash
            )
            if existing:
                stored_raw_hash = (existing.document_metadata or {}).get(
                    "raw_file_hash"
                )
                if stored_raw_hash and stored_raw_hash == raw_hash:
                    meta = dict(existing.document_metadata or {})
                    meta["mtime"] = datetime.now(UTC).timestamp()
                    existing.document_metadata = meta
                    if not DocumentStatus.is_state(
                        existing.status, DocumentStatus.READY
                    ):
                        existing.status = DocumentStatus.ready()
                    await session.commit()
                    continue

            try:
                estimated_pages = await _check_page_limit_or_skip(
                    page_limit_service, user_id, temp_path
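
Note the truthiness guard on stored_raw_hash: documents indexed before this change carry no raw_file_hash in their metadata, so the pre-filter falls through to full extraction, and the content-hash branches below backfill the hash for next time. The decision reduces to a small predicate, sketched here with a hypothetical name:

def raw_hash_unchanged(stored: str | None, current: str) -> bool:
    # Legacy documents have no stored hash; they must take the slow
    # extraction path once so the hash can be backfilled.
    return bool(stored) and stored == current


assert raw_hash_unchanged("abc", "abc") is True   # unchanged file: skip
assert raw_hash_unchanged("abc", "xyz") is False  # changed file: re-extract
assert raw_hash_unchanged(None, "abc") is False   # legacy doc: extract + backfill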
@@ -1259,14 +1343,11 @@ async def index_uploaded_files(
                failed_count += 1
                continue

            existing = await check_document_by_unique_identifier(
                session, uid_hash
            )
            if existing:
                if existing.content_hash == content_hash:
                    meta = dict(existing.document_metadata or {})
                    meta["mtime"] = datetime.now(UTC).timestamp()
                    meta["raw_file_hash"] = raw_hash
                    existing.document_metadata = meta
                    if not DocumentStatus.is_state(
                        existing.status, DocumentStatus.READY
@@ -1312,6 +1393,7 @@ async def index_uploaded_files(
            await session.refresh(db_doc)
            doc_meta = dict(db_doc.document_metadata or {})
            doc_meta["mtime"] = datetime.now(UTC).timestamp()
            doc_meta["raw_file_hash"] = raw_hash
            db_doc.document_metadata = doc_meta
            await session.commit()