From c86d5727ed8dcec4efe1d5c22058c5e7fec1c159 Mon Sep 17 00:00:00 2001 From: Bukely_ Date: Tue, 26 May 2026 20:30:46 +0800 Subject: [PATCH] fix(filesystem): preserve pageindex registration errors Persist PageIndex tree build failure details in metadata_status and surface them through stat and structural reads. --- pageindex/filesystem/commands.py | 9 +++ pageindex/filesystem/core.py | 82 ++++++++++++++++++++++--- tests/test_pageindex_structural_read.py | 11 +++- 3 files changed, 91 insertions(+), 11 deletions(-) diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py index 270de4d..cc4535f 100644 --- a/pageindex/filesystem/commands.py +++ b/pageindex/filesystem/commands.py @@ -1258,6 +1258,15 @@ class PIFSCommandExecutor: metadata_status = data.get("metadata_status") or {} if metadata_status: lines.append(f"metadata_status: {metadata_status.get('status', '-')}") + pageindex_tree = metadata_status.get("pageindex_tree") or {} + if isinstance(pageindex_tree, dict) and pageindex_tree: + lines.append(f"pageindex_tree_status: {pageindex_tree.get('status', '-')}") + message = str(pageindex_tree.get("message") or "").strip() + error_type = str(pageindex_tree.get("error_type") or "").strip() + if error_type and message: + lines.append(f"pageindex_tree_error: {error_type}: {message}") + elif message or error_type: + lines.append(f"pageindex_tree_error: {message or error_type}") summary_projection = ( metadata_status.get("projection_indexes", {}).get("summary", {}) ) diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index b6cdef6..e2c6136 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -858,21 +858,47 @@ class PageIndexFileSystem: storage_uri: str, source_path: str, content_type: str, - ) -> tuple[str | None, str]: + ) -> tuple[str | None, str, dict[str, Any] | None]: if self._source_format(source_path, content_type) not in {"pdf", "markdown"}: - return None, "not_built" + return None, "not_built", None client = self._pageindex_client() source = self._canonical_source_path(storage_uri=storage_uri, source_path=source_path) cached_doc_id = self._find_cached_pageindex_doc_id(client, source) if cached_doc_id: - return cached_doc_id, "built" + return cached_doc_id, "built", None if source is None: - return None, "failed" + return None, "failed", self._pageindex_tree_failure_record( + source="PageIndexFileSystem.registration", + error_type="UnresolvableSourcePath", + message=( + "PageIndex source path must resolve to a local file path for " + "PDF/Markdown registration." + ), + ) try: doc_id = client.index(source) - return doc_id, "built" - except Exception: - return None, "failed" + return doc_id, "built", None + except Exception as exc: + return None, "failed", self._pageindex_tree_failure_record( + source="PageIndexClient.index", + error_type=exc.__class__.__name__, + message=str(exc) or exc.__class__.__name__, + ) + + @staticmethod + def _pageindex_tree_failure_record( + *, + source: str, + error_type: str, + message: str, + ) -> dict[str, Any]: + return { + "status": "failed", + "owner": "pageindex", + "source": source, + "error_type": error_type, + "message": message, + } def _find_cached_pageindex_doc_id( self, @@ -938,7 +964,11 @@ class PageIndexFileSystem: external_id = file.get("external_id") content = file.get("content") or "" content_type = file.get("content_type") or "text/plain" - pageindex_doc_id, pageindex_tree_status = self._registration_pageindex_pointer( + ( + pageindex_doc_id, + pageindex_tree_status, + pageindex_tree_failure, + ) = self._registration_pageindex_pointer( storage_uri=storage_uri, source_path=raw_source_path, content_type=content_type, @@ -961,6 +991,7 @@ class PageIndexFileSystem: metadata=metadata, status=file.get("metadata_status"), ) + self._attach_pageindex_tree_failure(metadata_status, pageindex_tree_failure) indexed_metadata = SQLiteFileSystemStore.indexed_metadata_values(metadata) searchable_metadata = dict(metadata) folder_path = normalize_path(file.get("folder_path") or "/") @@ -1116,6 +1147,10 @@ class PageIndexFileSystem: metadata=entry.metadata, status=entry.metadata_status.get("status"), ) + self._attach_pageindex_tree_failure( + metadata_status, + entry.metadata_status.get("pageindex_tree"), + ) return { "file_ref": entry.file_ref, "external_id": entry.external_id, @@ -1317,8 +1352,9 @@ class PageIndexFileSystem: source_path=entry.source_path, ) - @staticmethod + @classmethod def _structural_unavailable( + cls, mode: str, entry: Any, *, @@ -1326,6 +1362,9 @@ class PageIndexFileSystem: node_id: str | None = None, pages: str | None = None, ) -> dict[str, Any]: + pageindex_tree_error = cls._pageindex_tree_failure_message(entry.metadata_status) + if pageindex_tree_error and entry.pageindex_tree_status == "failed": + message = f"PageIndex tree build failed: {pageindex_tree_error}" result = { "mode": mode, "file_ref": entry.file_ref, @@ -1335,12 +1374,37 @@ class PageIndexFileSystem: "available": False, "message": message, } + if pageindex_tree_error: + result["pageindex_tree_error"] = pageindex_tree_error if node_id is not None: result["node_id"] = node_id if pages is not None: result["pages"] = pages return result + @staticmethod + def _attach_pageindex_tree_failure( + metadata_status: dict[str, Any], + pageindex_tree_failure: Any, + ) -> None: + if isinstance(pageindex_tree_failure, dict) and pageindex_tree_failure: + metadata_status["pageindex_tree"] = dict(pageindex_tree_failure) + + @staticmethod + def _pageindex_tree_failure_message(metadata_status: Any) -> str | None: + if not isinstance(metadata_status, dict): + return None + pageindex_tree = metadata_status.get("pageindex_tree") + if not isinstance(pageindex_tree, dict): + return None + if pageindex_tree.get("status") != "failed": + return None + message = str(pageindex_tree.get("message") or "").strip() + error_type = str(pageindex_tree.get("error_type") or "").strip() + if error_type and message: + return f"{error_type}: {message}" + return message or error_type or None + def _resolve_target(self, target: str) -> str: return self.store.resolve_file_ref(target) diff --git a/tests/test_pageindex_structural_read.py b/tests/test_pageindex_structural_read.py index d54d9a7..500e4c5 100644 --- a/tests/test_pageindex_structural_read.py +++ b/tests/test_pageindex_structural_read.py @@ -55,7 +55,7 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch): filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") def fail_index(*args, **kwargs): - raise RuntimeError("index failed") + raise RuntimeError("index failed: extractor unavailable") monkeypatch.setattr(PageIndexClient, "index", fail_index) filesystem.register_file( @@ -75,8 +75,15 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch): assert structure["data"]["mode"] == "structure" assert structure["data"]["available"] is False assert structure["data"]["status"] == "failed" - assert "PageIndexClient workspace" in structure["data"]["message"] + assert "RuntimeError: index failed: extractor unavailable" in structure["data"]["message"] assert stat["data"]["pageindex_tree_status"] == "failed" + assert stat["data"]["metadata_status"]["pageindex_tree"] == { + "status": "failed", + "owner": "pageindex", + "source": "PageIndexClient.index", + "error_type": "RuntimeError", + "message": "index failed: extractor unavailable", + } assert node["data"]["mode"] == "node" assert node["data"]["available"] is False