fix(pifs): clean partial add pageindex cache

This commit is contained in:
BukeLy 2026-05-31 21:11:26 +08:00
parent eca5edd8a9
commit 2d55fd2f5a
2 changed files with 44 additions and 1 deletion

View file

@ -1756,11 +1756,12 @@ class PageIndexFileSystem:
records: list[dict[str, Any]],
preexisting_doc_ids: set[str],
) -> None:
doc_ids: list[str] = []
doc_ids = sorted(self._pageindex_cache_doc_ids() - preexisting_doc_ids)
for record in records:
doc_id = str(record.get("pageindex_doc_id") or "").strip()
if doc_id and doc_id not in preexisting_doc_ids:
doc_ids.append(doc_id)
doc_ids = sorted(set(doc_ids))
if not doc_ids:
return
workspace = self.pageindex_client_workspace

View file

@ -288,6 +288,48 @@ def test_add_markdown_insert_failure_removes_pageindex_cache(tmp_path, monkeypat
assert not list((workspace / "artifacts" / "raw").glob("*.json"))
def test_add_markdown_index_failure_removes_pageindex_cache_delta(tmp_path, monkeypatch):
    """Check that a failed ``add_file`` leaves no trace of a half-indexed document.

    ``PageIndexClient.index`` is replaced with a stub that registers a
    document, calls ``_save_doc`` for it, and only then raises. After
    ``add_file`` fails, neither the document's JSON file nor a ``_meta.json``
    entry for it may remain, and the filesystem listing, the summary index
    and the upload/text/raw artifact folders must all be empty.
    """
    from pageindex import PageIndexClient

    leaked_id = "doc_partial_before_raise"

    def fake_index(self, file_path, mode="auto"):
        # Store and save the document first so the failure happens *after*
        # the client has already recorded it.
        self.documents[leaked_id] = {
            "id": leaked_id,
            "type": "md",
            "path": str(Path(file_path).resolve()),
            "doc_name": "partial.md",
            "doc_description": "",
            "line_count": 3,
            "structure": [{"title": "Partial", "node_id": "0001", "nodes": []}],
        }
        self._save_doc(leaked_id)
        raise RuntimeError("index failed after cache write")

    monkeypatch.setattr(PageIndexClient, "index", fake_index)

    markdown_file = tmp_path / "partial.md"
    markdown_file.write_text("# Partial\n\nbody", encoding="utf-8")
    root = tmp_path / "workspace"
    filesystem = make_filesystem(root)
    artifacts = root / "artifacts"
    cache_dir = artifacts / "pageindex_client"

    with pytest.raises(RuntimeError, match="failed to build PageIndex tree"):
        filesystem.add_file(markdown_file, "/documents/reports")

    # The document written by the stub must be gone from the client cache.
    assert not (cache_dir / f"{leaked_id}.json").exists()
    meta_file = cache_dir / "_meta.json"
    if meta_file.exists():
        # The meta file is optional; when present it must not list the doc.
        assert leaked_id not in json.loads(meta_file.read_text(encoding="utf-8"))

    # Nothing should be visible through the filesystem or the summary index.
    listing = filesystem.browse("/", recursive=True)
    assert listing["files"] == []
    assert listing["folders"] == []
    assert filesystem.summary_projection_indexer.index.info()["document_count"] == 0

    # No upload, extracted-text or raw artifacts may be left behind either.
    for subdir, pattern in (("uploads", "**/*"), ("text", "*.txt"), ("raw", "*.json")):
        assert not list((artifacts / subdir).glob(pattern))
def test_add_markdown_failure_preserves_unrelated_pageindex_cache(tmp_path, monkeypatch):
from pageindex import PageIndexClient