mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
486 lines
18 KiB
Python
486 lines
18 KiB
Python
import json
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
|
|
class GeneratedMetadata:
|
|
def __init__(self):
|
|
self.calls = []
|
|
|
|
def generate(self, request, *, fields):
|
|
self.calls.append((request, list(fields)))
|
|
values = {
|
|
"summary": f"Summary for {request.title}: {request.text[:60]}",
|
|
"doc_type": "uploaded_file",
|
|
"domain": "workspace",
|
|
"topic": "pifs add",
|
|
}
|
|
return {field: values[field] for field in fields if field in values}
|
|
|
|
|
|
class StaticEmbedder:
|
|
def embed(self, texts):
|
|
return [[1.0, 0.0, 0.0] for _ in texts]
|
|
|
|
|
|
def make_summary_indexer(workspace: Path):
|
|
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
|
|
|
return SummaryProjectionIndexer(
|
|
workspace / "artifacts" / "projection_indexes",
|
|
embedder=StaticEmbedder(),
|
|
embedding_provider="test",
|
|
embedding_model="static",
|
|
embedding_dimensions=3,
|
|
)
|
|
|
|
|
|
def make_filesystem(workspace: Path):
|
|
from pageindex.filesystem import PageIndexFileSystem
|
|
|
|
return PageIndexFileSystem(
|
|
workspace=workspace,
|
|
metadata_generator=GeneratedMetadata(),
|
|
summary_projection_indexer=make_summary_indexer(workspace),
|
|
summary_projection_embedding_provider="test",
|
|
summary_projection_embedding_model="static",
|
|
summary_projection_embedding_dimensions=3,
|
|
)
|
|
|
|
|
|
def write_pageindex_client_doc(workspace: Path, doc_id: str, doc: dict) -> None:
|
|
workspace.mkdir(parents=True, exist_ok=True)
|
|
(workspace / f"{doc_id}.json").write_text(
|
|
json.dumps(doc, ensure_ascii=False, indent=2),
|
|
encoding="utf-8",
|
|
)
|
|
meta = {
|
|
doc_id: {
|
|
"type": doc.get("type", ""),
|
|
"doc_name": doc.get("doc_name", ""),
|
|
"doc_description": doc.get("doc_description", ""),
|
|
"path": doc.get("path", ""),
|
|
"line_count": doc.get("line_count"),
|
|
}
|
|
}
|
|
(workspace / "_meta.json").write_text(
|
|
json.dumps(meta, ensure_ascii=False, indent=2),
|
|
encoding="utf-8",
|
|
)
|
|
|
|
|
|
def test_add_text_folder_target_copies_artifact_indexes_summary_and_is_readable(tmp_path):
|
|
from pageindex.filesystem import PIFSCommandExecutor
|
|
|
|
source = tmp_path / "filing.txt"
|
|
source.write_text("alpha filing text for pifs add", encoding="utf-8")
|
|
workspace = tmp_path / "workspace"
|
|
filesystem = make_filesystem(workspace)
|
|
|
|
info = filesystem.add_file(str(source), "/documents/reports")
|
|
|
|
assert info["source_path"] == "documents/reports/filing.txt"
|
|
assert info["folder_path"] == "/documents/reports"
|
|
assert filesystem.folder_info("/documents/reports")["path"] == "/documents/reports"
|
|
assert info["storage_uri"] != source.as_uri()
|
|
assert "/artifacts/uploads/" in info["storage_uri"]
|
|
copied_path = Path(info["storage_uri"].removeprefix("file://"))
|
|
assert copied_path.read_text(encoding="utf-8") == "alpha filing text for pifs add"
|
|
assert copied_path.resolve() != source.resolve()
|
|
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
rendered = json.loads(executor.execute("cat /documents/reports/filing.txt --all"))
|
|
|
|
assert rendered["data"]["text"] == "alpha filing text for pifs add"
|
|
assert info["metadata"]["summary"].startswith("Summary for filing.txt")
|
|
assert filesystem.summary_projection_indexer.index.info()["document_count"] == 1
|
|
|
|
|
|
def test_add_rejects_same_folder_same_basename_without_overwrite(tmp_path):
|
|
from pageindex.filesystem import PIFSCommandExecutor
|
|
|
|
source = tmp_path / "conflict.txt"
|
|
source.write_text("first body", encoding="utf-8")
|
|
filesystem = make_filesystem(tmp_path / "workspace")
|
|
|
|
filesystem.add_file(source, "/documents")
|
|
source.write_text("second body must not overwrite", encoding="utf-8")
|
|
|
|
with pytest.raises(FileExistsError, match="already exists"):
|
|
filesystem.add_file(source, "/documents")
|
|
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
rendered = json.loads(executor.execute("cat /documents/conflict.txt --all"))
|
|
assert rendered["data"]["text"] == "first body"
|
|
|
|
|
|
def test_add_rejects_unsupported_type_before_registration(tmp_path):
|
|
source = tmp_path / "payload.json"
|
|
source.write_text('{"unsupported": true}', encoding="utf-8")
|
|
filesystem = make_filesystem(tmp_path / "workspace")
|
|
|
|
with pytest.raises(ValueError, match="Unsupported file type"):
|
|
filesystem.add_file(source, "/documents")
|
|
|
|
assert filesystem.browse("/", recursive=True)["files"] == []
|
|
assert not list((tmp_path / "workspace" / "artifacts" / "uploads").glob("**/*"))
|
|
|
|
|
|
def test_add_rejects_disabled_summary_projection_before_registration(tmp_path):
|
|
from pageindex.filesystem import PageIndexFileSystem
|
|
|
|
source = tmp_path / "disabled.txt"
|
|
source.write_text("must not register without summary vector", encoding="utf-8")
|
|
workspace = tmp_path / "workspace"
|
|
filesystem = PageIndexFileSystem(
|
|
workspace=workspace,
|
|
metadata_generator=GeneratedMetadata(),
|
|
summary_projection_index=False,
|
|
)
|
|
|
|
with pytest.raises(RuntimeError, match="summary projection index"):
|
|
filesystem.add_file(source, "/documents")
|
|
|
|
assert filesystem.browse("/", recursive=True)["files"] == []
|
|
assert not list((workspace / "artifacts" / "uploads").glob("**/*"))
|
|
assert not list((workspace / "artifacts" / "text").glob("*.txt"))
|
|
assert not list((workspace / "artifacts" / "raw").glob("*.json"))
|
|
|
|
|
|
def test_add_configures_semantic_retrieval_in_same_filesystem_instance(tmp_path):
|
|
source = tmp_path / "semantic.txt"
|
|
source.write_text("alpha semantic recall text", encoding="utf-8")
|
|
filesystem = make_filesystem(tmp_path / "workspace")
|
|
|
|
assert filesystem.semantic_retrieval_channels() == ()
|
|
|
|
filesystem.add_file(source, "/documents")
|
|
|
|
assert filesystem.semantic_retrieval_channels() == ("summary",)
|
|
results = filesystem.search_semantic_channel(
|
|
"summary",
|
|
"semantic recall",
|
|
scope={"folder_path": "/documents", "recursive": True},
|
|
limit=5,
|
|
)
|
|
assert [result.source_path for result in results] == ["documents/semantic.txt"]
|
|
|
|
|
|
def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monkeypatch):
|
|
from pageindex import PageIndexClient
|
|
from pageindex.filesystem import PIFSCommandExecutor
|
|
|
|
indexed_paths = []
|
|
|
|
def fake_index(self, file_path, mode="auto"):
|
|
indexed_paths.append(Path(file_path))
|
|
doc_id = "doc_added_md"
|
|
doc = {
|
|
"id": doc_id,
|
|
"type": "md",
|
|
"path": str(Path(file_path).resolve()),
|
|
"doc_name": "notes.md",
|
|
"doc_description": "",
|
|
"line_count": 3,
|
|
"structure": [
|
|
{
|
|
"title": "Notes",
|
|
"node_id": "0001",
|
|
"line_num": 1,
|
|
"text": "# Notes\n\ncopied markdown body",
|
|
"nodes": [],
|
|
}
|
|
],
|
|
}
|
|
write_pageindex_client_doc(self.workspace, doc_id, doc)
|
|
self.documents[doc_id] = doc
|
|
return doc_id
|
|
|
|
monkeypatch.setattr(PageIndexClient, "index", fake_index)
|
|
source = tmp_path / "notes.md"
|
|
source.write_text("# Notes\n\ncopied markdown body", encoding="utf-8")
|
|
filesystem = make_filesystem(tmp_path / "workspace")
|
|
|
|
info = filesystem.add_file(source, "/documents")
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
structure = json.loads(executor.execute("cat /documents/notes.md --structure"))
|
|
|
|
assert structure["data"]["available"] is True
|
|
assert structure["data"]["structure"][0]["title"] == "Notes"
|
|
assert indexed_paths == [Path(info["storage_uri"].removeprefix("file://"))]
|
|
assert indexed_paths[0].resolve() != source.resolve()
|
|
|
|
|
|
def test_add_failure_does_not_leave_visible_catalog_or_artifacts(tmp_path, monkeypatch):
|
|
source = tmp_path / "atomic.txt"
|
|
source.write_text("atomic body", encoding="utf-8")
|
|
workspace = tmp_path / "workspace"
|
|
filesystem = make_filesystem(workspace)
|
|
|
|
def fail_insert(records):
|
|
raise RuntimeError("catalog insert failed")
|
|
|
|
monkeypatch.setattr(filesystem.store, "insert_files", fail_insert)
|
|
|
|
with pytest.raises(RuntimeError, match="catalog insert failed"):
|
|
filesystem.add_file(source, "/documents")
|
|
|
|
assert filesystem.browse("/", recursive=True)["files"] == []
|
|
assert filesystem.summary_projection_indexer.index.info()["document_count"] == 0
|
|
assert not list((workspace / "artifacts" / "uploads").glob("**/*"))
|
|
assert not list((workspace / "artifacts" / "text").glob("*.txt"))
|
|
assert not list((workspace / "artifacts" / "raw").glob("*.json"))
|
|
|
|
|
|
def test_add_markdown_insert_failure_removes_pageindex_cache(tmp_path, monkeypatch):
|
|
from pageindex import PageIndexClient
|
|
|
|
def fake_index(self, file_path, mode="auto"):
|
|
doc_id = "doc_failed_add_md"
|
|
doc = {
|
|
"id": doc_id,
|
|
"type": "md",
|
|
"path": str(Path(file_path).resolve()),
|
|
"doc_name": "failed.md",
|
|
"doc_description": "",
|
|
"line_count": 3,
|
|
"structure": [
|
|
{
|
|
"title": "Failed",
|
|
"node_id": "0001",
|
|
"line_num": 1,
|
|
"text": "# Failed\n\nbody",
|
|
"nodes": [],
|
|
}
|
|
],
|
|
}
|
|
write_pageindex_client_doc(self.workspace, doc_id, doc)
|
|
self.documents[doc_id] = doc
|
|
return doc_id
|
|
|
|
monkeypatch.setattr(PageIndexClient, "index", fake_index)
|
|
source = tmp_path / "failed.md"
|
|
source.write_text("# Failed\n\nbody", encoding="utf-8")
|
|
workspace = tmp_path / "workspace"
|
|
filesystem = make_filesystem(workspace)
|
|
|
|
def fail_insert(records):
|
|
raise RuntimeError("catalog insert failed")
|
|
|
|
monkeypatch.setattr(filesystem.store, "insert_files", fail_insert)
|
|
|
|
with pytest.raises(RuntimeError, match="catalog insert failed"):
|
|
filesystem.add_file(source, "/documents/reports")
|
|
|
|
pageindex_workspace = workspace / "artifacts" / "pageindex_client"
|
|
assert not (pageindex_workspace / "doc_failed_add_md.json").exists()
|
|
meta_path = pageindex_workspace / "_meta.json"
|
|
if meta_path.exists():
|
|
meta = json.loads(meta_path.read_text(encoding="utf-8"))
|
|
assert "doc_failed_add_md" not in meta
|
|
listing = filesystem.browse("/", recursive=True)
|
|
assert listing["files"] == []
|
|
assert listing["folders"] == []
|
|
assert filesystem.summary_projection_indexer.index.info()["document_count"] == 0
|
|
assert not list((workspace / "artifacts" / "uploads").glob("**/*"))
|
|
assert not list((workspace / "artifacts" / "text").glob("*.txt"))
|
|
assert not list((workspace / "artifacts" / "raw").glob("*.json"))
|
|
|
|
|
|
def test_add_markdown_index_failure_removes_pageindex_cache_delta(tmp_path, monkeypatch):
|
|
from pageindex import PageIndexClient
|
|
|
|
def fake_index(self, file_path, mode="auto"):
|
|
doc_id = "doc_partial_before_raise"
|
|
doc = {
|
|
"id": doc_id,
|
|
"type": "md",
|
|
"path": str(Path(file_path).resolve()),
|
|
"doc_name": "partial.md",
|
|
"doc_description": "",
|
|
"line_count": 3,
|
|
"structure": [{"title": "Partial", "node_id": "0001", "nodes": []}],
|
|
}
|
|
self.documents[doc_id] = doc
|
|
self._save_doc(doc_id)
|
|
raise RuntimeError("index failed after cache write")
|
|
|
|
monkeypatch.setattr(PageIndexClient, "index", fake_index)
|
|
source = tmp_path / "partial.md"
|
|
source.write_text("# Partial\n\nbody", encoding="utf-8")
|
|
workspace = tmp_path / "workspace"
|
|
filesystem = make_filesystem(workspace)
|
|
pageindex_workspace = workspace / "artifacts" / "pageindex_client"
|
|
|
|
with pytest.raises(RuntimeError, match="failed to build PageIndex tree"):
|
|
filesystem.add_file(source, "/documents/reports")
|
|
|
|
assert not (pageindex_workspace / "doc_partial_before_raise.json").exists()
|
|
meta_path = pageindex_workspace / "_meta.json"
|
|
if meta_path.exists():
|
|
meta = json.loads(meta_path.read_text(encoding="utf-8"))
|
|
assert "doc_partial_before_raise" not in meta
|
|
listing = filesystem.browse("/", recursive=True)
|
|
assert listing["files"] == []
|
|
assert listing["folders"] == []
|
|
assert filesystem.summary_projection_indexer.index.info()["document_count"] == 0
|
|
assert not list((workspace / "artifacts" / "uploads").glob("**/*"))
|
|
assert not list((workspace / "artifacts" / "text").glob("*.txt"))
|
|
assert not list((workspace / "artifacts" / "raw").glob("*.json"))
|
|
|
|
|
|
def test_add_markdown_failure_preserves_unrelated_pageindex_cache(tmp_path, monkeypatch):
|
|
from pageindex import PageIndexClient
|
|
|
|
def fake_index(self, file_path, mode="auto"):
|
|
doc_id = "doc_failed_add_md"
|
|
doc = {
|
|
"id": doc_id,
|
|
"type": "md",
|
|
"path": str(Path(file_path).resolve()),
|
|
"doc_name": "failed.md",
|
|
"doc_description": "",
|
|
"line_count": 3,
|
|
"structure": [{"title": "Failed", "node_id": "0001", "nodes": []}],
|
|
}
|
|
self.documents[doc_id] = doc
|
|
self._save_doc(doc_id)
|
|
return doc_id
|
|
|
|
monkeypatch.setattr(PageIndexClient, "index", fake_index)
|
|
source = tmp_path / "failed.md"
|
|
source.write_text("# Failed\n\nbody", encoding="utf-8")
|
|
workspace = tmp_path / "workspace"
|
|
filesystem = make_filesystem(workspace)
|
|
pageindex_workspace = workspace / "artifacts" / "pageindex_client"
|
|
write_pageindex_client_doc(
|
|
pageindex_workspace,
|
|
"doc_unrelated",
|
|
{
|
|
"id": "doc_unrelated",
|
|
"type": "md",
|
|
"path": str((tmp_path / "unrelated.md").resolve()),
|
|
"doc_name": "unrelated.md",
|
|
"doc_description": "",
|
|
"line_count": 1,
|
|
"structure": [{"title": "Unrelated", "node_id": "0001", "nodes": []}],
|
|
},
|
|
)
|
|
|
|
def fail_insert(records):
|
|
raise RuntimeError("catalog insert failed")
|
|
|
|
monkeypatch.setattr(filesystem.store, "insert_files", fail_insert)
|
|
|
|
with pytest.raises(RuntimeError, match="catalog insert failed"):
|
|
filesystem.add_file(source, "/documents")
|
|
|
|
assert not (pageindex_workspace / "doc_failed_add_md.json").exists()
|
|
assert (pageindex_workspace / "doc_unrelated.json").exists()
|
|
meta = json.loads((pageindex_workspace / "_meta.json").read_text(encoding="utf-8"))
|
|
assert "doc_failed_add_md" not in meta
|
|
assert "doc_unrelated" in meta
|
|
|
|
|
|
def test_add_failure_after_summary_vector_rolls_back_catalog_and_vector(
|
|
tmp_path, monkeypatch
|
|
):
|
|
source = tmp_path / "post_vector.txt"
|
|
source.write_text("post vector rollback body", encoding="utf-8")
|
|
workspace = tmp_path / "workspace"
|
|
filesystem = make_filesystem(workspace)
|
|
|
|
def fail_status_update(*args, **kwargs):
|
|
raise RuntimeError("metadata status update failed")
|
|
|
|
monkeypatch.setattr(filesystem.store, "update_file_metadata_status", fail_status_update)
|
|
|
|
with pytest.raises(RuntimeError, match="metadata status update failed"):
|
|
filesystem.add_file(source, "/documents")
|
|
|
|
assert filesystem.browse("/", recursive=True)["files"] == []
|
|
assert filesystem.summary_projection_indexer.index.info()["document_count"] == 0
|
|
assert not list((workspace / "artifacts" / "uploads").glob("**/*"))
|
|
assert not list((workspace / "artifacts" / "text").glob("*.txt"))
|
|
assert not list((workspace / "artifacts" / "raw").glob("*.json"))
|
|
|
|
|
|
def test_add_failure_removes_nested_folders_created_only_for_add(tmp_path, monkeypatch):
|
|
source = tmp_path / "nested.txt"
|
|
source.write_text("nested rollback body", encoding="utf-8")
|
|
workspace = tmp_path / "workspace"
|
|
filesystem = make_filesystem(workspace)
|
|
|
|
def fail_status_update(*args, **kwargs):
|
|
raise RuntimeError("metadata status update failed")
|
|
|
|
monkeypatch.setattr(filesystem.store, "update_file_metadata_status", fail_status_update)
|
|
|
|
with pytest.raises(RuntimeError, match="metadata status update failed"):
|
|
filesystem.add_file(source, "/documents/reports")
|
|
|
|
listing = filesystem.browse("/", recursive=True)
|
|
assert listing["files"] == []
|
|
assert listing["folders"] == []
|
|
assert filesystem.summary_projection_indexer.index.info()["document_count"] == 0
|
|
assert not list((workspace / "artifacts" / "uploads").glob("**/*"))
|
|
assert not list((workspace / "artifacts" / "text").glob("*.txt"))
|
|
assert not list((workspace / "artifacts" / "raw").glob("*.json"))
|
|
|
|
|
|
def test_add_failure_preserves_preexisting_parent_folder(tmp_path, monkeypatch):
|
|
source = tmp_path / "nested.txt"
|
|
source.write_text("nested rollback body", encoding="utf-8")
|
|
workspace = tmp_path / "workspace"
|
|
filesystem = make_filesystem(workspace)
|
|
filesystem.create_folder("/documents")
|
|
|
|
def fail_status_update(*args, **kwargs):
|
|
raise RuntimeError("metadata status update failed")
|
|
|
|
monkeypatch.setattr(filesystem.store, "update_file_metadata_status", fail_status_update)
|
|
|
|
with pytest.raises(RuntimeError, match="metadata status update failed"):
|
|
filesystem.add_file(source, "/documents/reports")
|
|
|
|
listing = filesystem.browse("/", recursive=True)
|
|
assert listing["files"] == []
|
|
assert [folder["path"] for folder in listing["folders"]] == ["/documents"]
|
|
assert filesystem.summary_projection_indexer.index.info()["document_count"] == 0
|
|
|
|
|
|
def test_cli_add_uses_workspace_and_prints_added_file(monkeypatch, capsys, tmp_path):
|
|
from pageindex.filesystem import cli
|
|
|
|
source = tmp_path / "cli.txt"
|
|
source.write_text("cli body", encoding="utf-8")
|
|
calls = []
|
|
|
|
class FakeAddFileSystem:
|
|
def __init__(self, workspace):
|
|
self.workspace = Path(workspace)
|
|
|
|
def configure_existing_projection_retrieval(self):
|
|
return False
|
|
|
|
def add_file(self, physical_path, virtual_target):
|
|
calls.append((self.workspace, physical_path, virtual_target))
|
|
return {
|
|
"file_ref": "file_cli",
|
|
"path": "/documents/cli.txt",
|
|
"source_path": "documents/cli.txt",
|
|
"storage_uri": "file:///workspace/artifacts/uploads/file_cli/cli.txt",
|
|
}
|
|
|
|
monkeypatch.setattr(cli, "PageIndexFileSystem", FakeAddFileSystem)
|
|
|
|
status = cli.main(["--workspace", str(tmp_path / "workspace"), "add", str(source), "/documents"])
|
|
|
|
assert status == 0
|
|
assert calls == [(tmp_path / "workspace", str(source), "/documents")]
|
|
assert capsys.readouterr().out == (
|
|
"added: /documents/cli.txt\n"
|
|
"file_ref: file_cli\n"
|
|
"storage_uri: file:///workspace/artifacts/uploads/file_cli/cli.txt\n"
|
|
)
|