# PageIndex/tests/test_pifs_find_maxdepth.py
#
# 399 lines
# 14 KiB
# Python

import json
from pathlib import Path
import pytest
def _register_find_fixture(tmp_path: Path):
    """Build a JSON-mode command executor over a small nested folder tree.

    Four text files are written under ``tmp_path / "source"`` and registered,
    in this order, into ``/documents``, ``/documents/team``,
    ``/documents/team/deep`` and ``/documents/team`` again.  Each file's
    content is ``"<title> fixture text"`` and each carries a ``department``
    metadata value ("ops" for the first three, "finance" for the last).

    Returns:
        A ``PIFSCommandExecutor`` created with ``json_output=True``.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    # (filename, folder_path, external_id, title, department) in registration order.
    documents = (
        ("root.txt", "/documents", "doc_root", "Root document", "ops"),
        ("child.txt", "/documents/team", "doc_child", "Child document", "ops"),
        ("deep.txt", "/documents/team/deep", "doc_deep", "Deep document", "ops"),
        ("other.txt", "/documents/team", "doc_other", "Other document", "finance"),
    )

    sources = tmp_path / "source"
    sources.mkdir()
    pifs = PageIndexFileSystem(workspace=tmp_path / "workspace")
    pifs.metadata.register_schema({"fields": {"department": "string"}})

    for filename, folder, doc_id, title, department in documents:
        path = sources / filename
        path.write_text(f"{title} fixture text", encoding="utf-8")
        pifs.register_file(
            storage_uri=path.as_uri(),
            source_path=f"docs/{filename}",
            folder_path=folder,
            external_id=doc_id,
            title=title,
            content=path.read_text(encoding="utf-8"),
            metadata={"department": department},
        )

    return PIFSCommandExecutor(pifs, json_output=True)
def _data(output: str):
    """Decode a JSON command response and return its ``"data"`` member."""
    payload = json.loads(output)
    return payload["data"]
def test_find_maxdepth_one_returns_direct_files_only(tmp_path):
    """``find -maxdepth 1 -type f`` on /documents yields only ``doc_root``."""
    shell = _register_find_fixture(tmp_path)
    response = shell.execute("find /documents -maxdepth 1 -type f")
    found_ids = [entry["external_id"] for entry in _data(response)]
    assert found_ids == ["doc_root"]
def test_find_output_is_path_first_without_session_refs(tmp_path):
    """Text-mode ``find`` output starts with the file path and has no ``ref_1``."""
    shell = _register_find_fixture(tmp_path)
    shell.json_output = False  # switch the fixture executor to text rendering

    rendered = shell.execute("find /documents -maxdepth 1 -type f")

    expected_prefix = "/documents/Root document id=doc_root file_ref=file_"
    assert rendered.startswith(expected_prefix)
    assert "ref_1" not in rendered
    assert "title=Root document" in rendered
def test_stable_path_targets_work_without_session_refs(tmp_path):
    """``stat`` and ``cat`` accept a quoted folder/title path as their target."""
    shell = _register_find_fixture(tmp_path)
    shell.json_output = False

    stat_output = shell.execute("stat '/documents/Root document'")
    cat_output = shell.execute("cat '/documents/Root document' --all")

    for expected in ("target: /documents/Root document", "document_id: doc_root"):
        assert expected in stat_output
    assert "Root document fixture text" in cat_output
def test_shell_limits_reject_context_expanding_counts(tmp_path):
    """Every command asking for one more than its cap raises ``PIFSCommandError``.

    The error message is expected to contain ``"at most <cap>"``.
    """
    from pageindex.filesystem.commands import PIFSCommandError

    shell = _register_find_fixture(tmp_path)

    # command line -> cap it exceeds by one; dict order is the execution order.
    over_limit_cases = {
        "find /documents --limit 51": 50,
        "grep --limit 21 Root /documents": 20,
        "ls /documents --limit 101": 100,
        "tree /documents --limit 201": 200,
        "head -n 101 /documents/Root\\ document": 100,
        "tail -n 101 /documents/Root\\ document": 100,
        "sed -n 1,101p /documents/Root\\ document": 100,
    }

    for command_line, cap in over_limit_cases.items():
        with pytest.raises(PIFSCommandError, match=f"at most {cap}"):
            shell.execute(command_line)
def test_grep_rejects_regex_alternation_patterns(tmp_path):
    """A ``|`` alternation in a grep pattern is rejected, standalone or piped."""
    from pageindex.filesystem.commands import PIFSCommandError

    shell = _register_find_fixture(tmp_path)
    shell.json_output = False

    recursive_grep = 'grep -R "Root|Child" /documents'
    with pytest.raises(PIFSCommandError, match="does not support regex alternation"):
        shell.execute(recursive_grep)

    piped_grep = 'find /documents -type f | grep "Root|Child"'
    with pytest.raises(PIFSCommandError, match="multiple grep commands"):
        shell.execute(piped_grep)
def test_stat_shell_output_includes_unified_metadata_status(tmp_path):
    """Text-mode ``stat`` shows caller and generated metadata plus the status line."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    text_file = tmp_path / "source.txt"
    text_file.write_text("fixture text", encoding="utf-8")

    class SummaryGenerator:
        """Stub generator returning the same fixed text for each requested field."""

        def generate(self, document, *, fields):
            generated = dict.fromkeys(fields, "Generated summary for retrieval.")
            return MetadataGenerationResult(values=generated)

    pifs = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    # Only the summary field is switched on in the policy.
    summary_only_policy = {
        "fields": {
            "summary": True,
            "doc_type": False,
            "domain": False,
            "topic": False,
        }
    }
    pifs.register_file(
        storage_uri=text_file.as_uri(),
        source_path="docs/source.txt",
        folder_path="/documents",
        external_id="doc_generated",
        title="Generated metadata document",
        content=text_file.read_text(encoding="utf-8"),
        metadata={"department": "ops"},
        metadata_policy=summary_only_policy,
    )

    shell = PIFSCommandExecutor(pifs, json_output=False)
    rendered = shell.execute("stat /documents/'Generated metadata document'")

    expected_fragments = (
        "metadata:",
        " department: ops",
        " summary: Generated summary for retrieval.",
        "metadata_status: generated",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_stat_field_reads_one_metadata_field_across_multiple_targets(tmp_path):
    """``stat --field summary`` prints the full field value for each target.

    Checks the text rendering (no ``[truncated]`` marker), the JSON payload
    shape, and that an unknown field name raises ``PIFSCommandError``.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    # Long tail so a truncating renderer would be noticed.
    repeated_tail = "full summary token " * 80

    class SummaryGenerator:
        """Stub generator producing a title line followed by the long tail."""

        def generate(self, document, *, fields):
            return MetadataGenerationResult(
                values={
                    name: f"Summary for {document.title}\n" + repeated_tail
                    for name in fields
                }
            )

    pifs = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    for number in (1, 2):
        text_file = tmp_path / f"source{number}.txt"
        text_file.write_text(f"fixture text {number}", encoding="utf-8")
        pifs.register_file(
            storage_uri=text_file.as_uri(),
            source_path=f"docs/source{number}.txt",
            folder_path="/documents",
            external_id=f"doc_summary_{number}",
            title=f"Summary document {number}",
            content=text_file.read_text(encoding="utf-8"),
            metadata_policy={
                "fields": {
                    "summary": True,
                    "doc_type": False,
                    "domain": False,
                    "topic": False,
                }
            },
        )

    command = "stat --field summary /documents/'Summary document 1' /documents/'Summary document 2'"

    # Text rendering.
    text_shell = PIFSCommandExecutor(pifs, json_output=False)
    rendered = text_shell.execute(command)
    assert "/documents/Summary document 1:" in rendered
    assert "summary: Summary for Summary document 1" in rendered
    assert "full summary token" in rendered
    assert "[truncated]" not in rendered
    assert "/documents/Summary document 2:" in rendered
    assert "summary: Summary for Summary document 2" in rendered

    # JSON rendering of the same command through a fresh executor.
    json_shell = PIFSCommandExecutor(pifs, json_output=True)
    payload = json.loads(json_shell.execute(command))["data"]
    assert payload["mode"] == "field_values"
    assert payload["target_count"] == 2
    first_entry = payload["data"][0]
    assert first_entry["field"] == "summary"
    assert first_entry["value"].startswith("Summary for Summary document 1\n")
    assert first_entry["value"].count("full summary token") == 80

    with pytest.raises(PIFSCommandError, match="Unknown metadata field"):
        text_shell.execute("stat --field missing_field /documents/'Summary document 1'")
def test_stat_field_rejects_more_than_twenty_targets(tmp_path):
    """``stat --field`` with 21 targets raises an "at most 20" error."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    document_count = 21  # one more than the limit named in the expected error
    pifs = PageIndexFileSystem(workspace=tmp_path / "workspace")

    for number in range(document_count):
        text_file = tmp_path / f"source{number}.txt"
        text_file.write_text(f"fixture text {number}", encoding="utf-8")
        pifs.register_file(
            storage_uri=text_file.as_uri(),
            source_path=f"docs/source{number}.txt",
            folder_path="/documents",
            external_id=f"doc_{number}",
            title=f"Document {number}",
            content=text_file.read_text(encoding="utf-8"),
            metadata={"department": "ops"},
        )

    quoted_targets = [
        f"/documents/'Document {number}'" for number in range(document_count)
    ]
    shell = PIFSCommandExecutor(pifs, json_output=False)
    with pytest.raises(PIFSCommandError, match="at most 20"):
        shell.execute("stat --field department " + " ".join(quoted_targets))
def test_register_rejects_pifs_owned_metadata_fields(tmp_path):
    """Passing ``summary`` in caller metadata makes ``register_file`` raise ``ValueError``."""
    from pageindex.filesystem import PageIndexFileSystem

    text_file = tmp_path / "source.txt"
    text_file.write_text("fixture text", encoding="utf-8")
    pifs = PageIndexFileSystem(workspace=tmp_path / "workspace")

    conflicting_metadata = {"summary": "caller summary"}
    with pytest.raises(ValueError, match="PIFS-owned generated field"):
        pifs.register_file(
            storage_uri=text_file.as_uri(),
            source_path="docs/source.txt",
            folder_path="/documents",
            external_id="doc_conflict",
            title="Conflict document",
            content=text_file.read_text(encoding="utf-8"),
            metadata=conflicting_metadata,
        )
def test_batch_metadata_status_generates_into_unified_metadata(tmp_path):
    """With a batch policy the summary is absent until ``batch_generate`` runs.

    Before the batch run the summary status is ``pending_submit``; afterwards
    the summary sits next to the caller-supplied ``department`` value and the
    status is ``generated``.
    """
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    text_file = tmp_path / "source.txt"
    text_file.write_text("fixture text", encoding="utf-8")

    class SummaryGenerator:
        """Stub generator that always returns one fixed summary value."""

        def generate(self, document, *, fields):
            return MetadataGenerationResult(values={"summary": "Batch generated summary."})

    pifs = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    batch_policy = {
        "batch": True,
        "fields": {
            "summary": True,
            "doc_type": False,
            "domain": False,
            "topic": False,
        },
    }
    ref = pifs.register_file(
        storage_uri=text_file.as_uri(),
        source_path="docs/source.txt",
        folder_path="/documents",
        external_id="doc_batch",
        title="Batch document",
        content=text_file.read_text(encoding="utf-8"),
        metadata={"department": "ops"},
        metadata_policy=batch_policy,
    )

    # State straight after registration, before any batch run.
    pending = pifs.store.get_file(ref)
    assert "summary" not in pending.metadata
    assert pending.metadata_status["fields"]["summary"]["status"] == "pending_submit"

    outcome = pifs.batch_generate()
    completed = pifs.store.get_file(ref)

    assert outcome["generated"] == 1
    assert completed.metadata["summary"] == "Batch generated summary."
    assert completed.metadata["department"] == "ops"
    assert completed.metadata_status["fields"]["summary"]["status"] == "generated"
def test_find_maxdepth_zero_type_directory_returns_start_folder(tmp_path):
    """``find -maxdepth 0 -type d`` returns just the starting folder."""
    shell = _register_find_fixture(tmp_path)
    response = shell.execute("find /documents -maxdepth 0 -type d")
    folder_paths = [entry["path"] for entry in _data(response)]
    assert folder_paths == ["/documents"]
def test_find_directory_output_renders_root_without_double_slash(tmp_path):
    """Text-mode directory listing from ``/`` never contains ``//``."""
    shell = _register_find_fixture(tmp_path)
    shell.json_output = False

    rendered = shell.execute("find / -maxdepth 1 -type d")
    first_line = rendered.splitlines()[0]

    assert first_line == "/ folders=1 files=0"
    assert "//" not in rendered
    assert "/documents/ folders=1 files=1" in rendered
def test_find_maxdepth_combines_with_where_and_limit(tmp_path):
    """``-maxdepth``, ``--where`` and ``--limit`` can be used in one ``find``."""
    shell = _register_find_fixture(tmp_path)
    command = """find /documents -maxdepth 2 -type f --where '{"department":"ops"}' --limit 1"""

    matches = _data(shell.execute(command))

    assert len(matches) == 1
    only_match = matches[0]
    assert only_match["metadata"]["department"] == "ops"
    # Depth 2 reaches /documents and /documents/team only.
    assert only_match["folder_path"] in {"/documents", "/documents/team"}
def test_find_maxdepth_rejects_invalid_values_and_unsupported_options(tmp_path):
    """Non-integer and negative depths, and ``-exec``, raise ``PIFSCommandError``."""
    from pageindex.filesystem.commands import PIFSCommandError

    shell = _register_find_fixture(tmp_path)
    depth_error = "find -maxdepth requires an integer >= 0"

    non_numeric_depth = "find /documents -maxdepth nope -type f"
    with pytest.raises(PIFSCommandError, match=depth_error):
        shell.execute(non_numeric_depth)

    negative_depth = "find /documents -maxdepth -1 -type f"
    with pytest.raises(PIFSCommandError, match=depth_error):
        shell.execute(negative_depth)

    with pytest.raises(PIFSCommandError, match="Unsupported find option: -exec"):
        shell.execute("find /documents -maxdepth 1 -type f -exec")
def test_find_maxdepth_is_advertised_to_agents(tmp_path):
    """The command surface text and capability map both mention ``-maxdepth``."""
    shell = _register_find_fixture(tmp_path)

    surface_text = shell.describe_available_command_surfaces()
    assert "-maxdepth N -type f|d" in surface_text

    lexical_capabilities = shell.command_capabilities()["retrieval"]["lexical"]
    assert lexical_capabilities["find_maxdepth"] is True
def test_where_path_error_points_to_folder_scope(tmp_path):
    """Filtering on ``path`` via ``--where`` fails with guidance in the message."""
    from pageindex.filesystem.commands import PIFSCommandError

    shell = _register_find_fixture(tmp_path)
    with pytest.raises(PIFSCommandError) as raised:
        shell.execute("""find --where '{"path":"/documents"}'""")

    error_text = str(raised.value)
    expected_hints = (
        "Folder paths are positional PIFS paths",
        "find /documents -type f",
        "stat --schema",
    )
    for hint in expected_hints:
        assert hint in error_text