mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-15 20:05:14 +02:00
refactor: remove source paths from PIFS
This commit is contained in:
parent
b9e30952ad
commit
dc4de3116f
22 changed files with 324 additions and 528 deletions
|
|
@ -21,7 +21,6 @@ def test_insert_files_does_not_disable_sqlite_synchronous(tmp_path):
|
|||
"file_ref": "ref_report",
|
||||
"external_id": "doc_report",
|
||||
"storage_uri": "file:///tmp/report.pdf",
|
||||
"source_path": "documents/report.pdf",
|
||||
"folder_path": "/documents",
|
||||
"title": "Report",
|
||||
"descriptor": "documents/report.pdf",
|
||||
|
|
|
|||
|
|
@ -20,7 +20,6 @@ def test_metadata_generator_uses_provider_parameter():
|
|||
file_ref="file_a",
|
||||
external_id="doc_a",
|
||||
title="A",
|
||||
source_path="docs/a.txt",
|
||||
content_type="text/plain",
|
||||
source_type=None,
|
||||
text="hello",
|
||||
|
|
|
|||
|
|
@ -135,7 +135,6 @@ def _register_browse_file(
|
|||
filesystem.metadata_generator = SummaryGenerator()
|
||||
return filesystem.register_file(
|
||||
storage_uri=f"file:///tmp/{external_id}.txt",
|
||||
source_path=f"documents/{external_id}.txt",
|
||||
folder_path=folder_path,
|
||||
external_id=external_id,
|
||||
title=f"{external_id}.txt",
|
||||
|
|
@ -427,7 +426,7 @@ def test_browse_shell_output_uses_fixed_blocks_with_pagination_command(tmp_path)
|
|||
assert "score:" not in rendered
|
||||
|
||||
|
||||
def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp_path):
|
||||
def test_browse_shell_path_uses_virtual_locator_when_source_collides(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
|
|
@ -443,7 +442,6 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp
|
|||
)
|
||||
first_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/first.json",
|
||||
source_path="shared/source.json",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_first",
|
||||
title="First",
|
||||
|
|
@ -459,7 +457,6 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp
|
|||
)
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/second.json",
|
||||
source_path="shared/source.json",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_second",
|
||||
title="Second",
|
||||
|
|
@ -478,13 +475,52 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp
|
|||
|
||||
rendered = executor.execute('browse /documents "first"')
|
||||
|
||||
assert "path: dsid_first" in rendered
|
||||
assert "path: /documents/First" in rendered
|
||||
assert "path: /shared/source.json" not in rendered
|
||||
assert filesystem.store.resolve_file_ref("dsid_first") == first_ref
|
||||
with pytest.raises(KeyError, match="Ambiguous file target"):
|
||||
assert filesystem.store.resolve_file_ref("/documents/First") == first_ref
|
||||
with pytest.raises(KeyError, match="Unknown file target"):
|
||||
filesystem.store.resolve_file_ref("/shared/source.json")
|
||||
|
||||
|
||||
def test_browse_shell_path_never_returns_storage_uri_path(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
class SummaryGenerator:
|
||||
def generate(self, document, *, fields):
|
||||
return MetadataGenerationResult(
|
||||
values={"summary": "summary for physical source report"}
|
||||
)
|
||||
|
||||
filesystem = PageIndexFileSystem(
|
||||
workspace=tmp_path / "workspace",
|
||||
metadata_generator=SummaryGenerator(),
|
||||
)
|
||||
file_ref = filesystem.register_file(
|
||||
storage_uri="file:///Users/chengjie/Downloads/source/report.pdf",
|
||||
folder_path="/documents/reports",
|
||||
external_id="dsid_report",
|
||||
title="report.pdf",
|
||||
content="physical source report content",
|
||||
metadata_policy={
|
||||
"fields": {
|
||||
"summary": True,
|
||||
"doc_type": False,
|
||||
"domain": False,
|
||||
"topic": False,
|
||||
}
|
||||
},
|
||||
)
|
||||
filesystem.semantic_retrieval_backend = BrowseBackend(["dsid_report"])
|
||||
executor = PIFSCommandExecutor(filesystem)
|
||||
|
||||
rendered = executor.execute('browse /documents/reports "physical source"')
|
||||
|
||||
assert "path: /documents/reports/report.pdf" in rendered
|
||||
assert "/Users/chengjie/Downloads" not in rendered
|
||||
assert filesystem.store.resolve_file_ref("/documents/reports/report.pdf") == file_ref
|
||||
|
||||
|
||||
def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
|
@ -501,7 +537,6 @@ def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path
|
|||
)
|
||||
file_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/report.pdf",
|
||||
source_path="examples/documents/report.pdf",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_report",
|
||||
title="report.pdf",
|
||||
|
|
@ -525,14 +560,13 @@ def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path
|
|||
)
|
||||
|
||||
assert "source_type" not in backend.calls[0][2]
|
||||
assert "source_path" not in backend.calls[0][2]
|
||||
assert result["data"]["data"][0]["path"] == "/examples/documents/report.pdf"
|
||||
assert result["data"]["data"][0]["path"] == "/documents/report.pdf"
|
||||
assert result["data"]["data"][0]["summary"] == "Federal Reserve annual report summary"
|
||||
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref
|
||||
|
||||
|
||||
def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
def test_register_file_rejects_duplicate_title_in_folder(tmp_path):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
class SummaryGenerator:
|
||||
|
|
@ -545,9 +579,8 @@ def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path):
|
|||
workspace=tmp_path / "workspace",
|
||||
metadata_generator=SummaryGenerator(),
|
||||
)
|
||||
first_ref = filesystem.register_file(
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/first.json",
|
||||
source_path="slack/dsid_first.json",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_first",
|
||||
title="announcements",
|
||||
|
|
@ -561,34 +594,25 @@ def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path):
|
|||
}
|
||||
},
|
||||
)
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/second.json",
|
||||
source_path="slack/dsid_second.json",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_second",
|
||||
title="announcements",
|
||||
content="second announcement mentions unrelated maintenance.",
|
||||
metadata_policy={
|
||||
"fields": {
|
||||
"summary": True,
|
||||
"doc_type": False,
|
||||
"domain": False,
|
||||
"topic": False,
|
||||
}
|
||||
},
|
||||
)
|
||||
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
result = json.loads(executor.execute('browse /documents "H200 reservations"'))
|
||||
|
||||
assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json"
|
||||
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
|
||||
with pytest.raises(KeyError, match="Ambiguous file target"):
|
||||
filesystem.store.resolve_file_ref("/documents/announcements")
|
||||
with pytest.raises(FileExistsError, match="File already exists at /documents/announcements"):
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/second.json",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_second",
|
||||
title="announcements",
|
||||
content="second announcement mentions unrelated maintenance.",
|
||||
metadata_policy={
|
||||
"fields": {
|
||||
"summary": True,
|
||||
"doc_type": False,
|
||||
"domain": False,
|
||||
"topic": False,
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path):
|
||||
def test_browse_path_uses_virtual_title_when_storage_paths_are_unrelated(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
|
|
@ -604,7 +628,6 @@ def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path):
|
|||
)
|
||||
first_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/first.json",
|
||||
source_path="shared/source.json",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_first",
|
||||
title="First",
|
||||
|
|
@ -620,7 +643,6 @@ def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path):
|
|||
)
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/second.json",
|
||||
source_path="shared/source.json",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_second",
|
||||
title="Second",
|
||||
|
|
@ -639,7 +661,7 @@ def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path):
|
|||
|
||||
result = json.loads(executor.execute('browse /documents "first"'))
|
||||
|
||||
assert result["data"]["data"][0]["path"] == "dsid_first"
|
||||
assert result["data"]["data"][0]["path"] == "/documents/First"
|
||||
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
|
||||
|
||||
|
||||
|
|
@ -663,7 +685,6 @@ def test_old_semantic_commands_are_unsupported_even_when_indexes_exist(tmp_path)
|
|||
)
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/market-note.pdf",
|
||||
source_path="examples/documents/market-note.pdf",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_market_note",
|
||||
title="market-note.pdf",
|
||||
|
|
@ -695,13 +716,13 @@ def test_old_semantic_commands_are_unsupported_even_when_indexes_exist(tmp_path)
|
|||
executor.execute('browse /documents "Federal Reserve" --space entity')
|
||||
)
|
||||
assert entity["data"]["data"][0]["summary"] == "Risk and compliance summary"
|
||||
assert entity["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf"
|
||||
assert entity["data"]["data"][0]["path"] == "/documents/market-note.pdf"
|
||||
|
||||
relation = json.loads(
|
||||
executor.execute('browse /documents "Disney valuation" --space relation')
|
||||
)
|
||||
assert relation["data"]["data"][0]["summary"] == "Risk and compliance summary"
|
||||
assert relation["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf"
|
||||
assert relation["data"]["data"][0]["path"] == "/documents/market-note.pdf"
|
||||
|
||||
|
||||
def test_find_name_is_lexical_and_find_relation_is_not_semantic_alias(tmp_path):
|
||||
|
|
@ -711,7 +732,6 @@ def test_find_name_is_lexical_and_find_relation_is_not_semantic_alias(tmp_path):
|
|||
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/report.pdf",
|
||||
source_path="examples/documents/report.pdf",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_report",
|
||||
title="Annual report",
|
||||
|
|
@ -755,7 +775,7 @@ def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_
|
|||
assert "semantic-grep" not in rendered
|
||||
|
||||
|
||||
def test_grep_source_file_requires_terms_on_same_line(tmp_path):
|
||||
def test_grep_file_requires_terms_on_same_line(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
|
||||
source_dir = tmp_path / "source" / "documents"
|
||||
|
|
@ -769,11 +789,10 @@ def test_grep_source_file_requires_terms_on_same_line(tmp_path):
|
|||
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
|
||||
filesystem.register_file(
|
||||
storage_uri=str(source),
|
||||
source_path="documents/split.json",
|
||||
folder_path="/documents",
|
||||
external_id="doc_split_terms",
|
||||
title="Split source terms",
|
||||
content="registered artifact without the searched tokens",
|
||||
content=source.read_text(encoding="utf-8"),
|
||||
)
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
|
|
@ -813,7 +832,6 @@ def test_existing_summary_projection_index_uses_current_config_when_dimensions_m
|
|||
file_ref="file_a",
|
||||
external_id="doc_a",
|
||||
source_type="documents",
|
||||
source_path="documents/a.pdf",
|
||||
title="A",
|
||||
text="summary",
|
||||
vector=[1.0, 0.0, 0.0],
|
||||
|
|
@ -879,7 +897,6 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
|
|||
file_ref="file_a",
|
||||
external_id="doc_a",
|
||||
source_type="documents",
|
||||
source_path="documents/a.pdf",
|
||||
title="A",
|
||||
text="summary",
|
||||
vector=[1.0, 0.0, 0.0],
|
||||
|
|
@ -948,7 +965,6 @@ def test_browse_semantic_files_uses_summary_projection_when_only_summary_availab
|
|||
)
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/source.txt",
|
||||
folder_path="/documents",
|
||||
external_id="doc_summary_only",
|
||||
title="Operations note",
|
||||
|
|
|
|||
|
|
@ -60,7 +60,6 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch):
|
|||
monkeypatch.setattr(PageIndexClient, "index", fail_index)
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/report.md",
|
||||
external_id="dsid_structural_missing",
|
||||
title="Structural report",
|
||||
content=source.read_text(encoding="utf-8"),
|
||||
|
|
@ -152,14 +151,12 @@ def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_ft
|
|||
|
||||
filesystem.register_file(
|
||||
storage_uri=source_pdf.as_uri(),
|
||||
source_path="docs/report.pdf",
|
||||
external_id="dsid_pdf_extracted",
|
||||
title="PDF extracted",
|
||||
content="CALLER PDF CONTENT MUST NOT REACH GENERATOR",
|
||||
)
|
||||
filesystem.register_file(
|
||||
storage_uri=source_md.as_uri(),
|
||||
source_path="docs/notes.md",
|
||||
external_id="dsid_md_extracted",
|
||||
title="Markdown extracted",
|
||||
content="CALLER MD CONTENT MUST NOT REACH GENERATOR",
|
||||
|
|
@ -167,8 +164,12 @@ def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_ft
|
|||
|
||||
pdf_request = generator.calls[0][0]
|
||||
md_request = generator.calls[1][0]
|
||||
pdf_stat = filesystem.store.file_info("dsid_pdf_extracted")
|
||||
md_stat = filesystem.store.file_info("dsid_md_extracted")
|
||||
pdf_entry = filesystem.store.get_file(
|
||||
filesystem.store.resolve_file_ref("dsid_pdf_extracted")
|
||||
)
|
||||
md_entry = filesystem.store.get_file(
|
||||
filesystem.store.resolve_file_ref("dsid_md_extracted")
|
||||
)
|
||||
|
||||
assert "PageIndex PDF extracted alpha text" in pdf_request.text
|
||||
assert "Second PageIndex PDF extracted beta text" in pdf_request.text
|
||||
|
|
@ -176,10 +177,10 @@ def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_ft
|
|||
assert "PageIndex Markdown extracted gamma text" in md_request.text
|
||||
assert "CALLER MD CONTENT" not in md_request.text
|
||||
assert "PageIndex PDF extracted alpha text" in Path(
|
||||
pdf_stat["text_artifact_path"]
|
||||
pdf_entry.text_artifact_path
|
||||
).read_text(encoding="utf-8")
|
||||
assert "PageIndex Markdown extracted gamma text" in Path(
|
||||
md_stat["text_artifact_path"]
|
||||
md_entry.text_artifact_path
|
||||
).read_text(encoding="utf-8")
|
||||
assert [r.external_id for r in filesystem.search("alpha beta", limit=5)] == [
|
||||
"dsid_pdf_extracted"
|
||||
|
|
@ -207,7 +208,6 @@ def test_register_text_metadata_generation_keeps_caller_content_without_pageinde
|
|||
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/readme.txt",
|
||||
source_path="docs/readme.txt",
|
||||
external_id="dsid_text_generation",
|
||||
title="Text generation",
|
||||
content="Plain text caller content stays authoritative.",
|
||||
|
|
@ -215,11 +215,14 @@ def test_register_text_metadata_generation_keeps_caller_content_without_pageinde
|
|||
)
|
||||
|
||||
stat = filesystem.store.file_info("dsid_text_generation")
|
||||
entry = filesystem.store.get_file(
|
||||
filesystem.store.resolve_file_ref("dsid_text_generation")
|
||||
)
|
||||
|
||||
assert generator.calls[0][0].text == "Plain text caller content stays authoritative."
|
||||
assert stat["pageindex_doc_id"] is None
|
||||
assert stat["pageindex_tree_status"] == "not_built"
|
||||
assert Path(stat["text_artifact_path"]).read_text(
|
||||
assert Path(entry.text_artifact_path).read_text(
|
||||
encoding="utf-8"
|
||||
) == "Plain text caller content stays authoritative."
|
||||
|
||||
|
|
@ -261,14 +264,12 @@ def test_register_pdf_markdown_cache_miss_invokes_pageindex_client_index(monkeyp
|
|||
|
||||
filesystem.register_file(
|
||||
storage_uri=str(source_pdf),
|
||||
source_path="docs/report.pdf",
|
||||
external_id="dsid_pdf_build",
|
||||
title="PDF build",
|
||||
content="pdf text",
|
||||
)
|
||||
filesystem.register_file(
|
||||
storage_uri=source_md.as_uri(),
|
||||
source_path="docs/notes.md",
|
||||
external_id="dsid_md_build",
|
||||
title="Markdown build",
|
||||
content=source_md.read_text(encoding="utf-8"),
|
||||
|
|
@ -332,7 +333,6 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke
|
|||
monkeypatch.setattr(PageIndexClient, "index", fail_index)
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/report.pdf",
|
||||
external_id="dsid_structural_cached",
|
||||
title="Cached structural report",
|
||||
content="text artifact remains available for grep, not cat --all",
|
||||
|
|
@ -370,7 +370,6 @@ def test_cat_node_is_not_supported():
|
|||
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/notes.md",
|
||||
source_path="docs/notes.md",
|
||||
external_id="dsid_md_cached",
|
||||
title="Cached markdown notes",
|
||||
content="# Notes\n\nBody",
|
||||
|
|
@ -419,7 +418,6 @@ def test_cat_structure_page_and_text_outputs_are_hard_limited():
|
|||
)
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/report.pdf",
|
||||
external_id="dsid_limited_pdf",
|
||||
title="Limited structural report",
|
||||
content="text artifact remains available for grep",
|
||||
|
|
@ -427,7 +425,6 @@ def test_cat_structure_page_and_text_outputs_are_hard_limited():
|
|||
text_content = "\n".join(f"line {index}" for index in range(1, 106))
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/long.txt",
|
||||
source_path="docs/long.txt",
|
||||
external_id="dsid_long_text",
|
||||
title="Long text",
|
||||
content=text_content,
|
||||
|
|
@ -474,7 +471,6 @@ def test_tree_folder_behavior_is_preserved():
|
|||
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/report.txt",
|
||||
source_path="docs/report.txt",
|
||||
folder_path="/docs/reports",
|
||||
external_id="dsid_folder_tree",
|
||||
title="Folder report",
|
||||
|
|
@ -514,7 +510,6 @@ def test_tree_does_not_read_file_internal_pageindex_structure():
|
|||
)
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/report.pdf",
|
||||
external_id="dsid_tree_is_folder_only",
|
||||
title="Cached structural report",
|
||||
content="text artifact remains available",
|
||||
|
|
@ -536,28 +531,24 @@ def test_cat_all_is_limited_to_text_files():
|
|||
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/readme.txt",
|
||||
source_path="docs/readme.txt",
|
||||
external_id="dsid_text_file",
|
||||
title="Text readme",
|
||||
content="plain text body",
|
||||
)
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/report.pdf",
|
||||
source_path="docs/report.pdf",
|
||||
external_id="dsid_pdf_file",
|
||||
title="PDF report",
|
||||
content="extracted text should not be served through cat --all",
|
||||
)
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/notes.md",
|
||||
source_path="docs/notes.md",
|
||||
external_id="dsid_md_file",
|
||||
title="Markdown notes",
|
||||
content="markdown text should use PageIndex structure reads",
|
||||
)
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/data.json",
|
||||
source_path="docs/data.json",
|
||||
external_id="dsid_json_file",
|
||||
title="JSON record",
|
||||
content='{"body":"json"}',
|
||||
|
|
@ -589,7 +580,6 @@ def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown():
|
|||
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/readme.txt",
|
||||
source_path="docs/readme.txt",
|
||||
external_id="dsid_text_only",
|
||||
title="Text readme",
|
||||
content="plain text body",
|
||||
|
|
@ -617,7 +607,6 @@ def test_existing_pageindex_status_allows_legacy_record_without_format_suffix():
|
|||
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
||||
file_ref = filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="uploads/uploaded",
|
||||
external_id="dsid_legacy_pageindex",
|
||||
title="Legacy PageIndex record",
|
||||
content="text/plain is only a weak default here",
|
||||
|
|
@ -665,7 +654,6 @@ def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monke
|
|||
monkeypatch.setattr(PageIndexClient, "index", fail_index)
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/late.md",
|
||||
external_id="dsid_late_cache",
|
||||
title="Late cache",
|
||||
content=source.read_text(encoding="utf-8"),
|
||||
|
|
|
|||
|
|
@ -80,12 +80,13 @@ def test_add_text_folder_target_copies_artifact_indexes_summary_and_is_readable(
|
|||
|
||||
info = filesystem.add_file(str(source), "/documents/reports")
|
||||
|
||||
assert info["source_path"] == "documents/reports/filing.txt"
|
||||
assert info["path"] == "/documents/reports/filing.txt"
|
||||
assert info["folder_path"] == "/documents/reports"
|
||||
assert filesystem.folder_info("/documents/reports")["path"] == "/documents/reports"
|
||||
assert info["storage_uri"] != source.as_uri()
|
||||
assert "/artifacts/uploads/" in info["storage_uri"]
|
||||
copied_path = Path(info["storage_uri"].removeprefix("file://"))
|
||||
entry = filesystem.store.get_file(info["file_ref"])
|
||||
assert entry.storage_uri != source.as_uri()
|
||||
assert "/artifacts/uploads/" in entry.storage_uri
|
||||
copied_path = Path(entry.storage_uri.removeprefix("file://"))
|
||||
assert copied_path.read_text(encoding="utf-8") == "alpha filing text for pifs add"
|
||||
assert copied_path.resolve() != source.resolve()
|
||||
|
||||
|
|
@ -164,7 +165,7 @@ def test_add_configures_semantic_retrieval_in_same_filesystem_instance(tmp_path)
|
|||
recursive=True,
|
||||
page_size=5,
|
||||
)
|
||||
assert [item["source_path"] for item in results["data"]] == ["documents/semantic.txt"]
|
||||
assert [item["path"] for item in results["data"]] == ["/documents/semantic.txt"]
|
||||
|
||||
|
||||
def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monkeypatch):
|
||||
|
|
@ -205,10 +206,11 @@ def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monke
|
|||
info = filesystem.add_file(source, "/documents")
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
structure = json.loads(executor.execute("cat /documents/notes.md --structure"))
|
||||
entry = filesystem.store.get_file(info["file_ref"])
|
||||
|
||||
assert structure["data"]["available"] is True
|
||||
assert structure["data"]["structure"][0]["title"] == "Notes"
|
||||
assert indexed_paths == [Path(info["storage_uri"].removeprefix("file://"))]
|
||||
assert indexed_paths == [Path(entry.storage_uri.removeprefix("file://"))]
|
||||
assert indexed_paths[0].resolve() != source.resolve()
|
||||
|
||||
|
||||
|
|
@ -469,8 +471,6 @@ def test_cli_add_uses_workspace_and_prints_added_file(monkeypatch, capsys, tmp_p
|
|||
return {
|
||||
"file_ref": "file_cli",
|
||||
"path": "/documents/cli.txt",
|
||||
"source_path": "documents/cli.txt",
|
||||
"storage_uri": "file:///workspace/artifacts/uploads/file_cli/cli.txt",
|
||||
}
|
||||
|
||||
monkeypatch.setattr(cli, "PageIndexFileSystem", FakeAddFileSystem)
|
||||
|
|
@ -482,5 +482,4 @@ def test_cli_add_uses_workspace_and_prints_added_file(monkeypatch, capsys, tmp_p
|
|||
assert capsys.readouterr().out == (
|
||||
"added: /documents/cli.txt\n"
|
||||
"file_ref: file_cli\n"
|
||||
"storage_uri: file:///workspace/artifacts/uploads/file_cli/cli.txt\n"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -76,7 +76,6 @@ def test_cli_workspace_surfaces_projection_dimension_mismatch(tmp_path):
|
|||
file_ref="file_a",
|
||||
external_id="doc_a",
|
||||
source_type="documents",
|
||||
source_path="documents/a.pdf",
|
||||
title="A",
|
||||
text="summary",
|
||||
vector=[1.0, 0.0, 0.0],
|
||||
|
|
@ -226,6 +225,28 @@ def test_cli_ask_invokes_agent_with_question(monkeypatch, capsys, tmp_path):
|
|||
}
|
||||
|
||||
|
||||
def test_cli_ask_defaults_to_global_agent_model(monkeypatch, capsys, tmp_path):
|
||||
from pageindex.filesystem import cli
|
||||
|
||||
workspace = tmp_path / "workspace"
|
||||
agent_calls = []
|
||||
monkeypatch.delenv("PIFS_AGENT_MODEL", raising=False)
|
||||
monkeypatch.delenv("PIFS_MODEL", raising=False)
|
||||
|
||||
def fake_run_pifs_agent(filesystem, question, **kwargs):
|
||||
agent_calls.append(kwargs)
|
||||
return "agent answer"
|
||||
|
||||
monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
|
||||
monkeypatch.setattr(cli, "run_pifs_agent", fake_run_pifs_agent)
|
||||
|
||||
status = cli.main(["ask", "--workspace", str(workspace), "What?"])
|
||||
|
||||
assert status == 0
|
||||
assert capsys.readouterr().out == "agent answer\n"
|
||||
assert agent_calls[0]["model"] == "gpt-5.4"
|
||||
|
||||
|
||||
def test_cli_ask_loads_env_file_before_running_agent(monkeypatch, capsys, tmp_path):
|
||||
from pageindex.filesystem import cli
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,6 @@ def _register_find_fixture(tmp_path: Path):
|
|||
source.write_text(f"{title} fixture text", encoding="utf-8")
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path=f"docs/{filename}",
|
||||
folder_path=folder_path,
|
||||
external_id=external_id,
|
||||
title=title,
|
||||
|
|
@ -145,7 +144,6 @@ def test_stat_shell_output_includes_unified_metadata_status(tmp_path):
|
|||
)
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/source.txt",
|
||||
folder_path="/documents",
|
||||
external_id="doc_generated",
|
||||
title="Generated metadata document",
|
||||
|
|
@ -196,7 +194,6 @@ def test_stat_field_reads_one_metadata_field_across_multiple_targets(tmp_path):
|
|||
source.write_text(f"fixture text {index}", encoding="utf-8")
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path=f"docs/source{index}.txt",
|
||||
folder_path="/documents",
|
||||
external_id=f"doc_summary_{index}",
|
||||
title=f"Summary document {index}",
|
||||
|
|
@ -249,7 +246,6 @@ def test_stat_field_rejects_more_than_twenty_targets(tmp_path):
|
|||
source.write_text(f"fixture text {index}", encoding="utf-8")
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path=f"docs/source{index}.txt",
|
||||
folder_path="/documents",
|
||||
external_id=f"doc_{index}",
|
||||
title=f"Document {index}",
|
||||
|
|
@ -273,7 +269,6 @@ def test_register_rejects_pifs_owned_metadata_fields(tmp_path):
|
|||
with pytest.raises(ValueError, match="PIFS-owned generated field"):
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/source.txt",
|
||||
folder_path="/documents",
|
||||
external_id="doc_conflict",
|
||||
title="Conflict document",
|
||||
|
|
@ -299,7 +294,6 @@ def test_batch_metadata_status_generates_into_unified_metadata(tmp_path):
|
|||
)
|
||||
file_ref = filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/source.txt",
|
||||
folder_path="/documents",
|
||||
external_id="doc_batch",
|
||||
title="Batch document",
|
||||
|
|
|
|||
|
|
@ -14,7 +14,6 @@ def _register_file(
|
|||
source.write_text(f"{external_id} fixture text", encoding="utf-8")
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path=f"docs/{filename}",
|
||||
folder_path=folder_path,
|
||||
external_id=external_id,
|
||||
title=external_id,
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ def test_root_virtual_file_path_resolves_without_double_slash(tmp_path):
|
|||
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
|
||||
file_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/root-source.txt",
|
||||
source_path="sources/root-source.txt",
|
||||
folder_path="/",
|
||||
external_id="doc_root_title",
|
||||
title="Root Title",
|
||||
|
|
@ -17,13 +16,12 @@ def test_root_virtual_file_path_resolves_without_double_slash(tmp_path):
|
|||
assert filesystem.store.resolve_file_ref("/Root Title") == file_ref
|
||||
|
||||
|
||||
def test_ambiguous_virtual_file_path_raises_clear_error(tmp_path):
|
||||
def test_nested_virtual_file_path_resolves_by_folder_and_title(tmp_path):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
|
||||
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
|
||||
first_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/first.txt",
|
||||
source_path="b/file.txt",
|
||||
folder_path="/a",
|
||||
external_id="doc_first",
|
||||
title="First",
|
||||
|
|
@ -31,26 +29,23 @@ def test_ambiguous_virtual_file_path_raises_clear_error(tmp_path):
|
|||
)
|
||||
second_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/second.txt",
|
||||
source_path="second-source.txt",
|
||||
folder_path="/a/b",
|
||||
external_id="doc_second",
|
||||
title="file.txt",
|
||||
content="second content",
|
||||
)
|
||||
|
||||
with pytest.raises(KeyError, match="Ambiguous file target"):
|
||||
filesystem.store.resolve_file_ref("/a/b/file.txt")
|
||||
assert filesystem.store.resolve_file_ref("/a/b/file.txt") == second_ref
|
||||
|
||||
assert first_ref != second_ref
|
||||
|
||||
|
||||
def test_duplicate_source_path_target_raises_clear_error(tmp_path):
|
||||
def test_unknown_virtual_file_target_raises_clear_error(tmp_path):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
|
||||
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
|
||||
first_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/first.txt",
|
||||
source_path="shared/source.txt",
|
||||
folder_path="/first",
|
||||
external_id="doc_first",
|
||||
title="First",
|
||||
|
|
@ -58,14 +53,13 @@ def test_duplicate_source_path_target_raises_clear_error(tmp_path):
|
|||
)
|
||||
second_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/second.txt",
|
||||
source_path="shared/source.txt",
|
||||
folder_path="/second",
|
||||
external_id="doc_second",
|
||||
title="Second",
|
||||
content="second content",
|
||||
)
|
||||
|
||||
with pytest.raises(KeyError, match="Ambiguous file target"):
|
||||
filesystem.store.resolve_file_ref("/shared/source.txt")
|
||||
with pytest.raises(KeyError, match="Unknown file target"):
|
||||
filesystem.store.resolve_file_ref("/shared/missing.txt")
|
||||
|
||||
assert first_ref != second_ref
|
||||
|
|
|
|||
|
|
@ -40,7 +40,6 @@ def test_register_insert_failure_cleans_owned_artifacts_and_skips_projection(
|
|||
with pytest.raises(RuntimeError, match="catalog insert failed"):
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/source.txt",
|
||||
folder_path="/documents",
|
||||
external_id="doc_insert_failure",
|
||||
title="Insert failure",
|
||||
|
|
|
|||
|
|
@ -31,7 +31,6 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path):
|
|||
file_ref="file_a",
|
||||
external_id="doc_a",
|
||||
source_type="github",
|
||||
source_path="github/a.json",
|
||||
title="Multipart upload limits",
|
||||
text="multipart upload limits",
|
||||
vector=[1.0, 0.0, 0.0],
|
||||
|
|
@ -41,7 +40,6 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path):
|
|||
file_ref="file_b",
|
||||
external_id="doc_b",
|
||||
source_type="slack",
|
||||
source_path="slack/b.json",
|
||||
title="GPU cache issue",
|
||||
text="gpu cache issue",
|
||||
vector=[0.0, 1.0, 0.0],
|
||||
|
|
@ -72,7 +70,6 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm
|
|||
file_ref=f"file_off_{item:02d}",
|
||||
external_id=f"doc_off_{item:02d}",
|
||||
source_type="documents",
|
||||
source_path=f"other/{item:02d}.pdf",
|
||||
title=f"Off scope {item:02d}",
|
||||
text="off scope",
|
||||
vector=[1.0, 0.0],
|
||||
|
|
@ -84,7 +81,6 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm
|
|||
file_ref="file_in_scope",
|
||||
external_id="doc_in_scope",
|
||||
source_type="documents",
|
||||
source_path="documents/in-scope.pdf",
|
||||
title="In scope",
|
||||
text="in scope",
|
||||
vector=[0.0, 1.0],
|
||||
|
|
@ -117,7 +113,6 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
|||
"file_ref": "file_a",
|
||||
"external_id": "doc_a",
|
||||
"source_type": "documents",
|
||||
"source_path": "docs/a.pdf",
|
||||
"title": "A",
|
||||
"metadata": {
|
||||
"summary": "Unified metadata summary.",
|
||||
|
|
@ -153,7 +148,6 @@ def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
|
|||
"file_ref": "file_a",
|
||||
"external_id": "doc_a",
|
||||
"source_type": "documents",
|
||||
"source_path": "docs/a.pdf",
|
||||
"title": "A",
|
||||
"metadata": {"summary": "Default dimension summary."},
|
||||
}
|
||||
|
|
@ -180,7 +174,6 @@ def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
|
|||
"file_ref": "file_a",
|
||||
"external_id": "doc_a",
|
||||
"source_type": "documents",
|
||||
"source_path": "docs/a.pdf",
|
||||
"title": "A",
|
||||
"metadata": {"summary": "Explicit 256 dimension summary."},
|
||||
}
|
||||
|
|
@ -304,7 +297,6 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path
|
|||
file_ref="file_a",
|
||||
external_id="doc_a",
|
||||
source_type="documents",
|
||||
source_path="docs/a.pdf",
|
||||
title="A",
|
||||
text="summary",
|
||||
vector=[1.0, 0.0, 0.0],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue