mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-24 20:28:12 +02:00
feat(filesystem): add PageIndex filesystem shell
This commit is contained in:
parent
7592163e2a
commit
74d0600261
24 changed files with 11373 additions and 4 deletions
60
tests/test_pageindex_filesystem_scope.py
Normal file
60
tests/test_pageindex_filesystem_scope.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
import json
|
||||
from types import SimpleNamespace
|
||||
|
||||
|
||||
class SummaryBackend:
    """Stub semantic backend that offers a single ``summary`` channel.

    Each ``search_channel`` request is appended to ``calls`` as a
    ``(channel, query, filters)`` tuple and answered with one canned hit for
    the document id given at construction.
    """

    def __init__(self, document_id):
        self.calls = []
        self.document_id = document_id

    def available_channels(self):
        """Return the channel names this stub exposes."""
        channels = ("summary",)
        return channels

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the request and return one canned hit; ``limit`` is not used."""
        self.calls.append((channel, query, filters))
        hit = SimpleNamespace(
            document_id=self.document_id,
            snippet=f"summary candidate: {query}",
        )
        return [hit]
|
||||
|
||||
|
||||
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
    """Scoping ``search-summary`` to an ordinary folder sends empty filters.

    The registered file carries a ``source_type`` metadata value, yet the
    backend is expected to receive ``{}`` as filters for the ``/documents`` scope.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    fs = PageIndexFileSystem(workspace=tmp_path / "workspace")
    registration = {
        "storage_uri": "file:///tmp/report.pdf",
        "source_path": "examples/documents/report.pdf",
        "folder_path": "/documents",
        "external_id": "dsid_report",
        "title": "Annual report",
        "metadata": {"source_type": "examples-documents"},
        "content": "Federal Reserve supervision and regulation annual report.",
    }
    fs.register_file(**registration)

    stub_backend = SummaryBackend("dsid_report")
    fs.semantic_retrieval_backend = stub_backend
    shell = PIFSCommandExecutor(fs, json_output=True)

    raw_output = shell.execute('search-summary "Federal Reserve annual report" /documents')
    payload = json.loads(raw_output)

    # Index 2 of each recorded call tuple is the ``filters`` argument.
    first_call_filters = stub_backend.calls[0][2]
    assert first_call_filters == {}
    assert payload["data"]["data"][0]["external_id"] == "dsid_report"
|
||||
|
||||
|
||||
def test_semantic_search_scope_filters_explicit_source_type_facets():
    """Only explicit ``source_type=`` path facets are turned into filters."""
    from pageindex.filesystem import PageIndexFileSystem

    # (folder_path given as scope, filters expected back)
    expectations = (
        ("/source_type=google-drive", {"source_type": "google_drive"}),
        ("/semantic/source_type=google-drive", {"source_type": "google_drive"}),
        ("/documents", {}),
    )
    for folder_path, expected_filters in expectations:
        scope = {"folder_path": folder_path}
        assert PageIndexFileSystem._semantic_filters_for_scope(scope) == expected_filters
|
||||
|
||||
632
tests/test_pageindex_structural_read.py
Normal file
632
tests/test_pageindex_structural_read.py
Normal file
|
|
@ -0,0 +1,632 @@
|
|||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def write_pageindex_client_doc(workspace: Path, doc_id: str, doc: dict) -> None:
    """Write a document into a client workspace directory and index it in ``_meta.json``.

    Two files are written under ``workspace`` (created if missing):

    * ``<doc_id>.json`` - the full ``doc`` payload.
    * ``_meta.json`` - a mapping of document id to a short summary entry
      (``type``, ``doc_name``, ``doc_description``, ``path`` plus ``page_count``
      for ``pdf`` documents or ``line_count`` for ``md`` documents).

    Entries already present in ``_meta.json`` for other documents are kept, so
    several documents can be written into the same workspace one after another.

    Args:
        workspace: Directory that receives the files.
        doc_id: Identifier used for the file name and the meta key.
        doc: Document payload; missing summary keys default to ``""``.
    """
    workspace.mkdir(parents=True, exist_ok=True)
    (workspace / f"{doc_id}.json").write_text(
        json.dumps(doc, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    entry = {
        "type": doc.get("type", ""),
        "doc_name": doc.get("doc_name", ""),
        "doc_description": doc.get("doc_description", ""),
        "path": doc.get("path", ""),
    }
    doc_type = doc.get("type")
    if doc_type == "pdf":
        entry["page_count"] = doc.get("page_count")
    elif doc_type == "md":
        entry["line_count"] = doc.get("line_count")

    meta_path = workspace / "_meta.json"
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError):
        # No usable meta file yet: start from an empty index.
        meta = {}
    if not isinstance(meta, dict):
        meta = {}

    # Merge instead of overwrite so earlier documents stay listed.
    meta[doc_id] = entry
    meta_path.write_text(
        json.dumps(meta, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
|
||||
|
||||
|
||||
class RecordingMetadataGenerator:
    """Metadata generator stub that records requests and answers from a fixed table."""

    # Canned value returned for each field name this stub knows about.
    values = {
        "summary": "Generated retrieval summary.",
        "doc_type": "technical_note",
        "domain": "documentation",
        "topic": "pageindex extraction",
    }

    def __init__(self):
        self.calls = []

    def generate(self, request, *, fields):
        """Record ``(request, list(fields))`` and return canned values for known fields.

        Field names that are absent from ``values`` are left out of the result.
        """
        self.calls.append((request, list(fields)))
        generated = {}
        for field in fields:
            if field in self.values:
                generated[field] = self.values[field]
        return generated
|
||||
|
||||
|
||||
def test_pageindex_structure_options_report_failed_register_build(monkeypatch):
    """Structural reads report an unavailable, failed build when indexing raised at registration."""
    from pageindex import PageIndexClient
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    def fail_index(*args, **kwargs):
        raise RuntimeError("index failed")

    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        md_source = scratch_dir / "report.md"
        md_source.write_text("# Report\n\nCached structure is not built yet.", encoding="utf-8")
        fs = PageIndexFileSystem(workspace=scratch_dir / "workspace")

        # Patch only after the filesystem exists, mirroring the registration-time failure.
        monkeypatch.setattr(PageIndexClient, "index", fail_index)
        fs.register_file(
            storage_uri=md_source.as_uri(),
            source_path="docs/report.md",
            external_id="dsid_structural_missing",
            title="Structural report",
            content=md_source.read_text(encoding="utf-8"),
        )
        shell = PIFSCommandExecutor(fs, json_output=True)

        def run(command):
            return json.loads(shell.execute(command))

        structure = run("cat --structure dsid_structural_missing")
        node = run("cat --node 0001 dsid_structural_missing")
        pages = run("cat --page 1-2 dsid_structural_missing")
        stat = run("stat dsid_structural_missing")

        structure_data = structure["data"]
        assert structure_data["mode"] == "structure"
        assert structure_data["available"] is False
        assert structure_data["status"] == "failed"
        assert "PageIndexClient workspace" in structure_data["message"]
        assert stat["data"]["pageindex_tree_status"] == "failed"

        node_data = node["data"]
        assert node_data["mode"] == "node"
        assert node_data["available"] is False
        assert node_data["node_id"] == "0001"

        page_data = pages["data"]
        assert page_data["mode"] == "page"
        assert page_data["available"] is False
        assert page_data["pages"] == "1-2"

        # Mutating commands are not part of the allowed command set.
        assert "cp" not in shell.allowed_commands()
        assert "mkdir" not in shell.allowed_commands()
|
||||
|
||||
|
||||
def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_fts(monkeypatch):
    """PDF/Markdown registration feeds indexed text, not caller content, downstream.

    The metadata generator request, the text artifact on disk and full-text
    search are all checked against the text produced by the patched ``index``.
    """
    from pageindex import PageIndexClient
    from pageindex.filesystem import PageIndexFileSystem

    def fake_index(self, file_path, mode="auto"):
        # Stand-in for PageIndexClient.index: persists a canned document per suffix.
        suffix = Path(file_path).suffix.lower()
        doc_id = f"doc_{suffix.lstrip('.')}"
        is_pdf = suffix == ".pdf"
        doc = {
            "id": doc_id,
            "type": "pdf" if is_pdf else "md",
            "path": str(Path(file_path).resolve()),
            "doc_name": "report.pdf" if is_pdf else "notes",
            "doc_description": "",
        }
        if is_pdf:
            doc["page_count"] = 2
            doc["structure"] = [{"title": "Report", "node_id": "0001", "nodes": []}]
            doc["pages"] = [
                {"page": 1, "content": "PageIndex PDF extracted alpha text."},
                {"page": 2, "content": "Second PageIndex PDF extracted beta text."},
            ]
        else:
            doc["line_count"] = 3
            doc["structure"] = [
                {
                    "title": "Notes",
                    "node_id": "0001",
                    "line_num": 1,
                    "text": "# Notes\n\nPageIndex Markdown extracted gamma text.",
                    "nodes": [],
                }
            ]
        write_pageindex_client_doc(self.workspace, doc_id, doc)
        self.documents[doc_id] = doc
        return doc_id

    monkeypatch.setattr(PageIndexClient, "index", fake_index)
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        pdf_path = scratch_dir / "report.pdf"
        md_path = scratch_dir / "notes.md"
        pdf_path.write_bytes(b"%PDF-1.4\n% test fixture\n")
        md_path.write_text("# Notes\n\nCaller markdown content", encoding="utf-8")
        recorder = RecordingMetadataGenerator()
        fs = PageIndexFileSystem(
            workspace=scratch_dir / "workspace",
            metadata_generator=recorder,
        )

        fs.register_file(
            storage_uri=pdf_path.as_uri(),
            source_path="docs/report.pdf",
            external_id="dsid_pdf_extracted",
            title="PDF extracted",
            content="CALLER PDF CONTENT MUST NOT REACH GENERATOR",
        )
        fs.register_file(
            storage_uri=md_path.as_uri(),
            source_path="docs/notes.md",
            external_id="dsid_md_extracted",
            title="Markdown extracted",
            content="CALLER MD CONTENT MUST NOT REACH GENERATOR",
        )

        # calls[i] is (request, fields); only the request is inspected here.
        pdf_request, md_request = recorder.calls[0][0], recorder.calls[1][0]
        pdf_stat = fs.store.file_info("dsid_pdf_extracted")
        md_stat = fs.store.file_info("dsid_md_extracted")

        assert "PageIndex PDF extracted alpha text" in pdf_request.text
        assert "Second PageIndex PDF extracted beta text" in pdf_request.text
        assert "CALLER PDF CONTENT" not in pdf_request.text
        assert "PageIndex Markdown extracted gamma text" in md_request.text
        assert "CALLER MD CONTENT" not in md_request.text

        pdf_artifact_text = Path(pdf_stat["text_artifact_path"]).read_text(encoding="utf-8")
        assert "PageIndex PDF extracted alpha text" in pdf_artifact_text
        md_artifact_text = Path(md_stat["text_artifact_path"]).read_text(encoding="utf-8")
        assert "PageIndex Markdown extracted gamma text" in md_artifact_text

        pdf_hits = [r.external_id for r in fs.search("alpha beta", limit=5)]
        assert pdf_hits == ["dsid_pdf_extracted"]
        md_hits = [r.external_id for r in fs.search("gamma", limit=5)]
        assert md_hits == ["dsid_md_extracted"]
        assert fs.search("CALLER", limit=5) == []
|
||||
|
||||
|
||||
def test_register_text_metadata_generation_keeps_caller_content_without_pageindex(monkeypatch):
    """Plain-text registration keeps the caller's content and never calls ``index``."""
    from pageindex import PageIndexClient
    from pageindex.filesystem import PageIndexFileSystem

    caller_text = "Plain text caller content stays authoritative."

    def fail_index(*args, **kwargs):
        raise AssertionError("PageIndexClient.index should not be called for text files")

    monkeypatch.setattr(PageIndexClient, "index", fail_index)
    with tempfile.TemporaryDirectory() as scratch:
        recorder = RecordingMetadataGenerator()
        fs = PageIndexFileSystem(
            workspace=Path(scratch) / "workspace",
            metadata_generator=recorder,
        )

        fs.register_file(
            storage_uri="file:///tmp/readme.txt",
            source_path="docs/readme.txt",
            external_id="dsid_text_generation",
            title="Text generation",
            content=caller_text,
            content_type="text/plain",
        )

        info = fs.store.file_info("dsid_text_generation")

        first_request = recorder.calls[0][0]
        assert first_request.text == "Plain text caller content stays authoritative."
        assert info["pageindex_doc_id"] is None
        assert info["pageindex_tree_status"] == "not_built"
        artifact_text = Path(info["text_artifact_path"]).read_text(encoding="utf-8")
        assert artifact_text == "Plain text caller content stays authoritative."
|
||||
|
||||
|
||||
def test_register_pdf_markdown_cache_miss_invokes_pageindex_client_index(monkeypatch):
    """Registering PDF/Markdown without a cached document calls ``index`` once per file."""
    from pageindex import PageIndexClient
    from pageindex.filesystem import PageIndexFileSystem

    calls: list[str] = []

    def fake_index(self, file_path, mode="auto"):
        # Stand-in for PageIndexClient.index: logs the path and persists a minimal document.
        calls.append(str(file_path))
        indexed = Path(file_path)
        doc_id = f"doc_{indexed.suffix.lstrip('.')}"
        doc_type = "pdf" if indexed.suffix == ".pdf" else "md"
        doc = {
            "id": doc_id,
            "type": doc_type,
            "path": str(indexed.resolve()),
            "doc_name": indexed.name,
            "doc_description": "",
            "structure": [{"title": indexed.stem, "node_id": "0001", "nodes": []}],
        }
        if doc_type == "pdf":
            doc["page_count"] = 1
            doc["pages"] = [{"page": 1, "content": "Page one text"}]
        else:
            doc["line_count"] = 1
        write_pageindex_client_doc(self.workspace, doc_id, doc)
        self.documents[doc_id] = doc
        return doc_id

    monkeypatch.setattr(PageIndexClient, "index", fake_index)
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        pdf_path = scratch_dir / "report.pdf"
        md_path = scratch_dir / "notes.md"
        pdf_path.write_bytes(b"%PDF-1.4\n% test fixture\n")
        md_path.write_text("# Notes", encoding="utf-8")
        fs = PageIndexFileSystem(workspace=scratch_dir / "workspace")

        # The PDF is registered by plain path, the Markdown file by file:// URI.
        fs.register_file(
            storage_uri=str(pdf_path),
            source_path="docs/report.pdf",
            external_id="dsid_pdf_build",
            title="PDF build",
            content="pdf text",
        )
        fs.register_file(
            storage_uri=md_path.as_uri(),
            source_path="docs/notes.md",
            external_id="dsid_md_build",
            title="Markdown build",
            content=md_path.read_text(encoding="utf-8"),
        )

        pdf_info = fs.store.file_info("dsid_pdf_build")
        md_info = fs.store.file_info("dsid_md_build")

        expected_calls = [str(pdf_path.resolve()), str(md_path.resolve())]
        assert calls == expected_calls
        assert pdf_info["pageindex_doc_id"] == "doc_pdf"
        assert pdf_info["pageindex_tree_status"] == "built"
        assert md_info["pageindex_doc_id"] == "doc_md"
        assert md_info["pageindex_tree_status"] == "built"
|
||||
|
||||
|
||||
def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monkeypatch):
    """A document already present in the client workspace is reused without calling ``index``.

    ``cat --structure`` is expected to omit node ``text`` and ``cat --page``
    to join the requested pages with a blank line.
    """
    from pageindex import PageIndexClient
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    def fail_index(*args, **kwargs):
        raise AssertionError("PageIndexClient.index should not be called on cache hit")

    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        pdf_path = scratch_dir / "report.pdf"
        pdf_path.write_bytes(b"%PDF-1.4\n% test fixture\n")
        workspace = scratch_dir / "workspace"
        fs = PageIndexFileSystem(workspace=workspace)

        findings_node = {
            "title": "Findings",
            "node_id": "0002",
            "physical_index": 2,
            "nodes": [],
        }
        intro_node = {
            "title": "Introduction",
            "node_id": "0001",
            "text": "Intro section text",
            "nodes": [findings_node],
        }
        cached_doc = {
            "id": "doc_cached_pdf",
            "type": "pdf",
            "path": str(pdf_path.resolve()),
            "doc_name": "report.pdf",
            "doc_description": "",
            "page_count": 2,
            "structure": [intro_node],
            "pages": [
                {"page": 1, "content": "Page one text"},
                {"page": 2, "content": "Page two text"},
            ],
        }
        write_pageindex_client_doc(fs.pageindex_client_workspace, "doc_cached_pdf", cached_doc)

        monkeypatch.setattr(PageIndexClient, "index", fail_index)
        fs.register_file(
            storage_uri=pdf_path.as_uri(),
            source_path="docs/report.pdf",
            external_id="dsid_structural_cached",
            title="Cached structural report",
            content="text artifact remains available for grep, not cat --all",
        )
        shell = PIFSCommandExecutor(fs, json_output=True)

        def run(command):
            return json.loads(shell.execute(command))

        structure = run("cat --structure dsid_structural_cached")
        pages = run("cat --page 1-2 dsid_structural_cached")
        stat = run("stat dsid_structural_cached")

        structure_data = structure["data"]
        assert structure_data["available"] is True
        assert structure_data["pageindex_doc_id"] == "doc_cached_pdf"
        root = structure_data["structure"][0]
        assert root["title"] == "Introduction"
        assert "text" not in root
        assert "text" not in root["nodes"][0]

        assert pages["data"]["available"] is True
        assert pages["data"]["text"] == "Page one text\n\nPage two text"

        assert stat["data"]["pageindex_doc_id"] == "doc_cached_pdf"
        assert stat["data"]["pageindex_tree_status"] == "built"
|
||||
|
||||
|
||||
def test_cat_node_reads_pageindex_client_structure_without_custom_pifs_artifact():
    """``cat --node`` returns node text from the cached client document, kept apart from node metadata."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        md_path = scratch_dir / "notes.md"
        md_path.write_text("# Notes\n\nBody", encoding="utf-8")
        fs = PageIndexFileSystem(workspace=scratch_dir / "workspace")

        notes_node = {
            "title": "Notes",
            "node_id": "0001",
            "line_num": 1,
            "text": "# Notes\n\nBody",
            "nodes": [],
        }
        cached_doc = {
            "id": "doc_cached_md",
            "type": "md",
            "path": str(md_path.resolve()),
            "doc_name": "notes",
            "doc_description": "",
            "line_count": 3,
            "structure": [notes_node],
        }
        write_pageindex_client_doc(fs.pageindex_client_workspace, "doc_cached_md", cached_doc)
        fs.register_file(
            storage_uri=md_path.as_uri(),
            source_path="docs/notes.md",
            external_id="dsid_md_cached",
            title="Cached markdown notes",
            content=md_path.read_text(encoding="utf-8"),
        )
        shell = PIFSCommandExecutor(fs, json_output=True)

        response = json.loads(shell.execute("cat --node 0001 dsid_md_cached"))
        node_data = response["data"]

        assert node_data["available"] is True
        assert node_data["pageindex_doc_id"] == "doc_cached_md"
        assert node_data["node"]["title"] == "Notes"
        assert node_data["text"] == "# Notes\n\nBody"
        assert "text" not in node_data["node"]
|
||||
|
||||
|
||||
def test_tree_folder_behavior_is_preserved():
    """``tree`` on a folder path lists its sub-folders."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    with tempfile.TemporaryDirectory() as scratch:
        fs = PageIndexFileSystem(workspace=Path(scratch) / "workspace")
        registration = {
            "storage_uri": "file:///tmp/report.txt",
            "source_path": "docs/report.txt",
            "folder_path": "/docs/reports",
            "external_id": "dsid_folder_tree",
            "title": "Folder report",
            "content": "folder tree behavior remains intact",
        }
        fs.register_file(**registration)
        shell = PIFSCommandExecutor(fs, json_output=True)

        listing = json.loads(shell.execute("tree /docs --depth 2"))
        tree_data = listing["data"]

        assert tree_data["path"] == "/docs"
        assert tree_data["folders"][0]["path"] == "/docs/reports"
|
||||
|
||||
|
||||
def test_tree_does_not_read_file_internal_pageindex_structure():
    """``tree`` rejects a file id, while ``cat --structure`` still serves that file's structure."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        pdf_path = scratch_dir / "report.pdf"
        pdf_path.write_bytes(b"%PDF-1.4\n% test fixture\n")
        fs = PageIndexFileSystem(workspace=scratch_dir / "workspace")

        cached_doc = {
            "id": "doc_tree_is_folder_only",
            "type": "pdf",
            "path": str(pdf_path.resolve()),
            "doc_name": "report.pdf",
            "doc_description": "",
            "page_count": 1,
            "structure": [
                {"title": "Introduction", "node_id": "0001", "nodes": []}
            ],
            "pages": [{"page": 1, "content": "Page one text"}],
        }
        write_pageindex_client_doc(
            fs.pageindex_client_workspace,
            "doc_tree_is_folder_only",
            cached_doc,
        )
        fs.register_file(
            storage_uri=pdf_path.as_uri(),
            source_path="docs/report.pdf",
            external_id="dsid_tree_is_folder_only",
            title="Cached structural report",
            content="text artifact remains available",
        )
        shell = PIFSCommandExecutor(fs, json_output=True)

        with pytest.raises(PIFSCommandError):
            shell.execute("tree dsid_tree_is_folder_only")

        raw_structure = shell.execute("cat --structure dsid_tree_is_folder_only")
        structure = json.loads(raw_structure)
        assert structure["data"]["structure"][0]["title"] == "Introduction"
|
||||
|
||||
|
||||
def test_cat_all_is_limited_to_text_files():
    """Whole-file and line-oriented reads are only served for plain text records.

    ``cat --all`` succeeds for the text file and is rejected for PDF, Markdown
    and JSON records; ``filesystem.open`` rejects PDF/Markdown but still
    returns the JSON text; ``head``/``tail``/``sed`` are rejected for
    PDF/Markdown.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    text_only = "only supported for txt/text files"
    no_structured_open = "not supported for PDF/Markdown"

    registrations = (
        {
            "storage_uri": "file:///tmp/readme.txt",
            "source_path": "docs/readme.txt",
            "external_id": "dsid_text_file",
            "title": "Text readme",
            "content": "plain text body",
        },
        {
            "storage_uri": "file:///tmp/report.pdf",
            "source_path": "docs/report.pdf",
            "external_id": "dsid_pdf_file",
            "title": "PDF report",
            "content": "extracted text should not be served through cat --all",
        },
        {
            "storage_uri": "file:///tmp/notes.md",
            "source_path": "docs/notes.md",
            "external_id": "dsid_md_file",
            "title": "Markdown notes",
            "content": "markdown text should use PageIndex structure reads",
        },
        {
            "storage_uri": "file:///tmp/data.json",
            "source_path": "docs/data.json",
            "external_id": "dsid_json_file",
            "title": "JSON record",
            "content": '{"body":"json"}',
            "content_type": "application/json",
        },
    )

    with tempfile.TemporaryDirectory() as scratch:
        fs = PageIndexFileSystem(workspace=Path(scratch) / "workspace")
        for registration in registrations:
            fs.register_file(**registration)
        shell = PIFSCommandExecutor(fs, json_output=True)

        text_response = json.loads(shell.execute("cat --all dsid_text_file"))
        assert text_response["data"]["text"] == "plain text body"

        with pytest.raises(PIFSCommandError, match=text_only):
            shell.execute("cat --all dsid_pdf_file")
        with pytest.raises(ValueError, match=no_structured_open):
            fs.open("dsid_pdf_file")
        with pytest.raises(PIFSCommandError, match=text_only):
            shell.execute("cat --all dsid_md_file")
        with pytest.raises(ValueError, match=no_structured_open):
            fs.open("dsid_md_file")
        with pytest.raises(PIFSCommandError, match=text_only):
            shell.execute("cat --all dsid_json_file")
        assert fs.open("dsid_json_file").text == '{"body":"json"}'

        line_oriented_commands = (
            "head dsid_pdf_file",
            "tail dsid_pdf_file",
            "sed -n 1,1p dsid_pdf_file",
            "head dsid_md_file",
            "tail dsid_md_file",
            "sed -n 1,1p dsid_md_file",
        )
        for command in line_oriented_commands:
            with pytest.raises(PIFSCommandError, match=text_only):
                shell.execute(command)
|
||||
|
||||
|
||||
def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown():
    """``cat --structure/--page/--node`` are rejected for a plain text record."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    structural_commands = (
        "cat --structure dsid_text_only",
        "cat --page 1 dsid_text_only",
        "cat --node 0001 dsid_text_only",
    )

    with tempfile.TemporaryDirectory() as scratch:
        fs = PageIndexFileSystem(workspace=Path(scratch) / "workspace")
        registration = {
            "storage_uri": "file:///tmp/readme.txt",
            "source_path": "docs/readme.txt",
            "external_id": "dsid_text_only",
            "title": "Text readme",
            "content": "plain text body",
        }
        fs.register_file(**registration)
        shell = PIFSCommandExecutor(fs, json_output=True)

        for command in structural_commands:
            with pytest.raises(PIFSCommandError, match="only supported for PDF/Markdown"):
                shell.execute(command)
|
||||
|
||||
|
||||
def test_existing_pageindex_status_allows_legacy_record_without_format_suffix():
    """A suffix-less record with a stored ``built`` pointer is served structurally.

    After the pointer is set, ``cat --structure`` works and ``cat --all`` is
    rejected for the same record.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        upload_path = scratch_dir / "uploaded"
        upload_path.write_text("# Uploaded\n\nBody", encoding="utf-8")
        fs = PageIndexFileSystem(workspace=scratch_dir / "workspace")
        file_ref = fs.register_file(
            storage_uri=upload_path.as_uri(),
            source_path="uploads/uploaded",
            external_id="dsid_legacy_pageindex",
            title="Legacy PageIndex record",
            content="text/plain is only a weak default here",
        )

        legacy_doc = {
            "id": "doc_legacy_pageindex",
            "type": "md",
            "path": str(upload_path.resolve()),
            "doc_name": "uploaded",
            "doc_description": "",
            "line_count": 3,
            "structure": [
                {"title": "Uploaded", "node_id": "0001", "text": "Body", "nodes": []}
            ],
        }
        write_pageindex_client_doc(
            fs.pageindex_client_workspace,
            "doc_legacy_pageindex",
            legacy_doc,
        )
        # Link the record to the cached document by hand, as a legacy record would be.
        fs.store.update_pageindex_pointer(
            file_ref,
            pageindex_doc_id="doc_legacy_pageindex",
            pageindex_tree_status="built",
        )
        shell = PIFSCommandExecutor(fs, json_output=True)

        raw_structure = shell.execute("cat --structure dsid_legacy_pageindex")
        structure = json.loads(raw_structure)
        assert structure["data"]["structure"][0]["title"] == "Uploaded"
        with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
            shell.execute("cat --all dsid_legacy_pageindex")
|
||||
|
||||
|
||||
def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monkeypatch):
    """A cache entry written after a failed registration is not picked up by read commands."""
    from pageindex import PageIndexClient
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    def fail_index(*args, **kwargs):
        raise RuntimeError("index failed")

    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        md_path = scratch_dir / "late.md"
        md_path.write_text("# Late\n\nBody", encoding="utf-8")
        fs = PageIndexFileSystem(workspace=scratch_dir / "workspace")

        monkeypatch.setattr(PageIndexClient, "index", fail_index)
        fs.register_file(
            storage_uri=md_path.as_uri(),
            source_path="docs/late.md",
            external_id="dsid_late_cache",
            title="Late cache",
            content=md_path.read_text(encoding="utf-8"),
        )

        # The cached document appears only after registration has already failed.
        late_doc = {
            "id": "doc_late_cache",
            "type": "md",
            "path": str(md_path.resolve()),
            "doc_name": "late",
            "doc_description": "",
            "line_count": 3,
            "structure": [
                {"title": "Late", "node_id": "0001", "text": "Body", "nodes": []}
            ],
        }
        write_pageindex_client_doc(fs.pageindex_client_workspace, "doc_late_cache", late_doc)
        shell = PIFSCommandExecutor(fs, json_output=True)

        structure = json.loads(shell.execute("cat --structure dsid_late_cache"))
        stat = json.loads(shell.execute("stat dsid_late_cache"))

        assert structure["data"]["available"] is False
        stat_data = stat["data"]
        assert stat_data["pageindex_doc_id"] is None
        assert stat_data["pageindex_tree_status"] == "failed"
|
||||
185
tests/test_pifs_agent_stream.py
Normal file
185
tests/test_pifs_agent_stream.py
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
import io
|
||||
import os
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
from pageindex.filesystem.agent import (
|
||||
PIFSAgentStreamObserver,
|
||||
build_agent_model_settings,
|
||||
normalize_agent_stream_mode,
|
||||
normalize_reasoning_effort,
|
||||
normalize_reasoning_summary,
|
||||
pifs_agent_raw_reasoning_enabled,
|
||||
serialize_agent_final_output,
|
||||
should_disable_pifs_agent_tracing,
|
||||
should_use_openai_compatible_chat_model,
|
||||
)
|
||||
|
||||
|
||||
# Pydantic model with two required fields; unknown fields are rejected.
# Documented with comments rather than a docstring so the model's own
# description stays unset.
class StructuredAnswer(BaseModel):
    # extra="forbid": reject any field not declared below.
    model_config = ConfigDict(extra="forbid")

    answer: str
    document_ids: list[str]
|
||||
|
||||
|
||||
class PIFSAgentStreamTest(unittest.TestCase):
|
||||
def raw_event(self, event_type, delta):
    """Build a stand-in ``raw_response_event`` whose ``data`` carries one typed delta."""
    payload = SimpleNamespace(type=event_type, delta=delta)
    return SimpleNamespace(type="raw_response_event", data=payload)
|
||||
|
||||
def test_model_stream_prints_output_and_think_deltas(self):
    """In ``model`` mode both summary and output deltas are printed and logged."""
    sink = io.StringIO()
    recorded = []
    observer = PIFSAgentStreamObserver("model", stream_log=recorded, output=sink)

    deltas = (
        ("response.reasoning_summary_text.delta", "look up folder"),
        ("response.output_text.delta", '{"answer":'),
        ("response.output_text.delta", '"done"}'),
    )
    for event_type, delta in deltas:
        observer.handle_event(self.raw_event(event_type, delta))
    observer.finish()

    printed = sink.getvalue()
    for expected in (
        "[llm reasoning summary stream]",
        "look up folder",
        "[llm final output stream]",
    ):
        self.assertIn(expected, printed)
    # Newlines are stripped before matching the JSON text.
    self.assertIn('{"answer":"done"}', printed.replace("\n", ""))

    expected_log = [
        {"kind": "output", "text": '{"answer":"done"}'},
        {"kind": "think_summary", "text": "look up folder"},
    ]
    self.assertEqual(recorded, expected_log)
|
||||
|
||||
def test_tools_mode_does_not_print_model_text(self):
    """In ``tools`` mode model text stays hidden while tool calls and results are shown."""
    sink = io.StringIO()
    recorded = []
    observer = PIFSAgentStreamObserver("tools", stream_log=recorded, output=sink)

    text_delta = self.raw_event("response.output_text.delta", "hidden from tools mode")
    observer.handle_event(text_delta)
    args_delta = self.raw_event(
        "response.function_call_arguments.delta", '{"command":"ls /"}'
    )
    observer.handle_event(args_delta)
    observer.emit_tool_call("ls /")
    observer.emit_tool_result(ok=True, output='{"ok": true}', seconds=0.001)
    observer.finish()

    printed = sink.getvalue()
    self.assertNotIn("hidden from tools mode", printed)
    for expected in (
        "[llm -> pifs command]",
        "ls /",
        "[pifs -> llm result preview]",
        '{"ok": true}',
    ):
        self.assertIn(expected, printed)

    self.assertEqual(recorded[0], {"kind": "tool_call", "command": "ls /"})
    self.assertEqual(recorded[1]["kind"], "tool_result")
    self.assertEqual(recorded[2], {"kind": "tool_args", "text": '{"command":"ls /"}'})
|
||||
|
||||
def test_tool_result_preview_compacts_large_outputs(self):
    """A 50-line tool result is previewed from the top and its tail is omitted."""
    sink = io.StringIO()
    observer = PIFSAgentStreamObserver("tools", output=sink)

    long_output = "\n".join(f"line {index}" for index in range(50))
    observer.emit_tool_result(ok=True, output=long_output, seconds=0.001)

    printed = sink.getvalue()
    self.assertIn("[large PIFS result", printed)
    self.assertIn("line 0", printed)
    self.assertIn("more lines omitted from preview", printed)
    self.assertNotIn("line 49", printed)
|
||||
|
||||
def test_raw_reasoning_is_not_logged_by_default_but_summary_is(self):
    """With the raw-reasoning env flag unset, only the reasoning summary is printed and logged."""
    sink = io.StringIO()
    recorded = []
    # Remove the flag for the duration of the test; put it back afterwards if it was set.
    saved_flag = os.environ.pop("PAGEINDEX_PIFS_AGENT_RAW_REASONING", None)
    try:
        observer = PIFSAgentStreamObserver("model", stream_log=recorded, output=sink)
        raw_delta = self.raw_event("response.reasoning_text.delta", "private chain")
        observer.handle_event(raw_delta)
        summary_delta = self.raw_event(
            "response.reasoning_summary_text.delta", "visible summary"
        )
        observer.handle_event(summary_delta)
        observer.finish()
    finally:
        if saved_flag is not None:
            os.environ["PAGEINDEX_PIFS_AGENT_RAW_REASONING"] = saved_flag

    printed = sink.getvalue()
    self.assertNotIn("private chain", printed)
    self.assertIn("visible summary", printed)
    expected_log = [{"kind": "think_summary", "text": "visible summary"}]
    self.assertEqual(recorded, expected_log)
|
||||
|
||||
def test_raw_reasoning_requires_debug_env_flag(self):
    # An environment mapping without the flag leaves raw reasoning disabled.
    self.assertFalse(pifs_agent_raw_reasoning_enabled({}))

    env_key = "PAGEINDEX_PIFS_AGENT_RAW_REASONING"
    # (flag value, assertion to apply), checked in this order.
    expectations = (
        ("on", self.assertTrue),
        ("TRUE", self.assertTrue),
        ("0", self.assertFalse),
    )
    for flag_value, check in expectations:
        check(pifs_agent_raw_reasoning_enabled({env_key: flag_value}))
|
||||
|
||||
def test_stream_mode_aliases(self):
    # Each alias (including the empty string) maps to its canonical mode name.
    alias_to_canonical = (
        ("think", "model"),
        ("debug", "all"),
        ("", "off"),
    )
    for alias, canonical in alias_to_canonical:
        self.assertEqual(normalize_agent_stream_mode(alias), canonical)

    # An unrecognised mode is rejected with ValueError.
    self.assertRaises(ValueError, normalize_agent_stream_mode, "nope")
|
||||
|
||||
def test_reasoning_settings_enable_effort_and_summary(self):
    # Explicit effort and summary values should be carried into the settings.
    requested = {"reasoning_effort": "medium", "reasoning_summary": "detailed"}
    model_settings = build_agent_model_settings(**requested)

    self.assertIsNotNone(model_settings)
    self.assertEqual(model_settings.reasoning.effort, "medium")
    self.assertEqual(model_settings.reasoning.summary, "detailed")
    # Verbosity is not passed in by this test, yet "low" is expected back.
    self.assertEqual(model_settings.verbosity, "low")
|
||||
|
||||
def test_reasoning_effort_defaults_to_visible_summary(self):
    # Only the effort is supplied; the summary is expected to come back as "auto".
    model_settings = build_agent_model_settings(reasoning_effort="low")

    self.assertIsNotNone(model_settings)
    expected_fields = (("effort", "low"), ("summary", "auto"))
    for field_name, expected_value in expected_fields:
        self.assertEqual(
            getattr(model_settings.reasoning, field_name),
            expected_value,
        )
|
||||
|
||||
def test_reasoning_and_base_url_normalization(self):
    # Reasoning normalisers: a known effort passes through unchanged, and the
    # summary value "none" becomes None.
    self.assertEqual(normalize_reasoning_effort("xhigh"), "xhigh")
    self.assertIsNone(normalize_reasoning_summary("none"))

    # Base-URL check: no URL and the api.openai.com URL are expected to be
    # falsy; a different host is expected to be truthy.
    base_url_cases = (
        (None, self.assertFalse),
        ("https://api.openai.com/v1/", self.assertFalse),
        ("https://example.test/v1", self.assertTrue),
    )
    for base_url, check in base_url_cases:
        check(should_use_openai_compatible_chat_model(base_url))

    # An unknown effort level is rejected with ValueError.
    self.assertRaises(ValueError, normalize_reasoning_effort, "maximum")
|
||||
|
||||
def test_tracing_is_disabled_by_default_unless_env_enables_it(self):
    env_key = "PAGEINDEX_PIFS_AGENT_TRACING"

    # With no flag in the environment mapping, tracing should be disabled.
    self.assertTrue(should_disable_pifs_agent_tracing({}))

    # Each of these values should switch tracing on (i.e. "disable" is false).
    for enabling_value in ("1", "true", "on"):
        self.assertFalse(
            should_disable_pifs_agent_tracing({env_key: enabling_value})
        )

    # "0" keeps tracing disabled.
    self.assertTrue(should_disable_pifs_agent_tracing({env_key: "0"}))
|
||||
|
||||
def test_structured_agent_output_serializes_to_json(self):
    # A structured answer object should serialize to compact JSON text
    # (no spaces after the separators in the expected string).
    structured = StructuredAnswer(answer="done", document_ids=["dsid_1"])
    serialized = serialize_agent_final_output(structured)

    expected_json = '{"answer":"done","document_ids":["dsid_1"]}'
    self.assertEqual(serialized, expected_json)
|
||||
|
||||
|
||||
# Run the unittest runner when this module is executed as a script;
# nothing happens on import.
if __name__ == "__main__":
    unittest.main()
|
||||
53
tests/test_semantic_index.py
Normal file
53
tests/test_semantic_index.py
Normal file
|
|
@@ -0,0 +1,53 @@
|
|||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parents[1]
|
||||
if str(REPO_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(REPO_ROOT))
|
||||
|
||||
from pageindex.filesystem.semantic_index import (
|
||||
SemanticIndexRecord,
|
||||
SQLiteVecSemanticIndex,
|
||||
)
|
||||
|
||||
|
||||
def test_sqlite_vec_semantic_index_round_trip(tmp_path):
    """Write two records to a fresh index and read them back three ways.

    Checks the document count reported by ``info()``, the ordering of an
    unfiltered search, and a search restricted by ``source_type``.
    """
    index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite")
    index.reset(dimension=3, metadata={"field_mode": "summary"})

    github_record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="github",
        source_path="github/a.json",
        title="Multipart upload limits",
        text="multipart upload limits",
        vector=[1.0, 0.0, 0.0],
        metadata={"topic": "uploads"},
    )
    slack_record = SemanticIndexRecord(
        file_ref="file_b",
        external_id="doc_b",
        source_type="slack",
        source_path="slack/b.json",
        title="GPU cache issue",
        text="gpu cache issue",
        vector=[0.0, 1.0, 0.0],
        metadata={"topic": "runtime"},
    )
    index.upsert_many([github_record, slack_record])

    assert index.info()["document_count"] == 2

    # The query leans toward the first record's vector, so doc_a is expected
    # ahead of doc_b. A fresh list is passed to every search call.
    query = (0.9, 0.1, 0.0)

    ranked = index.search(list(query), limit=2)
    ranked_ids = [hit.external_id for hit in ranked]
    assert ranked_ids == ["doc_a", "doc_b"]

    # Restricting by source_type should leave only the slack record.
    slack_only = index.search(
        list(query),
        limit=2,
        filters={"source_type": "slack"},
    )
    slack_ids = [hit.external_id for hit in slack_only]
    assert slack_ids == ["doc_b"]
|
||||
Loading…
Add table
Add a link
Reference in a new issue