mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
766 lines
31 KiB
Python
766 lines
31 KiB
Python
import json
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
|
|
def write_pageindex_client_doc(workspace: Path, doc_id: str, doc: dict) -> None:
|
|
workspace.mkdir(parents=True, exist_ok=True)
|
|
(workspace / f"{doc_id}.json").write_text(
|
|
json.dumps(doc, ensure_ascii=False, indent=2),
|
|
encoding="utf-8",
|
|
)
|
|
meta = {
|
|
doc_id: {
|
|
"type": doc.get("type", ""),
|
|
"doc_name": doc.get("doc_name", ""),
|
|
"doc_description": doc.get("doc_description", ""),
|
|
"path": doc.get("path", ""),
|
|
}
|
|
}
|
|
if doc.get("type") == "pdf":
|
|
meta[doc_id]["page_count"] = doc.get("page_count")
|
|
elif doc.get("type") == "md":
|
|
meta[doc_id]["line_count"] = doc.get("line_count")
|
|
(workspace / "_meta.json").write_text(
|
|
json.dumps(meta, ensure_ascii=False, indent=2),
|
|
encoding="utf-8",
|
|
)
|
|
|
|
|
|
class RecordingMetadataGenerator:
|
|
values = {
|
|
"summary": "Generated retrieval summary.",
|
|
"doc_type": "technical_note",
|
|
"domain": "documentation",
|
|
"topic": "pageindex extraction",
|
|
}
|
|
|
|
def __init__(self):
|
|
self.calls = []
|
|
|
|
def generate(self, request, *, fields):
|
|
self.calls.append((request, list(fields)))
|
|
return {field: self.values[field] for field in fields if field in self.values}
|
|
|
|
|
|
def test_pageindex_structure_options_report_failed_register_build(monkeypatch):
|
|
from pageindex import PageIndexClient
|
|
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
|
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
source = Path(tmp) / "report.md"
|
|
source.write_text("# Report\n\nCached structure is not built yet.", encoding="utf-8")
|
|
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
|
|
|
def fail_index(*args, **kwargs):
|
|
raise RuntimeError("index failed: extractor unavailable")
|
|
|
|
monkeypatch.setattr(PageIndexClient, "index", fail_index)
|
|
filesystem.register_file(
|
|
storage_uri=source.as_uri(),
|
|
source_path="docs/report.md",
|
|
external_id="dsid_structural_missing",
|
|
title="Structural report",
|
|
content=source.read_text(encoding="utf-8"),
|
|
)
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
|
|
structure = json.loads(executor.execute("cat dsid_structural_missing --structure"))
|
|
node = json.loads(executor.execute("cat dsid_structural_missing --node 0001"))
|
|
pages = json.loads(executor.execute("cat dsid_structural_missing --page 1-2"))
|
|
stat = json.loads(executor.execute("stat dsid_structural_missing"))
|
|
|
|
assert structure["data"]["mode"] == "structure"
|
|
assert structure["data"]["available"] is False
|
|
assert structure["data"]["status"] == "failed"
|
|
assert "RuntimeError: index failed: extractor unavailable" in structure["data"]["message"]
|
|
assert stat["data"]["pageindex_tree_status"] == "failed"
|
|
assert stat["data"]["metadata_status"]["pageindex_tree"] == {
|
|
"status": "failed",
|
|
"owner": "pageindex",
|
|
"source": "PageIndexClient.index",
|
|
"error_type": "RuntimeError",
|
|
"message": "index failed: extractor unavailable",
|
|
}
|
|
|
|
assert node["data"]["mode"] == "node"
|
|
assert node["data"]["available"] is False
|
|
assert node["data"]["node_id"] == "0001"
|
|
|
|
assert pages["data"]["mode"] == "page"
|
|
assert pages["data"]["available"] is False
|
|
assert pages["data"]["pages"] == "1-2"
|
|
|
|
assert "cp" not in executor.allowed_commands()
|
|
assert "mkdir" not in executor.allowed_commands()
|
|
|
|
|
|
def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_fts(monkeypatch):
|
|
from pageindex import PageIndexClient
|
|
from pageindex.filesystem import PageIndexFileSystem
|
|
|
|
def fake_index(self, file_path, mode="auto"):
|
|
suffix = Path(file_path).suffix.lower()
|
|
doc_id = f"doc_{suffix.lstrip('.')}"
|
|
if suffix == ".pdf":
|
|
doc = {
|
|
"id": doc_id,
|
|
"type": "pdf",
|
|
"path": str(Path(file_path).resolve()),
|
|
"doc_name": "report.pdf",
|
|
"doc_description": "",
|
|
"page_count": 2,
|
|
"structure": [{"title": "Report", "node_id": "0001", "nodes": []}],
|
|
"pages": [
|
|
{"page": 1, "content": "PageIndex PDF extracted alpha text."},
|
|
{"page": 2, "content": "Second PageIndex PDF extracted beta text."},
|
|
],
|
|
}
|
|
else:
|
|
doc = {
|
|
"id": doc_id,
|
|
"type": "md",
|
|
"path": str(Path(file_path).resolve()),
|
|
"doc_name": "notes",
|
|
"doc_description": "",
|
|
"line_count": 3,
|
|
"structure": [
|
|
{
|
|
"title": "Notes",
|
|
"node_id": "0001",
|
|
"line_num": 1,
|
|
"text": "# Notes\n\nPageIndex Markdown extracted gamma text.",
|
|
"nodes": [],
|
|
}
|
|
],
|
|
}
|
|
write_pageindex_client_doc(self.workspace, doc_id, doc)
|
|
self.documents[doc_id] = doc
|
|
return doc_id
|
|
|
|
monkeypatch.setattr(PageIndexClient, "index", fake_index)
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
source_pdf = Path(tmp) / "report.pdf"
|
|
source_md = Path(tmp) / "notes.md"
|
|
source_pdf.write_bytes(b"%PDF-1.4\n% test fixture\n")
|
|
source_md.write_text("# Notes\n\nCaller markdown content", encoding="utf-8")
|
|
generator = RecordingMetadataGenerator()
|
|
filesystem = PageIndexFileSystem(
|
|
workspace=Path(tmp) / "workspace",
|
|
metadata_generator=generator,
|
|
)
|
|
|
|
filesystem.register_file(
|
|
storage_uri=source_pdf.as_uri(),
|
|
source_path="docs/report.pdf",
|
|
external_id="dsid_pdf_extracted",
|
|
title="PDF extracted",
|
|
content="CALLER PDF CONTENT MUST NOT REACH GENERATOR",
|
|
)
|
|
filesystem.register_file(
|
|
storage_uri=source_md.as_uri(),
|
|
source_path="docs/notes.md",
|
|
external_id="dsid_md_extracted",
|
|
title="Markdown extracted",
|
|
content="CALLER MD CONTENT MUST NOT REACH GENERATOR",
|
|
)
|
|
|
|
pdf_request = generator.calls[0][0]
|
|
md_request = generator.calls[1][0]
|
|
pdf_stat = filesystem.store.file_info("dsid_pdf_extracted")
|
|
md_stat = filesystem.store.file_info("dsid_md_extracted")
|
|
|
|
assert "PageIndex PDF extracted alpha text" in pdf_request.text
|
|
assert "Second PageIndex PDF extracted beta text" in pdf_request.text
|
|
assert "CALLER PDF CONTENT" not in pdf_request.text
|
|
assert "PageIndex Markdown extracted gamma text" in md_request.text
|
|
assert "CALLER MD CONTENT" not in md_request.text
|
|
assert "PageIndex PDF extracted alpha text" in Path(
|
|
pdf_stat["text_artifact_path"]
|
|
).read_text(encoding="utf-8")
|
|
assert "PageIndex Markdown extracted gamma text" in Path(
|
|
md_stat["text_artifact_path"]
|
|
).read_text(encoding="utf-8")
|
|
assert [r.external_id for r in filesystem.search("alpha beta", limit=5)] == [
|
|
"dsid_pdf_extracted"
|
|
]
|
|
assert [r.external_id for r in filesystem.search("gamma", limit=5)] == [
|
|
"dsid_md_extracted"
|
|
]
|
|
assert filesystem.search("CALLER", limit=5) == []
|
|
|
|
|
|
def test_register_text_metadata_generation_keeps_caller_content_without_pageindex(monkeypatch):
|
|
from pageindex import PageIndexClient
|
|
from pageindex.filesystem import PageIndexFileSystem
|
|
|
|
def fail_index(*args, **kwargs):
|
|
raise AssertionError("PageIndexClient.index should not be called for text files")
|
|
|
|
monkeypatch.setattr(PageIndexClient, "index", fail_index)
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
generator = RecordingMetadataGenerator()
|
|
filesystem = PageIndexFileSystem(
|
|
workspace=Path(tmp) / "workspace",
|
|
metadata_generator=generator,
|
|
)
|
|
|
|
filesystem.register_file(
|
|
storage_uri="file:///tmp/readme.txt",
|
|
source_path="docs/readme.txt",
|
|
external_id="dsid_text_generation",
|
|
title="Text generation",
|
|
content="Plain text caller content stays authoritative.",
|
|
content_type="text/plain",
|
|
)
|
|
|
|
stat = filesystem.store.file_info("dsid_text_generation")
|
|
|
|
assert generator.calls[0][0].text == "Plain text caller content stays authoritative."
|
|
assert stat["pageindex_doc_id"] is None
|
|
assert stat["pageindex_tree_status"] == "not_built"
|
|
assert Path(stat["text_artifact_path"]).read_text(
|
|
encoding="utf-8"
|
|
) == "Plain text caller content stays authoritative."
|
|
|
|
|
|
def test_register_pdf_markdown_cache_miss_invokes_pageindex_client_index(monkeypatch):
|
|
from pageindex import PageIndexClient
|
|
from pageindex.filesystem import PageIndexFileSystem
|
|
|
|
calls: list[str] = []
|
|
|
|
def fake_index(self, file_path, mode="auto"):
|
|
calls.append(str(file_path))
|
|
doc_id = f"doc_{Path(file_path).suffix.lstrip('.')}"
|
|
doc_type = "pdf" if Path(file_path).suffix == ".pdf" else "md"
|
|
doc = {
|
|
"id": doc_id,
|
|
"type": doc_type,
|
|
"path": str(Path(file_path).resolve()),
|
|
"doc_name": Path(file_path).name,
|
|
"doc_description": "",
|
|
"structure": [{"title": Path(file_path).stem, "node_id": "0001", "nodes": []}],
|
|
}
|
|
if doc_type == "pdf":
|
|
doc["page_count"] = 1
|
|
doc["pages"] = [{"page": 1, "content": "Page one text"}]
|
|
else:
|
|
doc["line_count"] = 1
|
|
write_pageindex_client_doc(self.workspace, doc_id, doc)
|
|
self.documents[doc_id] = doc
|
|
return doc_id
|
|
|
|
monkeypatch.setattr(PageIndexClient, "index", fake_index)
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
source_pdf = Path(tmp) / "report.pdf"
|
|
source_md = Path(tmp) / "notes.md"
|
|
source_pdf.write_bytes(b"%PDF-1.4\n% test fixture\n")
|
|
source_md.write_text("# Notes", encoding="utf-8")
|
|
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
|
|
|
filesystem.register_file(
|
|
storage_uri=str(source_pdf),
|
|
source_path="docs/report.pdf",
|
|
external_id="dsid_pdf_build",
|
|
title="PDF build",
|
|
content="pdf text",
|
|
)
|
|
filesystem.register_file(
|
|
storage_uri=source_md.as_uri(),
|
|
source_path="docs/notes.md",
|
|
external_id="dsid_md_build",
|
|
title="Markdown build",
|
|
content=source_md.read_text(encoding="utf-8"),
|
|
)
|
|
|
|
pdf_stat = filesystem.store.file_info("dsid_pdf_build")
|
|
md_stat = filesystem.store.file_info("dsid_md_build")
|
|
|
|
assert calls == [str(source_pdf.resolve()), str(source_md.resolve())]
|
|
assert pdf_stat["pageindex_doc_id"] == "doc_pdf"
|
|
assert pdf_stat["pageindex_tree_status"] == "built"
|
|
assert md_stat["pageindex_doc_id"] == "doc_md"
|
|
assert md_stat["pageindex_tree_status"] == "built"
|
|
|
|
|
|
def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monkeypatch):
|
|
from pageindex import PageIndexClient
|
|
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
|
from pageindex.filesystem.commands import PIFSCommandError
|
|
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
source = Path(tmp) / "report.pdf"
|
|
source.write_bytes(b"%PDF-1.4\n% test fixture\n")
|
|
workspace = Path(tmp) / "workspace"
|
|
filesystem = PageIndexFileSystem(workspace=workspace)
|
|
write_pageindex_client_doc(
|
|
filesystem.pageindex_client_workspace,
|
|
"doc_cached_pdf",
|
|
{
|
|
"id": "doc_cached_pdf",
|
|
"type": "pdf",
|
|
"path": str(source.resolve()),
|
|
"doc_name": "report.pdf",
|
|
"doc_description": "",
|
|
"page_count": 2,
|
|
"structure": [
|
|
{
|
|
"title": "Introduction",
|
|
"node_id": "0001",
|
|
"text": "Intro section text",
|
|
"nodes": [
|
|
{
|
|
"title": "Findings",
|
|
"node_id": "0002",
|
|
"physical_index": 2,
|
|
"nodes": [],
|
|
}
|
|
],
|
|
}
|
|
],
|
|
"pages": [
|
|
{"page": 1, "content": "Page one text"},
|
|
{"page": 2, "content": "Page two text"},
|
|
],
|
|
},
|
|
)
|
|
|
|
def fail_index(*args, **kwargs):
|
|
raise AssertionError("PageIndexClient.index should not be called on cache hit")
|
|
|
|
monkeypatch.setattr(PageIndexClient, "index", fail_index)
|
|
filesystem.register_file(
|
|
storage_uri=source.as_uri(),
|
|
source_path="docs/report.pdf",
|
|
external_id="dsid_structural_cached",
|
|
title="Cached structural report",
|
|
content="text artifact remains available for grep, not cat --all",
|
|
)
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
|
|
structure = json.loads(executor.execute("cat dsid_structural_cached --structure"))
|
|
pages = json.loads(executor.execute("cat dsid_structural_cached --page 1-2"))
|
|
stat = json.loads(executor.execute("stat dsid_structural_cached"))
|
|
|
|
assert structure["data"]["available"] is True
|
|
assert structure["data"]["pageindex_doc_id"] == "doc_cached_pdf"
|
|
assert structure["data"]["structure"][0]["title"] == "Introduction"
|
|
assert structure["data"]["structure"][1]["title"] == "Findings"
|
|
assert structure["data"]["structure_pagination"]["limit"] == 25
|
|
assert "text" not in structure["data"]["structure"][0]
|
|
assert "text" not in structure["data"]["structure"][1]
|
|
|
|
assert pages["data"]["available"] is True
|
|
assert pages["data"]["text"] == "Page one text\n\nPage two text"
|
|
with pytest.raises(PIFSCommandError, match="target-first"):
|
|
executor.execute("cat --page 1-2 dsid_structural_cached")
|
|
with pytest.raises(PIFSCommandError, match="one file target"):
|
|
executor.execute("cat dsid_structural_cached --page 1 2")
|
|
|
|
assert stat["data"]["pageindex_doc_id"] == "doc_cached_pdf"
|
|
assert stat["data"]["pageindex_tree_status"] == "built"
|
|
|
|
|
|
def test_cat_node_reads_pageindex_client_structure_without_custom_pifs_artifact():
|
|
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
|
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
source = Path(tmp) / "notes.md"
|
|
source.write_text("# Notes\n\nBody", encoding="utf-8")
|
|
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
|
write_pageindex_client_doc(
|
|
filesystem.pageindex_client_workspace,
|
|
"doc_cached_md",
|
|
{
|
|
"id": "doc_cached_md",
|
|
"type": "md",
|
|
"path": str(source.resolve()),
|
|
"doc_name": "notes",
|
|
"doc_description": "",
|
|
"line_count": 3,
|
|
"structure": [
|
|
{
|
|
"title": "Notes",
|
|
"node_id": "0001",
|
|
"line_num": 1,
|
|
"text": "# Notes\n\nBody",
|
|
"nodes": [],
|
|
}
|
|
],
|
|
},
|
|
)
|
|
filesystem.register_file(
|
|
storage_uri=source.as_uri(),
|
|
source_path="docs/notes.md",
|
|
external_id="dsid_md_cached",
|
|
title="Cached markdown notes",
|
|
content=source.read_text(encoding="utf-8"),
|
|
)
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
|
|
node = json.loads(executor.execute("cat dsid_md_cached --node 0001"))
|
|
|
|
assert node["data"]["available"] is True
|
|
assert node["data"]["pageindex_doc_id"] == "doc_cached_md"
|
|
assert node["data"]["node"]["title"] == "Notes"
|
|
assert node["data"]["text"] == "# Notes\n\nBody"
|
|
assert "text" not in node["data"]["node"]
|
|
|
|
|
|
def test_cat_structure_page_node_and_text_outputs_are_hard_limited():
|
|
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
|
from pageindex.filesystem.commands import PIFSCommandError
|
|
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
source = Path(tmp) / "report.pdf"
|
|
source.write_bytes(b"%PDF-1.4\n% test fixture\n")
|
|
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
|
structure_nodes = [
|
|
{
|
|
"title": f"Section {index}",
|
|
"node_id": f"{index:04d}",
|
|
"start_index": index,
|
|
"end_index": index,
|
|
"text": f"node {index} text",
|
|
"nodes": [],
|
|
}
|
|
for index in range(1, 31)
|
|
]
|
|
write_pageindex_client_doc(
|
|
filesystem.pageindex_client_workspace,
|
|
"doc_limited_pdf",
|
|
{
|
|
"id": "doc_limited_pdf",
|
|
"type": "pdf",
|
|
"path": str(source.resolve()),
|
|
"doc_name": "report.pdf",
|
|
"doc_description": "",
|
|
"page_count": 10,
|
|
"structure": structure_nodes,
|
|
"pages": [
|
|
{"page": index, "content": f"Page {index} text"}
|
|
for index in range(1, 11)
|
|
],
|
|
},
|
|
)
|
|
filesystem.register_file(
|
|
storage_uri=source.as_uri(),
|
|
source_path="docs/report.pdf",
|
|
external_id="dsid_limited_pdf",
|
|
title="Limited structural report",
|
|
content="text artifact remains available for grep",
|
|
)
|
|
text_content = "\n".join(f"line {index}" for index in range(1, 106))
|
|
filesystem.register_file(
|
|
storage_uri="file:///tmp/long.txt",
|
|
source_path="docs/long.txt",
|
|
external_id="dsid_long_text",
|
|
title="Long text",
|
|
content=text_content,
|
|
)
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
|
|
first_structure = json.loads(executor.execute("cat dsid_limited_pdf --structure"))
|
|
assert len(first_structure["data"]["structure"]) == 25
|
|
assert first_structure["data"]["structure_pagination"]["has_more"] is True
|
|
assert first_structure["data"]["structure_pagination"]["next_offset"] == 25
|
|
|
|
second_structure = json.loads(
|
|
executor.execute("cat dsid_limited_pdf --structure --offset 25")
|
|
)
|
|
assert len(second_structure["data"]["structure"]) == 5
|
|
assert second_structure["data"]["structure"][0]["node_id"] == "0026"
|
|
|
|
pages = json.loads(executor.execute("cat dsid_limited_pdf --page 1-5"))
|
|
assert pages["data"]["text"] == (
|
|
"Page 1 text\n\nPage 2 text\n\nPage 3 text\n\nPage 4 text\n\nPage 5 text"
|
|
)
|
|
assert pages["data"]["page_pagination"]["limit"] == 5
|
|
with pytest.raises(PIFSCommandError, match="at most 5"):
|
|
executor.execute("cat dsid_limited_pdf --page 1-6")
|
|
with pytest.raises(PIFSCommandError, match="evidence is sufficient"):
|
|
executor.execute("cat dsid_limited_pdf --page 1-6")
|
|
|
|
nodes = json.loads(
|
|
executor.execute(
|
|
"cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
|
|
"0006 0007 0008 0009 0010"
|
|
)
|
|
)
|
|
assert nodes["data"]["node_ids"] == [
|
|
"0001",
|
|
"0002",
|
|
"0003",
|
|
"0004",
|
|
"0005",
|
|
"0006",
|
|
"0007",
|
|
"0008",
|
|
"0009",
|
|
"0010",
|
|
]
|
|
comma_nodes = json.loads(
|
|
executor.execute("cat dsid_limited_pdf --node 0001,0002")
|
|
)
|
|
assert comma_nodes["data"]["node_ids"] == ["0001", "0002"]
|
|
with pytest.raises(PIFSCommandError, match="at most 10"):
|
|
executor.execute(
|
|
"cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
|
|
"0006 0007 0008 0009 0010 0011"
|
|
)
|
|
with pytest.raises(PIFSCommandError, match="continue with additional chunks"):
|
|
executor.execute(
|
|
"cat dsid_limited_pdf --node 0001 0002 0003 0004 0005 "
|
|
"0006 0007 0008 0009 0010 0011"
|
|
)
|
|
|
|
with pytest.raises(PIFSCommandError, match="quote the whole target"):
|
|
executor.execute("cat dsid_limited_pdf 0001")
|
|
|
|
text = json.loads(executor.execute("cat dsid_long_text --all"))
|
|
assert "line 100" in text["data"]["text"]
|
|
assert "line 101" not in text["data"]["text"]
|
|
assert text["data"]["pagination"]["has_more"] is True
|
|
assert text["data"]["pagination"]["next_range"] == "101-105"
|
|
with pytest.raises(PIFSCommandError, match="at most 100"):
|
|
executor.execute("cat dsid_long_text --range 1-101")
|
|
|
|
|
|
def test_tree_folder_behavior_is_preserved():
|
|
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
|
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
|
filesystem.register_file(
|
|
storage_uri="file:///tmp/report.txt",
|
|
source_path="docs/report.txt",
|
|
folder_path="/docs/reports",
|
|
external_id="dsid_folder_tree",
|
|
title="Folder report",
|
|
content="folder tree behavior remains intact",
|
|
)
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
|
|
folder_tree = json.loads(executor.execute("tree /docs --depth 2"))
|
|
|
|
assert folder_tree["data"]["path"] == "/docs"
|
|
assert folder_tree["data"]["folders"][0]["path"] == "/docs/reports"
|
|
|
|
|
|
def test_tree_does_not_read_file_internal_pageindex_structure():
|
|
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
|
from pageindex.filesystem.commands import PIFSCommandError
|
|
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
source = Path(tmp) / "report.pdf"
|
|
source.write_bytes(b"%PDF-1.4\n% test fixture\n")
|
|
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
|
write_pageindex_client_doc(
|
|
filesystem.pageindex_client_workspace,
|
|
"doc_tree_is_folder_only",
|
|
{
|
|
"id": "doc_tree_is_folder_only",
|
|
"type": "pdf",
|
|
"path": str(source.resolve()),
|
|
"doc_name": "report.pdf",
|
|
"doc_description": "",
|
|
"page_count": 1,
|
|
"structure": [
|
|
{"title": "Introduction", "node_id": "0001", "nodes": []}
|
|
],
|
|
"pages": [{"page": 1, "content": "Page one text"}],
|
|
},
|
|
)
|
|
filesystem.register_file(
|
|
storage_uri=source.as_uri(),
|
|
source_path="docs/report.pdf",
|
|
external_id="dsid_tree_is_folder_only",
|
|
title="Cached structural report",
|
|
content="text artifact remains available",
|
|
)
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
|
|
with pytest.raises(PIFSCommandError):
|
|
executor.execute("tree dsid_tree_is_folder_only")
|
|
|
|
structure = json.loads(executor.execute("cat dsid_tree_is_folder_only --structure"))
|
|
assert structure["data"]["structure"][0]["title"] == "Introduction"
|
|
|
|
|
|
def test_cat_all_is_limited_to_text_files():
|
|
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
|
from pageindex.filesystem.commands import PIFSCommandError
|
|
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
|
filesystem.register_file(
|
|
storage_uri="file:///tmp/readme.txt",
|
|
source_path="docs/readme.txt",
|
|
external_id="dsid_text_file",
|
|
title="Text readme",
|
|
content="plain text body",
|
|
)
|
|
filesystem.register_file(
|
|
storage_uri="file:///tmp/report.pdf",
|
|
source_path="docs/report.pdf",
|
|
external_id="dsid_pdf_file",
|
|
title="PDF report",
|
|
content="extracted text should not be served through cat --all",
|
|
)
|
|
filesystem.register_file(
|
|
storage_uri="file:///tmp/notes.md",
|
|
source_path="docs/notes.md",
|
|
external_id="dsid_md_file",
|
|
title="Markdown notes",
|
|
content="markdown text should use PageIndex structure reads",
|
|
)
|
|
filesystem.register_file(
|
|
storage_uri="file:///tmp/data.json",
|
|
source_path="docs/data.json",
|
|
external_id="dsid_json_file",
|
|
title="JSON record",
|
|
content='{"body":"json"}',
|
|
content_type="application/json",
|
|
)
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
|
|
text = json.loads(executor.execute("cat dsid_text_file --all"))
|
|
assert text["data"]["text"] == "plain text body"
|
|
|
|
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
|
|
executor.execute("cat dsid_pdf_file --all")
|
|
with pytest.raises(ValueError, match="not supported for PDF/Markdown"):
|
|
filesystem.open("dsid_pdf_file")
|
|
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
|
|
executor.execute("cat dsid_md_file --all")
|
|
with pytest.raises(ValueError, match="not supported for PDF/Markdown"):
|
|
filesystem.open("dsid_md_file")
|
|
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
|
|
executor.execute("cat dsid_json_file --all")
|
|
opened_json = filesystem.open("dsid_json_file")
|
|
assert opened_json.text == '{"body":"json"}'
|
|
for command in (
|
|
"head dsid_pdf_file",
|
|
"tail dsid_pdf_file",
|
|
"sed -n 1,1p dsid_pdf_file",
|
|
"head dsid_md_file",
|
|
"tail dsid_md_file",
|
|
"sed -n 1,1p dsid_md_file",
|
|
):
|
|
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
|
|
executor.execute(command)
|
|
|
|
|
|
def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown():
|
|
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
|
from pageindex.filesystem.commands import PIFSCommandError
|
|
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
|
filesystem.register_file(
|
|
storage_uri="file:///tmp/readme.txt",
|
|
source_path="docs/readme.txt",
|
|
external_id="dsid_text_only",
|
|
title="Text readme",
|
|
content="plain text body",
|
|
)
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
|
|
for command in (
|
|
"cat dsid_text_only --structure",
|
|
"cat dsid_text_only --page 1",
|
|
"cat dsid_text_only --node 0001",
|
|
):
|
|
with pytest.raises(PIFSCommandError, match="only supported for PDF/Markdown"):
|
|
executor.execute(command)
|
|
|
|
|
|
def test_existing_pageindex_status_allows_legacy_record_without_format_suffix():
|
|
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
|
from pageindex.filesystem.commands import PIFSCommandError
|
|
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
source = Path(tmp) / "uploaded"
|
|
source.write_text("# Uploaded\n\nBody", encoding="utf-8")
|
|
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
|
file_ref = filesystem.register_file(
|
|
storage_uri=source.as_uri(),
|
|
source_path="uploads/uploaded",
|
|
external_id="dsid_legacy_pageindex",
|
|
title="Legacy PageIndex record",
|
|
content="text/plain is only a weak default here",
|
|
)
|
|
write_pageindex_client_doc(
|
|
filesystem.pageindex_client_workspace,
|
|
"doc_legacy_pageindex",
|
|
{
|
|
"id": "doc_legacy_pageindex",
|
|
"type": "md",
|
|
"path": str(source.resolve()),
|
|
"doc_name": "uploaded",
|
|
"doc_description": "",
|
|
"line_count": 3,
|
|
"structure": [
|
|
{"title": "Uploaded", "node_id": "0001", "text": "Body", "nodes": []}
|
|
],
|
|
},
|
|
)
|
|
filesystem.store.update_pageindex_pointer(
|
|
file_ref,
|
|
pageindex_doc_id="doc_legacy_pageindex",
|
|
pageindex_tree_status="built",
|
|
)
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
|
|
structure = json.loads(executor.execute("cat dsid_legacy_pageindex --structure"))
|
|
assert structure["data"]["structure"][0]["title"] == "Uploaded"
|
|
with pytest.raises(PIFSCommandError, match="only supported for txt/text files"):
|
|
executor.execute("cat dsid_legacy_pageindex --all")
|
|
|
|
|
|
def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monkeypatch):
|
|
from pageindex import PageIndexClient
|
|
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
|
|
|
with tempfile.TemporaryDirectory() as tmp:
|
|
source = Path(tmp) / "late.md"
|
|
source.write_text("# Late\n\nBody", encoding="utf-8")
|
|
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
|
|
|
|
def fail_index(*args, **kwargs):
|
|
raise RuntimeError("index failed")
|
|
|
|
monkeypatch.setattr(PageIndexClient, "index", fail_index)
|
|
filesystem.register_file(
|
|
storage_uri=source.as_uri(),
|
|
source_path="docs/late.md",
|
|
external_id="dsid_late_cache",
|
|
title="Late cache",
|
|
content=source.read_text(encoding="utf-8"),
|
|
)
|
|
write_pageindex_client_doc(
|
|
filesystem.pageindex_client_workspace,
|
|
"doc_late_cache",
|
|
{
|
|
"id": "doc_late_cache",
|
|
"type": "md",
|
|
"path": str(source.resolve()),
|
|
"doc_name": "late",
|
|
"doc_description": "",
|
|
"line_count": 3,
|
|
"structure": [
|
|
{"title": "Late", "node_id": "0001", "text": "Body", "nodes": []}
|
|
],
|
|
},
|
|
)
|
|
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
|
|
|
structure = json.loads(executor.execute("cat dsid_late_cache --structure"))
|
|
stat = json.loads(executor.execute("stat dsid_late_cache"))
|
|
|
|
assert structure["data"]["available"] is False
|
|
assert stat["data"]["pageindex_doc_id"] is None
|
|
assert stat["data"]["pageindex_tree_status"] == "failed"
|