# Mirror of https://github.com/VectifyAI/PageIndex.git
# Synced 2026-06-12 19:55:17 +02:00
# 874 lines, 32 KiB, Python
import json
|
|
from types import SimpleNamespace
|
|
|
|
import pytest
|
|
|
|
|
|
def test_filesystem_lazy_exports_remain_public():
    """Check the semantic-index names stay exported from ``pageindex.filesystem``.

    Each name must be importable from the package, listed in ``__all__``,
    reported by ``dir()``, and resolve to an object carrying that ``__name__``.
    """
    import pageindex.filesystem as filesystem
    from pageindex.filesystem import (
        HybridProjectionSearchBackend,
        RebuildableSemanticIndex,
        SemanticIndexRecord,
        SemanticSearchResult,
        SQLiteVecSemanticIndex,
        SummaryProjectionIndexer,
    )

    for name in (
        "HybridProjectionSearchBackend",
        "RebuildableSemanticIndex",
        "SemanticIndexRecord",
        "SemanticSearchResult",
        "SQLiteVecSemanticIndex",
        "SummaryProjectionIndexer",
    ):
        assert name in filesystem.__all__
        assert name in dir(filesystem)

    assert HybridProjectionSearchBackend.__name__ == "HybridProjectionSearchBackend"
    assert RebuildableSemanticIndex.__name__ == "RebuildableSemanticIndex"
    assert SemanticIndexRecord.__name__ == "SemanticIndexRecord"
    assert SemanticSearchResult.__name__ == "SemanticSearchResult"
    assert SQLiteVecSemanticIndex.__name__ == "SQLiteVecSemanticIndex"
    assert SummaryProjectionIndexer.__name__ == "SummaryProjectionIndexer"
|
|
|
|
|
|
class SummaryBackend:
    """Test double for a retrieval backend that only offers the ``summary`` channel.

    Every search returns a single candidate for the configured document and
    records the call so tests can inspect the filters that were passed in.
    """

    def __init__(self, document_id):
        self.document_id = document_id
        # One (channel, query, filters) tuple per search_channel() call.
        self.calls = []

    def available_channels(self):
        """Return the fixed channel tuple ``("summary",)``."""
        return ("summary",)

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return one candidate for ``self.document_id``.

        ``limit`` is accepted for interface compatibility but is not used.
        """
        self.calls.append((channel, query, filters))
        return [
            SimpleNamespace(
                document_id=self.document_id,
                snippet=f"summary candidate: {query}",
            )
        ]
|
|
|
|
|
|
class ChannelBackend:
    """Test double for a retrieval backend with a configurable set of channels.

    Each search returns a single candidate for the configured document whose
    snippet names the channel that was queried.
    """

    def __init__(self, document_id, channels=("summary", "entity", "relation")):
        self.document_id = document_id
        self.channels = channels

    def available_channels(self):
        """Return the channels given at construction time, unchanged."""
        return self.channels

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Return one candidate for ``self.document_id``.

        ``limit`` and ``filters`` are accepted for interface compatibility but
        are not used.
        """
        return [
            SimpleNamespace(
                document_id=self.document_id,
                snippet=f"{channel} candidate: {query}",
            )
        ]
|
|
|
|
|
|
class BrowseBackend:
    """Test double for a ranked retrieval backend used by the ``browse`` tests.

    Candidates are returned in the order of ``document_ids``. When a
    ``file_ref``/``file_refs`` filter is supplied *and* a document-to-file-ref
    mapping was configured, candidates are narrowed to the matching documents
    before ``limit`` is applied.
    """

    def __init__(self, document_ids, channels=("summary",), file_refs_by_document_id=None):
        self.document_ids = list(document_ids)
        self.channels = channels
        self.file_refs_by_document_id = dict(file_refs_by_document_id or {})
        # One (channel, query, limit, filters) tuple per search_channel() call.
        self.calls = []

    def available_channels(self):
        """Return the channels given at construction time, unchanged."""
        return self.channels

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Record the call and return up to ``limit`` ranked candidates.

        Ranks are 1-based; each candidate carries a score of
        ``1.0 - rank * 0.01`` and a single source entry whose distance is
        ``rank / 10``.
        """
        self.calls.append((channel, query, limit, filters))
        file_ref_filter = set()
        if isinstance(filters, dict):
            raw_file_refs = filters.get("file_ref") or filters.get("file_refs") or []
            if isinstance(raw_file_refs, str):
                # A bare string is a single ref, not an iterable of characters.
                file_ref_filter = {raw_file_refs}
            else:
                file_ref_filter = {str(item) for item in raw_file_refs}
        document_ids = self.document_ids
        # Scope filtering only applies when the mapping needed to evaluate it exists.
        if file_ref_filter and self.file_refs_by_document_id:
            document_ids = [
                document_id
                for document_id in document_ids
                if self.file_refs_by_document_id.get(document_id) in file_ref_filter
            ]
        return [
            SimpleNamespace(
                document_id=document_id,
                snippet=f"{channel} candidate {rank}: {query}",
                score=1.0 - rank * 0.01,
                sources=[{"channel": channel, "rank": rank, "distance": rank / 10}],
            )
            for rank, document_id in enumerate(document_ids[:limit], 1)
        ]
|
|
|
|
|
|
def _register_browse_file(
    filesystem,
    external_id,
    folder_path,
    *,
    department="ops",
    summary=None,
):
    """Register a small text document for the browse tests and return the result.

    Installs a stub metadata generator on ``filesystem`` (replacing whatever
    generator was set before) and then calls ``filesystem.register_file``.

    Args:
        filesystem: Object exposing ``metadata_generator`` and ``register_file``.
        external_id: Used for the external id, title, storage URI and source path.
        folder_path: Folder the document is registered under.
        department: Value stored under the ``department`` metadata key.
        summary: Summary text the stub generator returns; when ``None`` it
            falls back to ``"summary for <external_id>"``.

    Returns:
        Whatever ``filesystem.register_file`` returns.
    """
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            values = {
                "summary": summary
                if summary is not None
                else f"summary for {document.external_id}",
                "doc_type": "memo",
                "domain": "finance",
                "topic": "risk",
            }
            # Only answer for the requested fields this stub knows about.
            return MetadataGenerationResult(
                values={field: values[field] for field in fields if field in values}
            )

    filesystem.metadata_generator = SummaryGenerator()
    return filesystem.register_file(
        storage_uri=f"file:///tmp/{external_id}.txt",
        source_path=f"documents/{external_id}.txt",
        folder_path=folder_path,
        external_id=external_id,
        title=f"{external_id}.txt",
        content=f"{external_id} discusses vector databases and retrieval.",
        metadata={"department": department},
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        },
    )
|
|
|
|
|
|
def test_browse_is_agent_visible_semantic_command(tmp_path):
    """``browse`` must be an allowed command and appear in the command description."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    executor = PIFSCommandExecutor(filesystem)

    assert "browse" in executor.allowed_commands()
    assert 'browse [-R] <folder> "<query>"' in executor.describe_available_command_surfaces()
|
|
|
|
|
|
def test_browse_requires_positional_query_and_rejects_removed_options(tmp_path):
    """``browse`` must reject a missing query, removed flags, and unquoted queries."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"])
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="browse requires a query"):
        executor.execute("browse /documents")
    with pytest.raises(PIFSCommandError, match="--query"):
        executor.execute('browse /documents "vector database" --query "other"')
    with pytest.raises(PIFSCommandError, match="--limit"):
        executor.execute('browse /documents "vector database" --limit 10')
    with pytest.raises(PIFSCommandError, match="--offset"):
        executor.execute('browse /documents "vector database" --offset 10')
    with pytest.raises(PIFSCommandError, match="browse accepts a folder and one quoted query"):
        executor.execute("browse /documents vector database")
|
|
|
|
|
|
def test_browse_validates_space_availability_and_page(tmp_path):
    """``browse`` must reject unsupported or unavailable spaces and ``--page 0``."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    filesystem.semantic_retrieval_backend = BrowseBackend(["doc_direct"], channels=("summary",))
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="Unsupported browse --space: hybrid"):
        executor.execute('browse /documents "vector database" --space hybrid')
    with pytest.raises(PIFSCommandError, match="available spaces: summary"):
        executor.execute('browse /documents "vector database" --space entity')
    with pytest.raises(PIFSCommandError, match="browse --page must be at least 1"):
        executor.execute('browse /documents "vector database" --page 0')
|
|
|
|
|
|
def test_browse_default_summary_does_not_fallback_to_other_spaces(tmp_path):
    """A default browse must fail when only ``entity`` is available.

    The rejected request must not reach the backend; an explicit
    ``--space entity`` request then succeeds and is sent to that channel.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    backend = BrowseBackend(["doc_direct"], channels=("entity",))
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="available spaces: entity"):
        executor.execute('browse /documents "vector database"')
    # Checked outside the ``with`` block: code after the raising call never runs.
    assert backend.calls == []

    result = json.loads(
        executor.execute('browse /documents "vector database" --space entity')
    )["data"]
    assert [item["document_id"] for item in result["data"]] == ["doc_direct"]
    assert backend.calls[-1][0] == "entity"
|
|
|
|
|
|
def test_browse_non_recursive_searches_only_direct_files_and_recursive_is_global(tmp_path):
    """Plain browse returns only direct children; ``-R`` also returns nested files."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    _register_browse_file(filesystem, "doc_direct", "/documents")
    _register_browse_file(filesystem, "doc_deep", "/documents/reports")
    backend = BrowseBackend(["doc_deep", "doc_direct"])
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    direct = json.loads(executor.execute('browse /documents "vector database"'))["data"]
    assert [item["document_id"] for item in direct["data"]] == ["doc_direct"]
    assert direct["recursive"] is False
    assert direct["space"] == "summary"
    assert direct["page"] == 1
    assert direct["page_size"] == 10
    assert backend.calls[-1][0] == "summary"

    recursive = json.loads(executor.execute('browse -R /documents "vector database"'))["data"]
    assert [item["document_id"] for item in recursive["data"]] == [
        "doc_deep",
        "doc_direct",
    ]
    assert [item["rank"] for item in recursive["data"]] == [1, 2]
    assert recursive["recursive"] is True
|
|
|
|
|
|
def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp_path):
    """Browse pages are 1-based with ten items each, and ``--where`` filters metadata.

    Twelve documents are registered; only ``doc_10`` belongs to the
    ``finance`` department, so the filtered query must return just that one.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    document_ids = []
    for index in range(12):
        external_id = f"doc_{index:02d}"
        document_ids.append(external_id)
        department = "finance" if index == 10 else "ops"
        _register_browse_file(filesystem, external_id, "/documents", department=department)
    filesystem.semantic_retrieval_backend = BrowseBackend(document_ids)
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    first_page = json.loads(executor.execute('browse /documents "vector database"'))["data"]
    assert len(first_page["data"]) == 10
    assert first_page["has_more"] is True
    assert first_page["data"][0]["rank"] == 1

    second_page = json.loads(
        executor.execute('browse /documents "vector database" --page 2')
    )["data"]
    assert [item["document_id"] for item in second_page["data"]] == ["doc_10", "doc_11"]
    # Ranks continue across pages instead of restarting at 1.
    assert [item["rank"] for item in second_page["data"]] == [11, 12]
    assert second_page["has_more"] is False

    filtered = json.loads(
        executor.execute(
            'browse /documents "vector database" --where \'{"department":"finance"}\''
        )
    )["data"]
    assert [item["document_id"] for item in filtered["data"]] == ["doc_10"]
    assert filtered["data"][0]["summary"] == "summary for doc_10"
|
|
|
|
|
|
def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
    """In-scope documents must still be found when many off-scope ones rank first.

    150 documents under ``/other`` precede the two ``/documents`` files in the
    backend's ranking, so the expected results are only reachable if the
    folder scope reaches the backend as a file-ref filter.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    file_refs_by_document_id = {}
    candidate_ids = []
    for index in range(150):
        external_id = f"off_scope_{index:02d}"
        candidate_ids.append(external_id)
        file_refs_by_document_id[external_id] = _register_browse_file(
            filesystem,
            external_id,
            "/other",
        )
    file_refs_by_document_id["doc_deep"] = _register_browse_file(
        filesystem,
        "doc_deep",
        "/documents/reports",
    )
    file_refs_by_document_id["doc_direct"] = _register_browse_file(
        filesystem,
        "doc_direct",
        "/documents",
    )
    backend = BrowseBackend(
        [*candidate_ids, "doc_deep", "doc_direct"],
        file_refs_by_document_id=file_refs_by_document_id,
    )
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    direct = json.loads(executor.execute('browse /documents "vector database"'))["data"]
    assert [item["document_id"] for item in direct["data"]] == ["doc_direct"]

    recursive = json.loads(executor.execute('browse -R /documents "vector database"'))["data"]
    assert [item["document_id"] for item in recursive["data"]] == [
        "doc_deep",
        "doc_direct",
    ]
|
|
|
|
|
|
def test_browse_shell_output_uses_fixed_blocks_with_pagination_command(tmp_path):
    """Non-JSON browse output is a header, one block per result, and a ``# next`` line.

    Also checks that a multi-line summary is rendered on one line, that every
    similarity is a two-decimal value in ``[0, 1]``, and that the labels
    ``mode:``, ``data:`` and ``score:`` do not appear in the output.
    """
    import re

    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    document_ids = []
    for index in range(12):
        external_id = f"doc_{index:02d}"
        document_ids.append(external_id)
        _register_browse_file(
            filesystem,
            external_id,
            "/documents",
            department="finance",
            summary=(
                "first line\nsecond\tline with spaces"
                if index == 0
                else f"summary for {external_id}"
            ),
        )
    filesystem.semantic_retrieval_backend = BrowseBackend(
        document_ids,
        channels=("summary", "entity"),
    )
    executor = PIFSCommandExecutor(filesystem)

    rendered = executor.execute(
        'browse -R /documents "vector database" --space entity '
        '--where \'{"department":"finance"}\''
    )
    lines = rendered.splitlines()

    assert lines[:6] == [
        "# page=1 page_size=10 has_more=true",
        "rank: 1",
        "similarity: 0.91",
        "path: /documents/doc_00.txt",
        "summary: first line second line with spaces",
        "",
    ]
    assert lines[6:10] == [
        "rank: 2",
        "similarity: 0.83",
        "path: /documents/doc_01.txt",
        "summary: summary for doc_01",
    ]
    similarity_lines = [line for line in lines if line.startswith("similarity: ")]
    assert len(similarity_lines) == 10
    assert all(re.fullmatch(r"similarity: [01]\.\d{2}", line) for line in similarity_lines)
    assert all(0.0 <= float(line.removeprefix("similarity: ")) <= 1.0 for line in similarity_lines)
    assert lines[-1] == (
        "# next: browse -R /documents 'vector database' --space entity "
        '--where \'{"department":"finance"}\' --page 2'
    )
    assert "mode:" not in rendered
    assert "data:" not in rendered
    assert "score:" not in rendered
|
|
|
|
|
|
def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp_path):
    """When two files share a source path, browse prints the external id as ``path``.

    The shared source path must stay ambiguous for ``resolve_file_ref`` while
    the external id still resolves to the first registered file.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            return MetadataGenerationResult(
                values={"summary": f"summary for {document.external_id}"}
            )

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    first_ref = filesystem.register_file(
        storage_uri="file:///tmp/first.json",
        source_path="shared/source.json",
        folder_path="/documents",
        external_id="dsid_first",
        title="First",
        content="first content",
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        },
    )
    filesystem.register_file(
        storage_uri="file:///tmp/second.json",
        source_path="shared/source.json",
        folder_path="/documents",
        external_id="dsid_second",
        title="Second",
        content="second content",
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        },
    )
    filesystem.semantic_retrieval_backend = BrowseBackend(["dsid_first"])
    executor = PIFSCommandExecutor(filesystem)

    rendered = executor.execute('browse /documents "first"')

    assert "path: dsid_first" in rendered
    assert "path: /shared/source.json" not in rendered
    assert filesystem.store.resolve_file_ref("dsid_first") == first_ref
    with pytest.raises(KeyError, match="Ambiguous file target"):
        filesystem.store.resolve_file_ref("/shared/source.json")
|
|
|
|
|
|
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
    """Searching under ``/documents`` must pass empty filters to the backend.

    Also pins the exact JSON result shape (``path``, ``summary``,
    ``line_text``) and checks that the text rendering carries the same fields
    without ``id=dsid_report`` or ``file_ref=`` markers.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            return MetadataGenerationResult(
                values={"summary": "Federal Reserve annual report summary"}
            )

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    file_ref = filesystem.register_file(
        storage_uri="file:///tmp/report.pdf",
        source_path="examples/documents/report.pdf",
        folder_path="/documents",
        external_id="dsid_report",
        title="report.pdf",
        metadata={"source_type": "examples-documents"},
        content="Federal Reserve supervision and regulation annual report.",
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        },
    )
    backend = SummaryBackend("dsid_report")
    filesystem.semantic_retrieval_backend = backend
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    result = json.loads(
        executor.execute('search-summary "Federal Reserve annual report" /documents')
    )

    # Third element of the recorded call is the filters argument.
    assert backend.calls[0][2] == {}
    assert result["data"]["data"][0] == {
        "path": "/examples/documents/report.pdf",
        "summary": "Federal Reserve annual report summary",
        "line_text": "1: Federal Reserve supervision and regulation annual report.",
    }
    assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref

    executor.json_output = False
    rendered = executor.execute('search-summary "Federal Reserve annual report" /documents')
    assert "path: /examples/documents/report.pdf" in rendered
    assert "summary: Federal Reserve annual report summary" in rendered
    assert "line_text: 1: Federal Reserve supervision and regulation annual report." in rendered
    assert "id=dsid_report" not in rendered
    assert "file_ref=" not in rendered
|
|
|
|
|
|
def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_path):
    """With duplicate titles, the result ``path`` is the file's unique source path.

    The returned path must resolve to the first file, while the title-based
    path ``/documents/announcements`` stays ambiguous.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            return MetadataGenerationResult(
                values={"summary": f"summary for {document.external_id}"}
            )

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    first_ref = filesystem.register_file(
        storage_uri="file:///tmp/first.json",
        source_path="slack/dsid_first.json",
        folder_path="/documents",
        external_id="dsid_first",
        title="announcements",
        content="first announcement mentions H200 reservations.",
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        },
    )
    filesystem.register_file(
        storage_uri="file:///tmp/second.json",
        source_path="slack/dsid_second.json",
        folder_path="/documents",
        external_id="dsid_second",
        title="announcements",
        content="second announcement mentions unrelated maintenance.",
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        },
    )
    filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    result = json.loads(executor.execute('search-summary "H200 reservations" /documents'))

    assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json"
    assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
    with pytest.raises(KeyError, match="Ambiguous file target"):
        filesystem.store.resolve_file_ref("/documents/announcements")
|
|
|
|
|
|
def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_path):
    """With a shared source path, the result ``path`` is the external id instead.

    The returned value must still resolve to the first registered file.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            return MetadataGenerationResult(
                values={"summary": f"summary for {document.external_id}"}
            )

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    first_ref = filesystem.register_file(
        storage_uri="file:///tmp/first.json",
        source_path="shared/source.json",
        folder_path="/documents",
        external_id="dsid_first",
        title="First",
        content="first content",
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        },
    )
    filesystem.register_file(
        storage_uri="file:///tmp/second.json",
        source_path="shared/source.json",
        folder_path="/documents",
        external_id="dsid_second",
        title="Second",
        content="second content",
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        },
    )
    filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    result = json.loads(executor.execute('search-summary "first" /documents'))

    assert result["data"]["data"][0]["path"] == "dsid_first"
    assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
|
|
|
|
|
|
def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
    """``search-entity`` / ``search-relation`` results carry exactly four fields.

    Each JSON item holds ``path``, ``summary``, ``line_text`` and the field
    for the queried channel. The text rendering shows the same values and
    contains no ``file_ref=`` marker.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class MetadataGenerator:
        def generate(self, document, *, fields):
            values = {
                "summary": "Risk and compliance summary",
                "entity": "Federal Reserve; Disney",
                "relation": "Federal Reserve affects Disney valuation",
            }
            return MetadataGenerationResult(values={field: values[field] for field in fields})

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=MetadataGenerator(),
    )
    filesystem.register_file(
        storage_uri="file:///tmp/market-note.pdf",
        source_path="examples/documents/market-note.pdf",
        folder_path="/documents",
        external_id="dsid_market_note",
        title="market-note.pdf",
        content="Federal Reserve policy affects Disney valuation.",
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
                "entity": True,
                "relation": True,
            }
        },
    )
    filesystem.semantic_retrieval_backend = ChannelBackend("dsid_market_note")
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents'))
    assert entity["data"]["data"][0] == {
        "path": "/examples/documents/market-note.pdf",
        "summary": "Risk and compliance summary",
        "line_text": "1: Federal Reserve policy affects Disney valuation.",
        "entity": "Federal Reserve; Disney",
    }

    relation = json.loads(executor.execute('search-relation "Disney valuation" /documents'))
    assert relation["data"]["data"][0] == {
        "path": "/examples/documents/market-note.pdf",
        "summary": "Risk and compliance summary",
        "line_text": "1: Federal Reserve policy affects Disney valuation.",
        "relation": "Federal Reserve affects Disney valuation",
    }

    executor.json_output = False
    rendered = executor.execute('search-entity "Federal Reserve" /documents')
    assert "path: /examples/documents/market-note.pdf" in rendered
    assert "summary: Risk and compliance summary" in rendered
    assert "entity: Federal Reserve; Disney" in rendered
    assert "file_ref=" not in rendered
|
|
|
|
|
|
def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path):
    """``search-summary`` must reject unquoted multi-word queries and ``|`` alternation."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.commands import PIFSCommandError

    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    filesystem.register_file(
        storage_uri="file:///tmp/report.pdf",
        source_path="examples/documents/report.pdf",
        folder_path="/documents",
        external_id="dsid_report",
        title="Annual report",
        content="Federal Reserve supervision and regulation annual report.",
    )
    filesystem.semantic_retrieval_backend = SummaryBackend("dsid_report")
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    with pytest.raises(PIFSCommandError, match="Quote multi-word queries"):
        executor.execute("search-summary Federal Reserve /documents")

    with pytest.raises(PIFSCommandError, match="quote it"):
        executor.execute("search-summary Federal Reserve")

    with pytest.raises(PIFSCommandError, match="does not support regex alternation"):
        executor.execute('search-summary "Federal|Reserve" /documents')
|
|
|
|
|
|
def test_semantic_search_scope_filters_explicit_source_type_facets():
    """``source_type=...`` folder facets become filters; ordinary folders do not.

    The expected filter value uses an underscore (``google_drive``) where the
    folder path has a hyphen (``google-drive``).
    """
    from pageindex.filesystem import PageIndexFileSystem

    assert PageIndexFileSystem._semantic_filters_for_scope(
        {"folder_path": "/source_type=google-drive"}
    ) == {"source_type": "google_drive"}
    assert PageIndexFileSystem._semantic_filters_for_scope(
        {"folder_path": "/semantic/source_type=google-drive"}
    ) == {"source_type": "google_drive"}
    assert PageIndexFileSystem._semantic_filters_for_scope(
        {"folder_path": "/documents"}
    ) == {}
|
|
|
|
|
|
def test_grep_source_file_requires_terms_on_same_line(tmp_path):
    """``grep -R`` matches only when the whole pattern occurs on one source line.

    ``alpha`` and ``omega`` sit on different lines of the source file, so
    "alpha omega" yields nothing, while "alpha evidence" matches on line 2.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    source_dir = tmp_path / "source" / "documents"
    source_dir.mkdir(parents=True)
    source = source_dir / "split.json"
    source.write_text(
        '{\n "first": "alpha evidence lives here",\n'
        ' "second": "omega evidence lives there"\n}\n',
        encoding="utf-8",
    )
    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
    filesystem.register_file(
        storage_uri=str(source),
        source_path="documents/split.json",
        folder_path="/documents",
        external_id="doc_split_terms",
        title="Split source terms",
        # The registered content deliberately lacks the searched words.
        content="registered artifact without the searched tokens",
    )
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    result = json.loads(executor.execute('grep -R "alpha omega" /documents'))

    assert result["data"]["mode"] == "files"
    assert result["data"]["data"] == []

    matched = json.loads(executor.execute('grep -R "alpha evidence" /documents'))

    assert matched["data"]["data"][0]["external_id"] == "doc_split_terms"
    assert matched["data"]["data"][0]["line"] == 2
    assert "alpha evidence" in matched["data"]["data"][0]["text"]
|
|
|
|
|
|
def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch):
    """An existing summary index on disk must drive retrieval configuration.

    A summary-only vector index is written under the workspace, then
    ``configure_existing_projection_retrieval`` is expected to call
    ``configure_hybrid_projection_retrieval`` once with the embedding settings
    stored in the index metadata plus an ``embedding_timeout`` of 60.
    """
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    workspace = tmp_path / "workspace"
    index_dir = workspace / "artifacts" / "projection_indexes"
    summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
    summary_index.reset(
        dimension=3,
        metadata={
            "channel": "summary",
            "embedding_provider": "openai",
            "embedding_model": "test-embedding",
            "embedding_dimensions": 3,
        },
    )
    summary_index.upsert_many(
        [
            SemanticIndexRecord(
                file_ref="file_a",
                external_id="doc_a",
                source_type="documents",
                source_path="documents/a.pdf",
                title="A",
                text="summary",
                vector=[1.0, 0.0, 0.0],
            )
        ]
    )
    filesystem = PageIndexFileSystem(workspace)
    calls = []

    def fake_configure(index_dir_arg, **kwargs):
        # Capture the arguments and install a stub backend in place of the real one.
        calls.append((index_dir_arg, kwargs))
        filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
        return filesystem.semantic_retrieval_backend

    monkeypatch.setattr(
        filesystem,
        "configure_hybrid_projection_retrieval",
        fake_configure,
    )

    assert filesystem.configure_existing_projection_retrieval() is True
    assert calls == [
        (
            filesystem.summary_projection_index_dir,
            {
                "embedding_provider": "openai",
                "embedding_model": "test-embedding",
                "embedding_dimensions": 3,
                "embedding_timeout": 60,
            },
        )
    ]
    assert filesystem.semantic_retrieval_channels() == ("summary",)
|
|
|
|
|
|
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
    """Semantic search must find a document that non-semantic search misses.

    The query shares no words with the document body, so the
    ``semantic=False`` search returns nothing; the semantic search must return
    the document with a ``summary_vector rank=1`` snippet.
    """
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    class FixedEmbedder:
        def embed(self, texts):
            # Every text gets the same vector, so query and summary always align.
            return [[1.0, 0.0, 0.0] for _ in texts]

    class SummaryGenerator:
        def generate(self, document, *, fields):
            return MetadataGenerationResult(
                values={"summary": "vendor renewal risk matrix"}
            )

    source = tmp_path / "source.txt"
    source.write_text("ordinary fixture body", encoding="utf-8")
    index_dir = tmp_path / "workspace" / "artifacts" / "projection_indexes"
    indexer = SummaryProjectionIndexer(
        index_dir,
        embedder=FixedEmbedder(),
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=3,
    )
    backend = HybridProjectionSearchBackend(
        index_dir,
        embedder=FixedEmbedder(),
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=3,
    )
    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
        summary_projection_indexer=indexer,
        semantic_retrieval_backend=backend,
    )
    filesystem.register_file(
        storage_uri=source.as_uri(),
        source_path="docs/source.txt",
        folder_path="/documents",
        external_id="doc_summary_only",
        title="Operations note",
        content=source.read_text(encoding="utf-8"),
        metadata={"department": "ops"},
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
            }
        },
    )

    assert filesystem.search("purchase order exposure", semantic=False) == []

    results = filesystem.search("purchase order exposure", semantic=True)

    assert [result.external_id for result in results] == ["doc_summary_only"]
    assert results[0].snippet == "summary_vector rank=1"
|