PageIndex/tests/test_pageindex_filesystem_scope.py

383 lines
14 KiB
Python

import json
from types import SimpleNamespace
import pytest
def test_filesystem_lazy_exports_remain_public():
import pageindex.filesystem as filesystem
from pageindex.filesystem import (
HybridProjectionSearchBackend,
RebuildableSemanticIndex,
SemanticIndexRecord,
SemanticSearchResult,
SQLiteVecSemanticIndex,
SummaryProjectionIndexer,
)
for name in (
"HybridProjectionSearchBackend",
"RebuildableSemanticIndex",
"SemanticIndexRecord",
"SemanticSearchResult",
"SQLiteVecSemanticIndex",
"SummaryProjectionIndexer",
):
assert name in filesystem.__all__
assert name in dir(filesystem)
assert HybridProjectionSearchBackend.__name__ == "HybridProjectionSearchBackend"
assert RebuildableSemanticIndex.__name__ == "RebuildableSemanticIndex"
assert SemanticIndexRecord.__name__ == "SemanticIndexRecord"
assert SemanticSearchResult.__name__ == "SemanticSearchResult"
assert SQLiteVecSemanticIndex.__name__ == "SQLiteVecSemanticIndex"
assert SummaryProjectionIndexer.__name__ == "SummaryProjectionIndexer"
class SummaryBackend:
def __init__(self, document_id):
self.document_id = document_id
self.calls = []
def available_channels(self):
return ("summary",)
def search_channel(self, channel, query, *, limit=10, filters=None):
self.calls.append((channel, query, filters))
return [
SimpleNamespace(
document_id=self.document_id,
snippet=f"summary candidate: {query}",
)
]
class ChannelBackend:
def __init__(self, document_id, channels=("summary", "entity", "relation")):
self.document_id = document_id
self.channels = channels
def available_channels(self):
return self.channels
def search_channel(self, channel, query, *, limit=10, filters=None):
return [
SimpleNamespace(
document_id=self.document_id,
snippet=f"{channel} candidate: {query}",
)
]
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
class SummaryGenerator:
def generate(self, document, *, fields):
return MetadataGenerationResult(
values={"summary": "Federal Reserve annual report summary"}
)
filesystem = PageIndexFileSystem(
workspace=tmp_path / "workspace",
metadata_generator=SummaryGenerator(),
)
filesystem.register_file(
storage_uri="file:///tmp/report.pdf",
source_path="examples/documents/report.pdf",
folder_path="/documents",
external_id="dsid_report",
title="report.pdf",
metadata={"source_type": "examples-documents"},
content="Federal Reserve supervision and regulation annual report.",
metadata_policy={
"fields": {
"summary": True,
"doc_type": False,
"domain": False,
"topic": False,
}
},
)
backend = SummaryBackend("dsid_report")
filesystem.semantic_retrieval_backend = backend
executor = PIFSCommandExecutor(filesystem, json_output=True)
result = json.loads(
executor.execute('search-summary "Federal Reserve annual report" /documents')
)
assert backend.calls[0][2] == {}
assert result["data"]["data"][0] == {
"path": "/documents/report.pdf",
"summary": "Federal Reserve annual report summary",
"line_text": "1: Federal Reserve supervision and regulation annual report.",
}
executor.json_output = False
rendered = executor.execute('search-summary "Federal Reserve annual report" /documents')
assert "path: /documents/report.pdf" in rendered
assert "summary: Federal Reserve annual report summary" in rendered
assert "line_text: 1: Federal Reserve supervision and regulation annual report." in rendered
assert "id=dsid_report" not in rendered
assert "file_ref=" not in rendered
def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
class MetadataGenerator:
def generate(self, document, *, fields):
values = {
"summary": "Risk and compliance summary",
"entity": "Federal Reserve; Disney",
"relation": "Federal Reserve affects Disney valuation",
}
return MetadataGenerationResult(values={field: values[field] for field in fields})
filesystem = PageIndexFileSystem(
workspace=tmp_path / "workspace",
metadata_generator=MetadataGenerator(),
)
filesystem.register_file(
storage_uri="file:///tmp/market-note.pdf",
source_path="examples/documents/market-note.pdf",
folder_path="/documents",
external_id="dsid_market_note",
title="market-note.pdf",
content="Federal Reserve policy affects Disney valuation.",
metadata_policy={
"fields": {
"summary": True,
"doc_type": False,
"domain": False,
"topic": False,
"entity": True,
"relation": True,
}
},
)
filesystem.semantic_retrieval_backend = ChannelBackend("dsid_market_note")
executor = PIFSCommandExecutor(filesystem, json_output=True)
entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents'))
assert entity["data"]["data"][0] == {
"path": "/documents/market-note.pdf",
"summary": "Risk and compliance summary",
"line_text": "1: Federal Reserve policy affects Disney valuation.",
"entity": "Federal Reserve; Disney",
}
relation = json.loads(executor.execute('search-relation "Disney valuation" /documents'))
assert relation["data"]["data"][0] == {
"path": "/documents/market-note.pdf",
"summary": "Risk and compliance summary",
"line_text": "1: Federal Reserve policy affects Disney valuation.",
"relation": "Federal Reserve affects Disney valuation",
}
executor.json_output = False
rendered = executor.execute('search-entity "Federal Reserve" /documents')
assert "path: /documents/market-note.pdf" in rendered
assert "summary: Risk and compliance summary" in rendered
assert "entity: Federal Reserve; Disney" in rendered
assert "file_ref=" not in rendered
def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.commands import PIFSCommandError
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
filesystem.register_file(
storage_uri="file:///tmp/report.pdf",
source_path="examples/documents/report.pdf",
folder_path="/documents",
external_id="dsid_report",
title="Annual report",
content="Federal Reserve supervision and regulation annual report.",
)
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_report")
executor = PIFSCommandExecutor(filesystem, json_output=True)
with pytest.raises(PIFSCommandError, match="Quote multi-word queries"):
executor.execute("search-summary Federal Reserve /documents")
with pytest.raises(PIFSCommandError, match="quote it"):
executor.execute("search-summary Federal Reserve")
with pytest.raises(PIFSCommandError, match="does not support regex alternation"):
executor.execute('search-summary "Federal|Reserve" /documents')
def test_semantic_search_scope_filters_explicit_source_type_facets():
from pageindex.filesystem import PageIndexFileSystem
assert PageIndexFileSystem._semantic_filters_for_scope(
{"folder_path": "/source_type=google-drive"}
) == {"source_type": "google_drive"}
assert PageIndexFileSystem._semantic_filters_for_scope(
{"folder_path": "/semantic/source_type=google-drive"}
) == {"source_type": "google_drive"}
assert PageIndexFileSystem._semantic_filters_for_scope(
{"folder_path": "/documents"}
) == {}
def test_grep_source_file_requires_terms_on_same_line(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
source_dir = tmp_path / "source" / "documents"
source_dir.mkdir(parents=True)
source = source_dir / "split.json"
source.write_text(
'{\n "first": "alpha evidence lives here",\n'
' "second": "omega evidence lives there"\n}\n',
encoding="utf-8",
)
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
filesystem.register_file(
storage_uri=str(source),
source_path="documents/split.json",
folder_path="/documents",
external_id="doc_split_terms",
title="Split source terms",
content="registered artifact without the searched tokens",
)
executor = PIFSCommandExecutor(filesystem, json_output=True)
result = json.loads(executor.execute('grep -R "alpha omega" /documents'))
assert result["data"]["mode"] == "files"
assert result["data"]["data"] == []
matched = json.loads(executor.execute('grep -R "alpha evidence" /documents'))
assert matched["data"]["data"][0]["external_id"] == "doc_split_terms"
assert matched["data"]["data"][0]["line"] == 2
assert "alpha evidence" in matched["data"]["data"][0]["text"]
def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch):
from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
workspace = tmp_path / "workspace"
index_dir = workspace / "artifacts" / "projection_indexes"
summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
summary_index.reset(
dimension=3,
metadata={
"channel": "summary",
"embedding_provider": "openai",
"embedding_model": "test-embedding",
"embedding_dimensions": 3,
},
)
summary_index.upsert_many(
[
SemanticIndexRecord(
file_ref="file_a",
external_id="doc_a",
source_type="documents",
source_path="documents/a.pdf",
title="A",
text="summary",
vector=[1.0, 0.0, 0.0],
)
]
)
filesystem = PageIndexFileSystem(workspace)
calls = []
def fake_configure(index_dir_arg, **kwargs):
calls.append((index_dir_arg, kwargs))
filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
return filesystem.semantic_retrieval_backend
monkeypatch.setattr(
filesystem,
"configure_hybrid_projection_retrieval",
fake_configure,
)
assert filesystem.configure_existing_projection_retrieval() is True
assert calls == [
(
filesystem.summary_projection_index_dir,
{
"embedding_provider": "openai",
"embedding_model": "test-embedding",
"embedding_dimensions": 3,
"embedding_timeout": 60,
},
)
]
assert filesystem.semantic_retrieval_channels() == ("summary",)
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
class FixedEmbedder:
def embed(self, texts):
return [[1.0, 0.0, 0.0] for _ in texts]
class SummaryGenerator:
def generate(self, document, *, fields):
return MetadataGenerationResult(
values={"summary": "vendor renewal risk matrix"}
)
source = tmp_path / "source.txt"
source.write_text("ordinary fixture body", encoding="utf-8")
index_dir = tmp_path / "workspace" / "artifacts" / "projection_indexes"
indexer = SummaryProjectionIndexer(
index_dir,
embedder=FixedEmbedder(),
embedding_provider="test",
embedding_model="fake",
embedding_dimensions=3,
)
backend = HybridProjectionSearchBackend(
index_dir,
embedder=FixedEmbedder(),
embedding_provider="test",
embedding_model="fake",
embedding_dimensions=3,
)
filesystem = PageIndexFileSystem(
workspace=tmp_path / "workspace",
metadata_generator=SummaryGenerator(),
summary_projection_indexer=indexer,
semantic_retrieval_backend=backend,
)
filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/source.txt",
folder_path="/documents",
external_id="doc_summary_only",
title="Operations note",
content=source.read_text(encoding="utf-8"),
metadata={"department": "ops"},
metadata_policy={
"fields": {
"summary": True,
"doc_type": False,
"domain": False,
"topic": False,
}
},
)
assert filesystem.search("purchase order exposure", semantic=False) == []
results = filesystem.search("purchase order exposure", semantic=True)
assert [result.external_id for result in results] == ["doc_summary_only"]
assert results[0].snippet == "summary_vector rank=1"