import sys from pathlib import Path import pytest REPO_ROOT = Path(__file__).resolve().parents[1] if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) from pageindex.filesystem.semantic_index import ( SemanticIndexRecord, SQLiteVecSemanticIndex, ) def test_sqlite_vec_semantic_index_round_trip(tmp_path): index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite") index.reset(dimension=3, metadata={"field_mode": "summary"}) index.upsert_many( [ SemanticIndexRecord( file_ref="file_a", external_id="doc_a", source_type="github", source_path="github/a.json", title="Multipart upload limits", text="multipart upload limits", vector=[1.0, 0.0, 0.0], metadata={"topic": "uploads"}, ), SemanticIndexRecord( file_ref="file_b", external_id="doc_b", source_type="slack", source_path="slack/b.json", title="GPU cache issue", text="gpu cache issue", vector=[0.0, 1.0, 0.0], metadata={"topic": "runtime"}, ), ] ) assert index.info()["document_count"] == 2 results = index.search([0.9, 0.1, 0.0], limit=2) assert [item.external_id for item in results] == ["doc_a", "doc_b"] filtered = index.search( [0.9, 0.1, 0.0], limit=2, filters={"source_type": "slack"}, ) assert [item.external_id for item in filtered] == ["doc_b"] def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tmp_path): index = SQLiteVecSemanticIndex(tmp_path / "semantic.sqlite") index.reset(dimension=2, metadata={"field_mode": "summary"}) records = [ SemanticIndexRecord( file_ref=f"file_off_{item:02d}", external_id=f"doc_off_{item:02d}", source_type="documents", source_path=f"other/{item:02d}.pdf", title=f"Off scope {item:02d}", text="off scope", vector=[1.0, 0.0], ) for item in range(30) ] records.append( SemanticIndexRecord( file_ref="file_in_scope", external_id="doc_in_scope", source_type="documents", source_path="documents/in-scope.pdf", title="In scope", text="in scope", vector=[0.0, 1.0], ) ) index.upsert_many(records) results = index.search( [1.0, 0.0], limit=1, filters={"file_ref": ["file_in_scope"]}, ) assert [item.file_ref for item in results] == ["file_in_scope"] def test_summary_projection_indexes_unified_metadata_summary(tmp_path): from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer class FakeEmbedder: def embed(self, texts): return [[1.0, 0.0, 0.0] for _ in texts] indexer = SummaryProjectionIndexer( tmp_path / "projection", embedder=FakeEmbedder(), embedding_provider="test", embedding_model="fake", embedding_dimensions=3, ) result = indexer.upsert_summary( { "file_ref": "file_a", "external_id": "doc_a", "source_type": "documents", "source_path": "docs/a.pdf", "title": "A", "metadata": { "summary": "Unified metadata summary.", "department": "ops", }, } ) assert result["status"] == "ready" hits = indexer.index.search([1.0, 0.0, 0.0], limit=1) assert hits[0].external_id == "doc_a" assert hits[0].metadata["summary"] == "Unified metadata summary." assert hits[0].metadata["department"] == "ops" def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path): from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer class FakeEmbedder: def embed(self, texts): return [[1.0, 0.0, 0.0, 0.0] for _ in texts] index_dir = tmp_path / "projection" index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") index.reset( dimension=3, metadata={ "channel": "summary", "embedding_provider": "test", "embedding_model": "fake", "embedding_dimensions": 3, }, ) index.upsert_many( [ SemanticIndexRecord( file_ref="file_a", external_id="doc_a", source_type="documents", source_path="docs/a.pdf", title="A", text="summary", vector=[1.0, 0.0, 0.0], ) ] ) with pytest.raises(RuntimeError, match="summary projection index dimension mismatch"): SummaryProjectionIndexer( index_dir, embedder=FakeEmbedder(), embedding_provider="test", embedding_model="fake", embedding_dimensions=4, ) preserved = SQLiteVecSemanticIndex(index.db_path) assert preserved.info()["dimension"] == 3 assert preserved.info()["document_count"] == 1 assert preserved.search([1.0, 0.0, 0.0], limit=1)[0].external_id == "doc_a" def test_hash_embedding_provider_is_not_available(): from pageindex.filesystem.hybrid_projection import make_embedder with pytest.raises(ValueError, match="unknown embedding provider: hash"): make_embedder("hash", "unused", dimensions=256, timeout=1)