mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
89 lines
2.7 KiB
Python
89 lines
2.7 KiB
Python
import sys
|
|
from pathlib import Path
|
|
|
|
# Directory one level above the folder containing this file. It is pushed to
# the front of sys.path (only if absent) before the `pageindex` import below,
# presumably so the package resolves from a source checkout without being
# installed -- confirm against the repository layout.
REPO_ROOT = Path(__file__).resolve().parents[1]
_repo_root_entry = str(REPO_ROOT)
if _repo_root_entry not in sys.path:
    sys.path.insert(0, _repo_root_entry)
|
|
|
|
from pageindex.filesystem.semantic_index import (
|
|
SemanticIndexRecord,
|
|
SQLiteVecSemanticIndex,
|
|
)
|
|
|
|
|
|
def test_sqlite_vec_semantic_index_round_trip(tmp_path):
    """Store two records in a fresh index, then read them back via search.

    The index is reset to 3 dimensions, two records with orthogonal unit
    vectors are upserted, and the test then checks:

    * ``info()`` reports a document count of 2,
    * an unfiltered search returns ``doc_a`` before ``doc_b``,
    * a search filtered on ``source_type == "slack"`` returns only ``doc_b``.
    """
    database_path = tmp_path / "semantic.sqlite"
    index = SQLiteVecSemanticIndex(database_path)
    index.reset(dimension=3, metadata={"field_mode": "summary"})

    uploads_record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="github",
        source_path="github/a.json",
        title="Multipart upload limits",
        text="multipart upload limits",
        vector=[1.0, 0.0, 0.0],
        metadata={"topic": "uploads"},
    )
    runtime_record = SemanticIndexRecord(
        file_ref="file_b",
        external_id="doc_b",
        source_type="slack",
        source_path="slack/b.json",
        title="GPU cache issue",
        text="gpu cache issue",
        vector=[0.0, 1.0, 0.0],
        metadata={"topic": "runtime"},
    )
    index.upsert_many([uploads_record, runtime_record])

    index_info = index.info()
    assert index_info["document_count"] == 2

    # Kept as a tuple and copied per call so each search receives its own
    # fresh list, exactly as if the literal were written out twice.
    query_vector = (0.9, 0.1, 0.0)

    unfiltered_hits = index.search(list(query_vector), limit=2)
    unfiltered_ids = [hit.external_id for hit in unfiltered_hits]
    assert unfiltered_ids == ["doc_a", "doc_b"]

    slack_only_hits = index.search(
        list(query_vector),
        limit=2,
        filters={"source_type": "slack"},
    )
    slack_only_ids = [hit.external_id for hit in slack_only_hits]
    assert slack_only_ids == ["doc_b"]
|
|
|
|
|
|
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
    """Upsert one summary through the projection indexer and search it back.

    A fake embedder that maps every text to ``[1.0, 0.0, 0.0]`` is injected,
    so no real embedding backend is involved. After ``upsert_summary`` the
    test expects a ``"ready"`` status and that the top search hit is
    ``doc_a`` carrying both the ``summary`` and ``department`` metadata
    values that were supplied in the payload.
    """
    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer

    fixed_vector = (1.0, 0.0, 0.0)

    class FakeEmbedder:
        """Embedder stub returning the same 3-d vector for each input text."""

        def embed(self, texts):
            # One freshly built list per text, so no two results are aliased.
            return [list(fixed_vector) for _ in texts]

    indexer = SummaryProjectionIndexer(
        tmp_path / "projection",
        embedder=FakeEmbedder(),
        embedding_provider="test",
        embedding_model="fake",
        embedding_dimensions=3,
    )

    summary_payload = {
        "file_ref": "file_a",
        "external_id": "doc_a",
        "source_type": "documents",
        "source_path": "docs/a.pdf",
        "title": "A",
        "metadata": {
            "summary": "Unified metadata summary.",
            "department": "ops",
        },
    }
    result = indexer.upsert_summary(summary_payload)
    assert result["status"] == "ready"

    hits = indexer.index.search(list(fixed_vector), limit=1)
    top_hit = hits[0]
    assert top_hit.external_id == "doc_a"
    assert top_hit.metadata["summary"] == "Unified metadata summary."
    assert top_hit.metadata["department"] == "ops"
|