mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
fix(pifs): avoid vector imports for empty workspaces
This commit is contained in:
parent
20b2225444
commit
8f87cee6ce
2 changed files with 70 additions and 15 deletions
|
|
@ -77,6 +77,11 @@ PROJECTION_INDEX_STATUSES = {
|
|||
}
|
||||
|
||||
SEMANTIC_RETRIEVAL_CHANNELS = ("summary", "entity", "relation")
|
||||
SEMANTIC_PROJECTION_INDEX_NAMES = {
|
||||
"summary": "summary_only_vector",
|
||||
"entity": "entity_vectors",
|
||||
"relation": "relation_vectors",
|
||||
}
|
||||
PAGEINDEX_DOCUMENT_SUFFIXES = {".pdf", ".md", ".markdown"}
|
||||
PAGEINDEX_DOCUMENT_CONTENT_TYPES = {
|
||||
"application/pdf",
|
||||
|
|
@ -277,16 +282,15 @@ class PageIndexFileSystem:
|
|||
return bool(self.semantic_retrieval_channels())
|
||||
|
||||
def _existing_projection_index_config(self) -> dict[str, Any] | None:
|
||||
from .hybrid_projection import INDEX_BY_CHANNEL
|
||||
from .semantic_index import SQLiteVecSemanticIndex
|
||||
|
||||
for channel in SEMANTIC_RETRIEVAL_CHANNELS:
|
||||
index_name = INDEX_BY_CHANNEL.get(channel)
|
||||
index_name = SEMANTIC_PROJECTION_INDEX_NAMES.get(channel)
|
||||
if not index_name:
|
||||
continue
|
||||
index_path = self.summary_projection_index_dir / f"{index_name}.sqlite"
|
||||
if not index_path.exists():
|
||||
continue
|
||||
from .semantic_index import SQLiteVecSemanticIndex
|
||||
|
||||
try:
|
||||
info = SQLiteVecSemanticIndex(index_path).info()
|
||||
except Exception:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,10 @@
|
|||
import builtins
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
class FakeFileSystem:
|
||||
def __init__(self, workspace):
|
||||
|
|
@ -25,22 +29,69 @@ def test_cli_workspace_configures_existing_projection_retrieval(monkeypatch, tmp
|
|||
assert filesystem.projection_retrieval_configured is True
|
||||
|
||||
|
||||
def test_cli_workspace_surfaces_projection_dimension_mismatch(monkeypatch, tmp_path):
|
||||
import pytest
|
||||
|
||||
def test_cli_workspace_without_projection_index_does_not_require_sqlite_vec(
|
||||
monkeypatch, tmp_path
|
||||
):
|
||||
from pageindex.filesystem import cli
|
||||
|
||||
class MismatchedFileSystem:
|
||||
def __init__(self, workspace):
|
||||
self.workspace = Path(workspace)
|
||||
workspace = tmp_path / "workspace"
|
||||
real_import = builtins.__import__
|
||||
|
||||
def configure_existing_projection_retrieval(self):
|
||||
raise RuntimeError("summary projection index dimension mismatch: rebuild")
|
||||
monkeypatch.delitem(sys.modules, "pageindex.filesystem.hybrid_projection", raising=False)
|
||||
monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_index", raising=False)
|
||||
monkeypatch.delitem(sys.modules, "sqlite_vec", raising=False)
|
||||
|
||||
monkeypatch.setattr(cli, "PageIndexFileSystem", MismatchedFileSystem)
|
||||
def block_sqlite_vec(name, globals=None, locals=None, fromlist=(), level=0):
|
||||
if name.split(".", 1)[0] == "sqlite_vec":
|
||||
raise ModuleNotFoundError("No module named 'sqlite_vec'", name="sqlite_vec")
|
||||
return real_import(name, globals, locals, fromlist, level)
|
||||
|
||||
with pytest.raises(RuntimeError, match="dimension mismatch"):
|
||||
cli._filesystem_from_workspace(str(tmp_path / "workspace"))
|
||||
monkeypatch.setattr(builtins, "__import__", block_sqlite_vec)
|
||||
|
||||
filesystem = cli._filesystem_from_workspace(str(workspace))
|
||||
|
||||
assert filesystem.workspace == workspace
|
||||
assert filesystem.semantic_retrieval_channels() == ()
|
||||
|
||||
|
||||
def test_cli_workspace_surfaces_projection_dimension_mismatch(tmp_path):
|
||||
from pageindex.filesystem import cli
|
||||
from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
|
||||
|
||||
workspace = tmp_path / "workspace"
|
||||
index_dir = workspace / "artifacts" / "projection_indexes"
|
||||
summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||
summary_index.reset(
|
||||
dimension=3,
|
||||
metadata={
|
||||
"channel": "summary",
|
||||
"embedding_provider": "test",
|
||||
"embedding_model": "fake",
|
||||
"embedding_dimensions": 3,
|
||||
},
|
||||
)
|
||||
summary_index.upsert_many(
|
||||
[
|
||||
SemanticIndexRecord(
|
||||
file_ref="file_a",
|
||||
external_id="doc_a",
|
||||
source_type="documents",
|
||||
source_path="documents/a.pdf",
|
||||
title="A",
|
||||
text="summary",
|
||||
vector=[1.0, 0.0, 0.0],
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
with pytest.raises(
|
||||
RuntimeError,
|
||||
match=(
|
||||
"summary projection index dimension mismatch: .*"
|
||||
"dimension 3.*summary_projection_embedding_dimensions is 1024.*Rebuild"
|
||||
),
|
||||
):
|
||||
cli._filesystem_from_workspace(str(workspace))
|
||||
|
||||
|
||||
def test_cli_passthrough_invokes_pifs_command_executor(monkeypatch, capsys, tmp_path):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue