mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-24 20:28:12 +02:00
fix(filesystem): avoid eager vector imports for embedding defaults
This commit is contained in:
parent
58409d1ec5
commit
decfe29fe4
6 changed files with 5 additions and 65 deletions
|
|
@ -42,7 +42,7 @@ os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "true")
|
||||||
from pageindex import PageIndexClient
|
from pageindex import PageIndexClient
|
||||||
from pageindex.filesystem import MetadataGenerator, PageIndexFileSystem, PIFSCommandExecutor
|
from pageindex.filesystem import MetadataGenerator, PageIndexFileSystem, PIFSCommandExecutor
|
||||||
from pageindex.filesystem.agent import run_pifs_agent
|
from pageindex.filesystem.agent import run_pifs_agent
|
||||||
from pageindex.filesystem.hybrid_projection import DEFAULT_EMBEDDING_DIMENSIONS
|
from pageindex.filesystem.embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
|
||||||
|
|
||||||
|
|
||||||
EXAMPLES_DIR = Path(__file__).parent
|
EXAMPLES_DIR = Path(__file__).parent
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@ from .metadata_generation import (
|
||||||
MetadataGenerationResult,
|
MetadataGenerationResult,
|
||||||
MetadataGenerator,
|
MetadataGenerator,
|
||||||
)
|
)
|
||||||
|
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
|
||||||
from .semantic_folder_policy import (
|
from .semantic_folder_policy import (
|
||||||
SEMANTIC_FOLDER_BASE_FIELDS,
|
SEMANTIC_FOLDER_BASE_FIELDS,
|
||||||
SEMANTIC_FOLDER_ROOT,
|
SEMANTIC_FOLDER_ROOT,
|
||||||
|
|
@ -22,7 +23,6 @@ from .semantic_folder_policy import (
|
||||||
is_semantic_folder_forbidden_field,
|
is_semantic_folder_forbidden_field,
|
||||||
semantic_folder_allowed_extension_fields,
|
semantic_folder_allowed_extension_fields,
|
||||||
)
|
)
|
||||||
from .hybrid_projection import DEFAULT_EMBEDDING_DIMENSIONS
|
|
||||||
from .store import (
|
from .store import (
|
||||||
SQLiteFileSystemStore,
|
SQLiteFileSystemStore,
|
||||||
fingerprint,
|
fingerprint,
|
||||||
|
|
|
||||||
1
pageindex/filesystem/embedding_defaults.py
Normal file
1
pageindex/filesystem/embedding_defaults.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
DEFAULT_EMBEDDING_DIMENSIONS = 1024
|
||||||
|
|
@ -10,6 +10,7 @@ from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
|
||||||
from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult
|
from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -28,7 +29,6 @@ HYBRID_ENTITY_RELATION_WEIGHTS = {
|
||||||
"relation": 0.30,
|
"relation": 0.30,
|
||||||
"constraint": 0.20,
|
"constraint": 0.20,
|
||||||
}
|
}
|
||||||
DEFAULT_EMBEDDING_DIMENSIONS = 1024
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,8 @@ from __future__ import annotations
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
|
||||||
from .hybrid_projection import (
|
from .hybrid_projection import (
|
||||||
DEFAULT_EMBEDDING_DIMENSIONS,
|
|
||||||
EmbeddingCache,
|
EmbeddingCache,
|
||||||
INDEX_BY_CHANNEL,
|
INDEX_BY_CHANNEL,
|
||||||
embedding_cache_model_key,
|
embedding_cache_model_key,
|
||||||
|
|
|
||||||
|
|
@ -844,67 +844,6 @@ def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path
|
||||||
assert filesystem.semantic_retrieval_channels() == ("summary",)
|
assert filesystem.semantic_retrieval_channels() == ("summary",)
|
||||||
|
|
||||||
|
|
||||||
def test_existing_256_summary_projection_index_uses_metadata_dimension_with_new_default(
|
|
||||||
tmp_path, monkeypatch
|
|
||||||
):
|
|
||||||
from pageindex.filesystem import PageIndexFileSystem
|
|
||||||
from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
|
|
||||||
|
|
||||||
workspace = tmp_path / "workspace"
|
|
||||||
index_dir = workspace / "artifacts" / "projection_indexes"
|
|
||||||
summary_index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
|
||||||
summary_index.reset(
|
|
||||||
dimension=256,
|
|
||||||
metadata={
|
|
||||||
"channel": "summary",
|
|
||||||
"embedding_provider": "openai",
|
|
||||||
"embedding_model": "text-embedding-3-small",
|
|
||||||
"embedding_dimensions": 256,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
summary_index.upsert_many(
|
|
||||||
[
|
|
||||||
SemanticIndexRecord(
|
|
||||||
file_ref="file_a",
|
|
||||||
external_id="doc_a",
|
|
||||||
source_type="documents",
|
|
||||||
source_path="documents/a.pdf",
|
|
||||||
title="A",
|
|
||||||
text="summary",
|
|
||||||
vector=[1.0, *([0.0] * 255)],
|
|
||||||
)
|
|
||||||
]
|
|
||||||
)
|
|
||||||
filesystem = PageIndexFileSystem(workspace)
|
|
||||||
calls = []
|
|
||||||
|
|
||||||
def fake_configure(index_dir_arg, **kwargs):
|
|
||||||
calls.append((index_dir_arg, kwargs))
|
|
||||||
filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
|
|
||||||
return filesystem.semantic_retrieval_backend
|
|
||||||
|
|
||||||
monkeypatch.setattr(
|
|
||||||
filesystem,
|
|
||||||
"configure_hybrid_projection_retrieval",
|
|
||||||
fake_configure,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert filesystem.summary_projection_embedding_dimensions == 1024
|
|
||||||
assert filesystem.configure_existing_projection_retrieval() is True
|
|
||||||
assert calls == [
|
|
||||||
(
|
|
||||||
filesystem.summary_projection_index_dir,
|
|
||||||
{
|
|
||||||
"embedding_provider": "openai",
|
|
||||||
"embedding_model": "text-embedding-3-small",
|
|
||||||
"embedding_dimensions": 256,
|
|
||||||
"embedding_timeout": 60,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
]
|
|
||||||
assert summary_index.info()["dimension"] == 256
|
|
||||||
|
|
||||||
|
|
||||||
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
|
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
|
||||||
from pageindex.filesystem import PageIndexFileSystem
|
from pageindex.filesystem import PageIndexFileSystem
|
||||||
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
|
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue