mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
refactor(filesystem): consolidate semantic projection modules
This commit is contained in:
parent
34fa8f7b42
commit
e368562e03
10 changed files with 233 additions and 385 deletions
|
|
@ -338,7 +338,7 @@ def configure_summary_projection_backend(
|
|||
) -> None:
|
||||
if not (filesystem.summary_projection_index_dir / "summary_only_vector.sqlite").exists():
|
||||
return
|
||||
filesystem.configure_hybrid_projection_retrieval(
|
||||
filesystem.configure_semantic_projection_retrieval(
|
||||
filesystem.summary_projection_index_dir,
|
||||
embedding_provider=embedding_provider,
|
||||
embedding_model=embedding_model,
|
||||
|
|
|
|||
|
|
@ -13,8 +13,8 @@ from .metadata_generation import (
|
|||
from .types import OpenResult, SearchResult
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .hybrid_projection import HybridProjectionSearchBackend
|
||||
from .projection_indexing import SummaryProjectionIndexer
|
||||
from .semantic_projection import SemanticProjectionSearchBackend
|
||||
from .semantic_projection import SummaryProjectionIndexer
|
||||
from .semantic_index import (
|
||||
RebuildableSemanticIndex,
|
||||
SemanticIndexRecord,
|
||||
|
|
@ -23,17 +23,17 @@ if TYPE_CHECKING:
|
|||
)
|
||||
|
||||
_LAZY_EXPORTS = {
|
||||
"HybridProjectionSearchBackend": (".hybrid_projection", "HybridProjectionSearchBackend"),
|
||||
"SemanticProjectionSearchBackend": (".semantic_projection", "SemanticProjectionSearchBackend"),
|
||||
"RebuildableSemanticIndex": (".semantic_index", "RebuildableSemanticIndex"),
|
||||
"SemanticIndexRecord": (".semantic_index", "SemanticIndexRecord"),
|
||||
"SemanticSearchResult": (".semantic_index", "SemanticSearchResult"),
|
||||
"SQLiteVecSemanticIndex": (".semantic_index", "SQLiteVecSemanticIndex"),
|
||||
"SummaryProjectionIndexer": (".projection_indexing", "SummaryProjectionIndexer"),
|
||||
"SummaryProjectionIndexer": (".semantic_projection", "SummaryProjectionIndexer"),
|
||||
}
|
||||
|
||||
__all__ = [
|
||||
"OpenResult",
|
||||
"HybridProjectionSearchBackend",
|
||||
"SemanticProjectionSearchBackend",
|
||||
"MetadataGenerationBackend",
|
||||
"MetadataGenerationError",
|
||||
"MetadataGenerationInput",
|
||||
|
|
|
|||
|
|
@ -23,17 +23,11 @@ from .store import (
|
|||
metadata_text,
|
||||
normalize_path,
|
||||
)
|
||||
from .structural_read import (
|
||||
flatten_pageindex_structure_nodes,
|
||||
first_node_location,
|
||||
find_pageindex_node,
|
||||
strip_pageindex_text_fields,
|
||||
)
|
||||
from .types import OpenResult, SearchResult
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..client import PageIndexClient
|
||||
from .projection_indexing import SummaryProjectionIndexer
|
||||
from .semantic_projection import SummaryProjectionIndexer
|
||||
|
||||
DEFAULT_METADATA_GENERATION_FIELDS = {
|
||||
"summary": True,
|
||||
|
|
@ -94,6 +88,18 @@ ADD_FILE_CONTENT_TYPES = {
|
|||
}
|
||||
|
||||
|
||||
def strip_pageindex_text_fields(value: Any) -> Any:
|
||||
if isinstance(value, list):
|
||||
return [strip_pageindex_text_fields(item) for item in value]
|
||||
if isinstance(value, dict):
|
||||
return {
|
||||
key: strip_pageindex_text_fields(item)
|
||||
for key, item in value.items()
|
||||
if key != "text"
|
||||
}
|
||||
return value
|
||||
|
||||
|
||||
class PageIndexFileSystem:
|
||||
def __init__(
|
||||
self,
|
||||
|
|
@ -325,9 +331,9 @@ class PageIndexFileSystem:
|
|||
model=self.metadata_model,
|
||||
base_url=self.metadata_base_url,
|
||||
max_text_chars=self.metadata_max_text_chars,
|
||||
)
|
||||
)
|
||||
if self.summary_projection_index and self.summary_projection_indexer is None:
|
||||
from .projection_indexing import SummaryProjectionIndexer
|
||||
from .semantic_projection import SummaryProjectionIndexer
|
||||
|
||||
self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
|
||||
self.summary_projection_index_dir,
|
||||
|
|
@ -337,7 +343,7 @@ class PageIndexFileSystem:
|
|||
embedding_timeout=self.summary_projection_embedding_timeout,
|
||||
)
|
||||
if self.summary_projection_index and self.semantic_retrieval_backend is None:
|
||||
self.configure_hybrid_projection_retrieval(
|
||||
self.configure_semantic_projection_retrieval(
|
||||
self.summary_projection_index_dir,
|
||||
embedding_provider=self.summary_projection_embedding_provider,
|
||||
embedding_model=self.summary_projection_embedding_model,
|
||||
|
|
@ -352,9 +358,9 @@ class PageIndexFileSystem:
|
|||
model=self.metadata_model,
|
||||
base_url=self.metadata_base_url,
|
||||
max_text_chars=self.metadata_max_text_chars,
|
||||
)
|
||||
)
|
||||
if self.summary_projection_index and self.summary_projection_indexer is None:
|
||||
from .projection_indexing import SummaryProjectionIndexer
|
||||
from .semantic_projection import SummaryProjectionIndexer
|
||||
|
||||
self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
|
||||
self.summary_projection_index_dir,
|
||||
|
|
@ -368,12 +374,12 @@ class PageIndexFileSystem:
|
|||
indexer = self.summary_projection_indexer
|
||||
if indexer is None:
|
||||
raise RuntimeError("pifs add requires a summary projection indexer")
|
||||
from .hybrid_projection import HybridProjectionSearchBackend
|
||||
from .semantic_projection import SemanticProjectionSearchBackend
|
||||
|
||||
index_dir = Path(getattr(indexer, "index_dir", self.summary_projection_index_dir))
|
||||
embedder = getattr(indexer, "embedder", None)
|
||||
if embedder is None:
|
||||
self.configure_hybrid_projection_retrieval(
|
||||
self.configure_semantic_projection_retrieval(
|
||||
index_dir,
|
||||
embedding_provider=str(
|
||||
getattr(
|
||||
|
|
@ -396,7 +402,7 @@ class PageIndexFileSystem:
|
|||
)
|
||||
else:
|
||||
embedding_cache = getattr(indexer, "embedding_cache", None)
|
||||
self.semantic_retrieval_backend = HybridProjectionSearchBackend(
|
||||
self.semantic_retrieval_backend = SemanticProjectionSearchBackend(
|
||||
index_dir,
|
||||
embedder=embedder,
|
||||
embedding_provider=str(
|
||||
|
|
@ -458,7 +464,7 @@ class PageIndexFileSystem:
|
|||
f"{self.summary_projection_embedding_dimensions}. Rebuild the "
|
||||
"projection index or use a matching embedding configuration."
|
||||
)
|
||||
self.configure_hybrid_projection_retrieval(
|
||||
self.configure_semantic_projection_retrieval(
|
||||
self.summary_projection_index_dir,
|
||||
embedding_provider=self.summary_projection_embedding_provider,
|
||||
embedding_model=self.summary_projection_embedding_model,
|
||||
|
|
@ -731,7 +737,7 @@ class PageIndexFileSystem:
|
|||
)
|
||||
return results
|
||||
|
||||
def configure_hybrid_projection_retrieval(
|
||||
def configure_semantic_projection_retrieval(
|
||||
self,
|
||||
index_dir: Union[str, Path],
|
||||
*,
|
||||
|
|
@ -741,9 +747,9 @@ class PageIndexFileSystem:
|
|||
embedding_timeout: float = 60,
|
||||
fetch_multiplier: int = 100,
|
||||
) -> Any:
|
||||
from .hybrid_projection import HybridProjectionSearchBackend
|
||||
from .semantic_projection import SemanticProjectionSearchBackend
|
||||
|
||||
self.semantic_retrieval_backend = HybridProjectionSearchBackend.from_provider(
|
||||
self.semantic_retrieval_backend = SemanticProjectionSearchBackend.from_provider(
|
||||
index_dir,
|
||||
embedding_provider=embedding_provider,
|
||||
embedding_model=embedding_model,
|
||||
|
|
@ -795,7 +801,7 @@ class PageIndexFileSystem:
|
|||
if self._file_format(entry) in {"pdf", "markdown", "pageindex"}:
|
||||
raise ValueError(
|
||||
"open() text artifact reads are not supported for PDF/Markdown PageIndex files; "
|
||||
"use pageindex_structure(), pageindex_pages(), or pageindex_node()."
|
||||
"use pageindex_structure() or pageindex_pages()."
|
||||
)
|
||||
if str(location).strip().lower() in {"all", "full", "*"}:
|
||||
return self._open_all(file_ref)
|
||||
|
|
@ -814,9 +820,6 @@ class PageIndexFileSystem:
|
|||
def pageindex_structure(
|
||||
self,
|
||||
target: str,
|
||||
*,
|
||||
offset: int = 0,
|
||||
limit: int = 25,
|
||||
) -> dict[str, Any]:
|
||||
file_ref = self._resolve_target(target)
|
||||
entry = self.store.get_file(file_ref)
|
||||
|
|
@ -838,12 +841,6 @@ class PageIndexFileSystem:
|
|||
entry,
|
||||
message=str(structure["error"]),
|
||||
)
|
||||
node_rows = flatten_pageindex_structure_nodes(structure)
|
||||
offset = max(0, offset)
|
||||
limit = max(0, limit)
|
||||
window = node_rows[offset : offset + limit] if limit else []
|
||||
next_offset = offset + len(window)
|
||||
has_more = next_offset < len(node_rows)
|
||||
return {
|
||||
"mode": "structure",
|
||||
"file_ref": file_ref,
|
||||
|
|
@ -852,67 +849,7 @@ class PageIndexFileSystem:
|
|||
"status": entry.pageindex_tree_status,
|
||||
"available": True,
|
||||
"pageindex_doc_id": doc_id,
|
||||
"structure": window,
|
||||
"structure_pagination": {
|
||||
"offset": offset,
|
||||
"limit": limit,
|
||||
"returned_nodes": len(window),
|
||||
"total_nodes": len(node_rows),
|
||||
"has_more": has_more,
|
||||
"next_offset": next_offset if has_more else None,
|
||||
},
|
||||
}
|
||||
|
||||
def pageindex_node(self, target: str, node_id: str) -> dict[str, Any]:
|
||||
file_ref = self._resolve_target(target)
|
||||
entry = self.store.get_file(file_ref)
|
||||
self._require_pageindex_document_file(entry, "cat --node")
|
||||
client, doc_id = self._pageindex_client_doc_for_entry(entry)
|
||||
if doc_id is None:
|
||||
return self._structural_unavailable(
|
||||
"node",
|
||||
entry,
|
||||
node_id=node_id,
|
||||
message=(
|
||||
"PageIndex structure is not cached for this file in the "
|
||||
"PageIndexClient workspace."
|
||||
),
|
||||
)
|
||||
client._ensure_doc_loaded(doc_id)
|
||||
doc = client.documents.get(doc_id, {})
|
||||
node = find_pageindex_node(doc.get("structure", []), node_id)
|
||||
if node is None:
|
||||
return self._structural_unavailable(
|
||||
"node",
|
||||
entry,
|
||||
node_id=node_id,
|
||||
message="PageIndex node was not found in the cached structure.",
|
||||
)
|
||||
text = str(node.get("text") or "")
|
||||
if not text:
|
||||
location = first_node_location(node)
|
||||
if location:
|
||||
content = self._client_json(client.get_page_content(doc_id, location))
|
||||
if isinstance(content, list):
|
||||
text = "\n\n".join(str(page.get("content") or "") for page in content)
|
||||
if not text:
|
||||
return self._structural_unavailable(
|
||||
"node",
|
||||
entry,
|
||||
node_id=node_id,
|
||||
message="Cached PageIndex node has no text content.",
|
||||
)
|
||||
return {
|
||||
"mode": "node",
|
||||
"file_ref": file_ref,
|
||||
"external_id": entry.external_id,
|
||||
"source_path": entry.source_path,
|
||||
"status": entry.pageindex_tree_status,
|
||||
"available": True,
|
||||
"pageindex_doc_id": doc_id,
|
||||
"node_id": node_id,
|
||||
"node": strip_pageindex_text_fields(node),
|
||||
"text": text,
|
||||
"structure": strip_pageindex_text_fields(structure),
|
||||
}
|
||||
|
||||
def pageindex_pages(self, target: str, pages: str) -> dict[str, Any]:
|
||||
|
|
@ -970,8 +907,7 @@ class PageIndexFileSystem:
|
|||
f"{command} is only supported for txt/text files; "
|
||||
f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
|
||||
"Use cat <path|file_ref|document_id> --structure, "
|
||||
"cat <path|file_ref|document_id> --page, or "
|
||||
"cat <path|file_ref|document_id> --node for PDF/Markdown PageIndex files."
|
||||
"or cat <path|file_ref|document_id> --page for PDF/Markdown PageIndex files."
|
||||
)
|
||||
|
||||
def _require_pageindex_document_file(self, entry: Any, command: str) -> None:
|
||||
|
|
@ -1748,7 +1684,6 @@ class PageIndexFileSystem:
|
|||
entry: Any,
|
||||
*,
|
||||
message: str,
|
||||
node_id: str | None = None,
|
||||
pages: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
pageindex_tree_error = cls._pageindex_tree_failure_message(entry.metadata_status)
|
||||
|
|
@ -1765,8 +1700,6 @@ class PageIndexFileSystem:
|
|||
}
|
||||
if pageindex_tree_error:
|
||||
result["pageindex_tree_error"] = pageindex_tree_error
|
||||
if node_id is not None:
|
||||
result["node_id"] = node_id
|
||||
if pages is not None:
|
||||
result["pages"] = pages
|
||||
return result
|
||||
|
|
|
|||
|
|
@ -1,179 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .core import DEFAULT_EMBEDDING_DIMENSIONS
|
||||
from .hybrid_projection import (
|
||||
EmbeddingCache,
|
||||
INDEX_BY_CHANNEL,
|
||||
embedding_cache_model_key,
|
||||
make_embedder,
|
||||
)
|
||||
from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexRecord
|
||||
|
||||
|
||||
class SummaryProjectionIndexer:
|
||||
"""Synchronous register-time summary projection indexer."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
index_dir: str | Path,
|
||||
*,
|
||||
embedder: Any,
|
||||
embedding_provider: str,
|
||||
embedding_model: str,
|
||||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_cache_path: str | Path | None = None,
|
||||
) -> None:
|
||||
self.index_dir = Path(index_dir).expanduser()
|
||||
self.index_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.embedder = embedder
|
||||
self.embedding_provider = embedding_provider
|
||||
self.embedding_model = embedding_model
|
||||
self.embedding_dimensions = embedding_dimensions
|
||||
self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)
|
||||
self.embedding_cache = EmbeddingCache(
|
||||
Path(embedding_cache_path).expanduser()
|
||||
if embedding_cache_path is not None
|
||||
else self.index_dir / "embedding_cache.sqlite"
|
||||
)
|
||||
self.index = SQLiteVecSemanticIndex(
|
||||
self.index_dir / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
|
||||
)
|
||||
self._ensure_index()
|
||||
|
||||
@classmethod
|
||||
def from_provider(
|
||||
cls,
|
||||
index_dir: str | Path,
|
||||
*,
|
||||
embedding_provider: str = "openai",
|
||||
embedding_model: str = "text-embedding-3-small",
|
||||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_timeout: float = 60,
|
||||
**kwargs: Any,
|
||||
) -> "SummaryProjectionIndexer":
|
||||
cls._validate_existing_index_dimension(index_dir, embedding_dimensions)
|
||||
return cls(
|
||||
index_dir,
|
||||
embedder=make_embedder(
|
||||
embedding_provider,
|
||||
embedding_model,
|
||||
dimensions=embedding_dimensions,
|
||||
timeout=embedding_timeout,
|
||||
),
|
||||
embedding_provider=embedding_provider,
|
||||
embedding_model=embedding_model,
|
||||
embedding_dimensions=embedding_dimensions,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]:
|
||||
summary = str((record.get("metadata") or {}).get("summary") or "").strip()
|
||||
if not summary:
|
||||
return {"status": "skipped", "reason": "missing_summary"}
|
||||
vector = self.embedding_cache.embed_texts(
|
||||
[summary],
|
||||
provider=self.embedding_provider,
|
||||
model=self.cache_model,
|
||||
embedder=self.embedder,
|
||||
batch_size=1,
|
||||
)[0]
|
||||
metadata = dict(record.get("metadata") or {})
|
||||
count = self.index.upsert_many(
|
||||
[
|
||||
SemanticIndexRecord(
|
||||
file_ref=str(record["file_ref"]),
|
||||
vector=vector,
|
||||
text=summary,
|
||||
external_id=record.get("external_id"),
|
||||
source_type=str(record.get("source_type") or ""),
|
||||
source_path=str(record.get("source_path") or ""),
|
||||
title=str(record.get("title") or ""),
|
||||
metadata=metadata,
|
||||
)
|
||||
]
|
||||
)
|
||||
return {
|
||||
"status": "ready",
|
||||
"indexed_rows": count,
|
||||
"index_path": str(self.index.db_path),
|
||||
"embedding_provider": self.embedding_provider,
|
||||
"embedding_model": self.embedding_model,
|
||||
"embedding_dimensions": self.embedding_dimensions,
|
||||
}
|
||||
|
||||
def delete_summary(self, file_ref: str) -> int:
|
||||
return self.index.delete_file_refs([file_ref])
|
||||
|
||||
def _ensure_index(self) -> None:
|
||||
if not self.index.db_path.exists():
|
||||
self.index.reset(
|
||||
dimension=self.embedding_dimensions,
|
||||
metadata=self._index_metadata(),
|
||||
)
|
||||
return
|
||||
try:
|
||||
existing_dimension = self.index.dimension()
|
||||
except Exception as exc:
|
||||
raise RuntimeError(
|
||||
"could not validate existing summary projection index config; "
|
||||
f"refusing to reset {self.index.db_path}. Move the existing index "
|
||||
"aside or rebuild it intentionally before changing embedding config."
|
||||
) from exc
|
||||
if existing_dimension != self.embedding_dimensions:
|
||||
raise self._dimension_mismatch_error(
|
||||
self.index.db_path,
|
||||
existing_dimension,
|
||||
self.embedding_dimensions,
|
||||
)
|
||||
|
||||
def _index_metadata(self) -> dict[str, Any]:
|
||||
return {
|
||||
"channel": "summary",
|
||||
"embedding_provider": self.embedding_provider,
|
||||
"embedding_model": self.embedding_model,
|
||||
"embedding_dimensions": self.embedding_dimensions,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def _validate_existing_index_dimension(
|
||||
cls,
|
||||
index_dir: str | Path,
|
||||
embedding_dimensions: int,
|
||||
) -> None:
|
||||
index_path = (
|
||||
Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
|
||||
)
|
||||
if not index_path.exists():
|
||||
return
|
||||
index = SQLiteVecSemanticIndex(index_path)
|
||||
try:
|
||||
existing_dimension = index.dimension()
|
||||
except Exception as exc:
|
||||
raise RuntimeError(
|
||||
"could not validate existing summary projection index config; "
|
||||
f"refusing to reset {index_path}. Move the existing index "
|
||||
"aside or rebuild it intentionally before changing embedding config."
|
||||
) from exc
|
||||
if existing_dimension != embedding_dimensions:
|
||||
raise cls._dimension_mismatch_error(
|
||||
index_path,
|
||||
existing_dimension,
|
||||
embedding_dimensions,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _dimension_mismatch_error(
|
||||
index_path: Path,
|
||||
existing_dimension: int,
|
||||
embedding_dimensions: int,
|
||||
) -> RuntimeError:
|
||||
return RuntimeError(
|
||||
"summary projection index dimension mismatch: "
|
||||
f"{index_path} was built with dimension {existing_dimension}, "
|
||||
f"but configured embedding_dimensions is {embedding_dimensions}. "
|
||||
"Use the matching embedding config, or rebuild the projection index "
|
||||
"at a new path after preserving the existing data."
|
||||
)
|
||||
|
|
@ -11,7 +11,12 @@ from pathlib import Path
|
|||
from typing import Any
|
||||
|
||||
from .core import DEFAULT_EMBEDDING_DIMENSIONS
|
||||
from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult
|
||||
from .semantic_index import (
|
||||
SQLiteVecSemanticIndex,
|
||||
SemanticIndexError,
|
||||
SemanticIndexRecord,
|
||||
SemanticSearchResult,
|
||||
)
|
||||
|
||||
|
||||
INDEX_BY_CHANNEL = {
|
||||
|
|
@ -29,7 +34,7 @@ class QueryProjection:
|
|||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class HybridProjectionCandidate:
|
||||
class SemanticProjectionCandidate:
|
||||
document_id: str
|
||||
score: float
|
||||
sources: list[dict[str, Any]]
|
||||
|
|
@ -40,7 +45,7 @@ class HybridProjectionCandidate:
|
|||
snippet: str
|
||||
|
||||
|
||||
class HybridProjectionSearchBackend:
|
||||
class SemanticProjectionSearchBackend:
|
||||
"""Semantic channel retrieval over rebuildable projection indexes.
|
||||
|
||||
The SQLite catalog remains the source of truth. This backend only reads
|
||||
|
|
@ -86,7 +91,7 @@ class HybridProjectionSearchBackend:
|
|||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_timeout: float = 60,
|
||||
**kwargs: Any,
|
||||
) -> "HybridProjectionSearchBackend":
|
||||
) -> "SemanticProjectionSearchBackend":
|
||||
return cls(
|
||||
index_dir,
|
||||
embedder=make_embedder(
|
||||
|
|
@ -108,7 +113,7 @@ class HybridProjectionSearchBackend:
|
|||
*,
|
||||
limit: int = 10,
|
||||
filters: dict[str, Any] | None = None,
|
||||
) -> list[HybridProjectionCandidate]:
|
||||
) -> list[SemanticProjectionCandidate]:
|
||||
if channel not in SEMANTIC_TOOL_CHANNELS:
|
||||
raise ValueError(f"unsupported semantic channel: {channel}")
|
||||
if channel not in self.available_channels():
|
||||
|
|
@ -180,6 +185,172 @@ class HybridProjectionSearchBackend:
|
|||
return {**info, "available": int(info.get("document_count") or 0) > 0}
|
||||
|
||||
|
||||
class SummaryProjectionIndexer:
|
||||
"""Synchronous register-time summary projection indexer."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
index_dir: str | Path,
|
||||
*,
|
||||
embedder: Any,
|
||||
embedding_provider: str,
|
||||
embedding_model: str,
|
||||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_cache_path: str | Path | None = None,
|
||||
) -> None:
|
||||
self.index_dir = Path(index_dir).expanduser()
|
||||
self.index_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.embedder = embedder
|
||||
self.embedding_provider = embedding_provider
|
||||
self.embedding_model = embedding_model
|
||||
self.embedding_dimensions = embedding_dimensions
|
||||
self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)
|
||||
self.embedding_cache = EmbeddingCache(
|
||||
Path(embedding_cache_path).expanduser()
|
||||
if embedding_cache_path is not None
|
||||
else self.index_dir / "embedding_cache.sqlite"
|
||||
)
|
||||
self.index = SQLiteVecSemanticIndex(
|
||||
self.index_dir / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
|
||||
)
|
||||
self._ensure_index()
|
||||
|
||||
@classmethod
|
||||
def from_provider(
|
||||
cls,
|
||||
index_dir: str | Path,
|
||||
*,
|
||||
embedding_provider: str = "openai",
|
||||
embedding_model: str = "text-embedding-3-small",
|
||||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_timeout: float = 60,
|
||||
**kwargs: Any,
|
||||
) -> "SummaryProjectionIndexer":
|
||||
cls._validate_existing_index_dimension(index_dir, embedding_dimensions)
|
||||
return cls(
|
||||
index_dir,
|
||||
embedder=make_embedder(
|
||||
embedding_provider,
|
||||
embedding_model,
|
||||
dimensions=embedding_dimensions,
|
||||
timeout=embedding_timeout,
|
||||
),
|
||||
embedding_provider=embedding_provider,
|
||||
embedding_model=embedding_model,
|
||||
embedding_dimensions=embedding_dimensions,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]:
|
||||
summary = str((record.get("metadata") or {}).get("summary") or "").strip()
|
||||
if not summary:
|
||||
return {"status": "skipped", "reason": "missing_summary"}
|
||||
vector = self.embedding_cache.embed_texts(
|
||||
[summary],
|
||||
provider=self.embedding_provider,
|
||||
model=self.cache_model,
|
||||
embedder=self.embedder,
|
||||
batch_size=1,
|
||||
)[0]
|
||||
metadata = dict(record.get("metadata") or {})
|
||||
count = self.index.upsert_many(
|
||||
[
|
||||
SemanticIndexRecord(
|
||||
file_ref=str(record["file_ref"]),
|
||||
vector=vector,
|
||||
text=summary,
|
||||
external_id=record.get("external_id"),
|
||||
source_type=str(record.get("source_type") or ""),
|
||||
source_path=str(record.get("source_path") or ""),
|
||||
title=str(record.get("title") or ""),
|
||||
metadata=metadata,
|
||||
)
|
||||
]
|
||||
)
|
||||
return {
|
||||
"status": "ready",
|
||||
"indexed_rows": count,
|
||||
"index_path": str(self.index.db_path),
|
||||
"embedding_provider": self.embedding_provider,
|
||||
"embedding_model": self.embedding_model,
|
||||
"embedding_dimensions": self.embedding_dimensions,
|
||||
}
|
||||
|
||||
def delete_summary(self, file_ref: str) -> int:
|
||||
return self.index.delete_file_refs([file_ref])
|
||||
|
||||
def _ensure_index(self) -> None:
|
||||
if not self.index.db_path.exists():
|
||||
self.index.reset(
|
||||
dimension=self.embedding_dimensions,
|
||||
metadata=self._index_metadata(),
|
||||
)
|
||||
return
|
||||
try:
|
||||
existing_dimension = self.index.dimension()
|
||||
except Exception as exc:
|
||||
raise RuntimeError(
|
||||
"could not validate existing summary projection index config; "
|
||||
f"refusing to reset {self.index.db_path}. Move the existing index "
|
||||
"aside or rebuild it intentionally before changing embedding config."
|
||||
) from exc
|
||||
if existing_dimension != self.embedding_dimensions:
|
||||
raise self._dimension_mismatch_error(
|
||||
self.index.db_path,
|
||||
existing_dimension,
|
||||
self.embedding_dimensions,
|
||||
)
|
||||
|
||||
def _index_metadata(self) -> dict[str, Any]:
|
||||
return {
|
||||
"channel": "summary",
|
||||
"embedding_provider": self.embedding_provider,
|
||||
"embedding_model": self.embedding_model,
|
||||
"embedding_dimensions": self.embedding_dimensions,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def _validate_existing_index_dimension(
|
||||
cls,
|
||||
index_dir: str | Path,
|
||||
embedding_dimensions: int,
|
||||
) -> None:
|
||||
index_path = (
|
||||
Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
|
||||
)
|
||||
if not index_path.exists():
|
||||
return
|
||||
index = SQLiteVecSemanticIndex(index_path)
|
||||
try:
|
||||
existing_dimension = index.dimension()
|
||||
except Exception as exc:
|
||||
raise RuntimeError(
|
||||
"could not validate existing summary projection index config; "
|
||||
f"refusing to reset {index_path}. Move the existing index "
|
||||
"aside or rebuild it intentionally before changing embedding config."
|
||||
) from exc
|
||||
if existing_dimension != embedding_dimensions:
|
||||
raise cls._dimension_mismatch_error(
|
||||
index_path,
|
||||
existing_dimension,
|
||||
embedding_dimensions,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _dimension_mismatch_error(
|
||||
index_path: Path,
|
||||
existing_dimension: int,
|
||||
embedding_dimensions: int,
|
||||
) -> RuntimeError:
|
||||
return RuntimeError(
|
||||
"summary projection index dimension mismatch: "
|
||||
f"{index_path} was built with dimension {existing_dimension}, "
|
||||
f"but configured embedding_dimensions is {embedding_dimensions}. "
|
||||
"Use the matching embedding config, or rebuild the projection index "
|
||||
"at a new path after preserving the existing data."
|
||||
)
|
||||
|
||||
|
||||
class EmbeddingCache:
|
||||
def __init__(self, db_path: Path):
|
||||
self.db_path = db_path
|
||||
|
|
@ -308,8 +479,8 @@ def query_text_for_channel(channel: str, query: str, projection: QueryProjection
|
|||
def rank_single_semantic_channel(
|
||||
channel: str,
|
||||
results: list[SemanticSearchResult],
|
||||
) -> list[HybridProjectionCandidate]:
|
||||
rows: list[HybridProjectionCandidate] = []
|
||||
) -> list[SemanticProjectionCandidate]:
|
||||
rows: list[SemanticProjectionCandidate] = []
|
||||
seen: set[str] = set()
|
||||
for rank, result in enumerate(results, 1):
|
||||
doc_id = str(result.external_id or result.file_ref)
|
||||
|
|
@ -317,7 +488,7 @@ def rank_single_semantic_channel(
|
|||
continue
|
||||
seen.add(doc_id)
|
||||
rows.append(
|
||||
HybridProjectionCandidate(
|
||||
SemanticProjectionCandidate(
|
||||
document_id=doc_id,
|
||||
score=1 / (60 + rank),
|
||||
sources=[{"channel": channel, "rank": rank, "distance": result.distance}],
|
||||
|
|
@ -1,77 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
from typing import Any
|
||||
|
||||
|
||||
def strip_pageindex_text_fields(value: Any) -> Any:
|
||||
if isinstance(value, list):
|
||||
return [strip_pageindex_text_fields(item) for item in value]
|
||||
if isinstance(value, dict):
|
||||
return {
|
||||
key: strip_pageindex_text_fields(item)
|
||||
for key, item in value.items()
|
||||
if key != "text"
|
||||
}
|
||||
return value
|
||||
|
||||
|
||||
def flatten_pageindex_structure_nodes(structure: Any) -> list[dict[str, Any]]:
|
||||
rows: list[dict[str, Any]] = []
|
||||
|
||||
def visit(value: Any, *, depth: int, parent_node_id: str | None) -> None:
|
||||
if isinstance(value, list):
|
||||
for item in value:
|
||||
visit(item, depth=depth, parent_node_id=parent_node_id)
|
||||
return
|
||||
if not isinstance(value, dict):
|
||||
return
|
||||
|
||||
node_id = value.get("node_id")
|
||||
child_values: list[Any] = []
|
||||
for child_key in ("nodes", "children"):
|
||||
children = value.get(child_key)
|
||||
if isinstance(children, list):
|
||||
child_values.extend(children)
|
||||
|
||||
row = {
|
||||
key: strip_pageindex_text_fields(item)
|
||||
for key, item in value.items()
|
||||
if key not in {"text", "nodes", "children"}
|
||||
}
|
||||
row["depth"] = depth
|
||||
row["children_count"] = len(child_values)
|
||||
if parent_node_id:
|
||||
row["parent_node_id"] = parent_node_id
|
||||
rows.append(row)
|
||||
|
||||
next_parent = str(node_id) if node_id is not None else parent_node_id
|
||||
for child in child_values:
|
||||
visit(child, depth=depth + 1, parent_node_id=next_parent)
|
||||
|
||||
visit(structure, depth=0, parent_node_id=None)
|
||||
return rows
|
||||
|
||||
|
||||
def find_pageindex_node(structure: Any, node_id: str) -> dict[str, Any] | None:
|
||||
if isinstance(structure, dict):
|
||||
if str(structure.get("node_id", "")) == str(node_id):
|
||||
return deepcopy(structure)
|
||||
for child_key in ("nodes", "children"):
|
||||
found = find_pageindex_node(structure.get(child_key), node_id)
|
||||
if found is not None:
|
||||
return found
|
||||
if isinstance(structure, list):
|
||||
for item in structure:
|
||||
found = find_pageindex_node(item, node_id)
|
||||
if found is not None:
|
||||
return found
|
||||
return None
|
||||
|
||||
|
||||
def first_node_location(node: dict[str, Any]) -> str | None:
|
||||
for key in ("line_num", "physical_index", "start_index"):
|
||||
value = node.get(key)
|
||||
if value is not None and value != "":
|
||||
return str(value)
|
||||
return None
|
||||
|
|
@ -7,7 +7,7 @@ import pytest
|
|||
def test_filesystem_lazy_exports_remain_public():
|
||||
import pageindex.filesystem as filesystem
|
||||
from pageindex.filesystem import (
|
||||
HybridProjectionSearchBackend,
|
||||
SemanticProjectionSearchBackend,
|
||||
RebuildableSemanticIndex,
|
||||
SemanticIndexRecord,
|
||||
SemanticSearchResult,
|
||||
|
|
@ -16,7 +16,7 @@ def test_filesystem_lazy_exports_remain_public():
|
|||
)
|
||||
|
||||
for name in (
|
||||
"HybridProjectionSearchBackend",
|
||||
"SemanticProjectionSearchBackend",
|
||||
"RebuildableSemanticIndex",
|
||||
"SemanticIndexRecord",
|
||||
"SemanticSearchResult",
|
||||
|
|
@ -26,7 +26,7 @@ def test_filesystem_lazy_exports_remain_public():
|
|||
assert name in filesystem.__all__
|
||||
assert name in dir(filesystem)
|
||||
|
||||
assert HybridProjectionSearchBackend.__name__ == "HybridProjectionSearchBackend"
|
||||
assert SemanticProjectionSearchBackend.__name__ == "SemanticProjectionSearchBackend"
|
||||
assert RebuildableSemanticIndex.__name__ == "RebuildableSemanticIndex"
|
||||
assert SemanticIndexRecord.__name__ == "SemanticIndexRecord"
|
||||
assert SemanticSearchResult.__name__ == "SemanticSearchResult"
|
||||
|
|
@ -819,7 +819,7 @@ def test_existing_summary_projection_index_uses_current_config_when_dimensions_m
|
|||
|
||||
monkeypatch.setattr(
|
||||
filesystem,
|
||||
"configure_hybrid_projection_retrieval",
|
||||
"configure_semantic_projection_retrieval",
|
||||
fake_configure,
|
||||
)
|
||||
|
||||
|
|
@ -876,7 +876,7 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
|
|||
|
||||
monkeypatch.setattr(
|
||||
filesystem,
|
||||
"configure_hybrid_projection_retrieval",
|
||||
"configure_semantic_projection_retrieval",
|
||||
fail_configure,
|
||||
)
|
||||
|
||||
|
|
@ -892,9 +892,9 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
|
|||
|
||||
def test_browse_semantic_files_uses_summary_projection_when_only_summary_available(tmp_path):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
|
||||
from pageindex.filesystem.semantic_projection import SemanticProjectionSearchBackend
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
|
||||
|
||||
class FixedEmbedder:
|
||||
def embed(self, texts):
|
||||
|
|
@ -916,7 +916,7 @@ def test_browse_semantic_files_uses_summary_projection_when_only_summary_availab
|
|||
embedding_model="fake",
|
||||
embedding_dimensions=3,
|
||||
)
|
||||
backend = HybridProjectionSearchBackend(
|
||||
backend = SemanticProjectionSearchBackend(
|
||||
index_dir,
|
||||
embedder=FixedEmbedder(),
|
||||
embedding_provider="test",
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ class StaticEmbedder:
|
|||
|
||||
|
||||
def make_summary_indexer(workspace: Path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
|
||||
|
||||
return SummaryProjectionIndexer(
|
||||
workspace / "artifacts" / "projection_indexes",
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ def test_cli_workspace_without_projection_index_does_not_require_sqlite_vec(
|
|||
workspace = tmp_path / "workspace"
|
||||
real_import = builtins.__import__
|
||||
|
||||
monkeypatch.delitem(sys.modules, "pageindex.filesystem.hybrid_projection", raising=False)
|
||||
monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_projection", raising=False)
|
||||
monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_index", raising=False)
|
||||
monkeypatch.delitem(sys.modules, "sqlite_vec", raising=False)
|
||||
|
||||
|
|
|
|||
|
|
@ -102,7 +102,7 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm
|
|||
|
||||
|
||||
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
|
||||
|
||||
indexer = SummaryProjectionIndexer(
|
||||
tmp_path / "projection",
|
||||
|
|
@ -134,7 +134,7 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
|||
|
||||
|
||||
def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
|
||||
|
||||
indexer = SummaryProjectionIndexer(
|
||||
tmp_path / "projection",
|
||||
|
|
@ -164,7 +164,7 @@ def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
|
|||
|
||||
|
||||
def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
|
||||
|
||||
indexer = SummaryProjectionIndexer(
|
||||
tmp_path / "projection",
|
||||
|
|
@ -188,7 +188,7 @@ def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
|
|||
|
||||
|
||||
def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
|
||||
|
||||
index_dir = tmp_path / "projection"
|
||||
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||
|
|
@ -216,8 +216,8 @@ def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_pa
|
|||
def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embedder(
|
||||
tmp_path, monkeypatch
|
||||
):
|
||||
from pageindex.filesystem import projection_indexing
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
from pageindex.filesystem import semantic_projection
|
||||
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
|
||||
|
||||
index_dir = tmp_path / "projection"
|
||||
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||
|
|
@ -234,14 +234,14 @@ def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embe
|
|||
def fail_make_embedder(*args, **kwargs):
|
||||
raise AssertionError("embedder should not be constructed before dimension validation")
|
||||
|
||||
monkeypatch.setattr(projection_indexing, "make_embedder", fail_make_embedder)
|
||||
monkeypatch.setattr(semantic_projection, "make_embedder", fail_make_embedder)
|
||||
|
||||
with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"):
|
||||
SummaryProjectionIndexer.from_provider(index_dir)
|
||||
|
||||
|
||||
def test_embedding_cache_key_separates_model_dimensions(tmp_path):
|
||||
from pageindex.filesystem.hybrid_projection import (
|
||||
from pageindex.filesystem.semantic_projection import (
|
||||
EmbeddingCache,
|
||||
embedding_cache_model_key,
|
||||
)
|
||||
|
|
@ -285,7 +285,7 @@ def test_embedding_cache_key_separates_model_dimensions(tmp_path):
|
|||
|
||||
|
||||
def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
|
||||
|
||||
index_dir = tmp_path / "projection"
|
||||
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
|
||||
|
|
@ -328,7 +328,7 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path
|
|||
|
||||
|
||||
def test_hash_embedding_provider_is_not_available():
|
||||
from pageindex.filesystem.hybrid_projection import make_embedder
|
||||
from pageindex.filesystem.semantic_projection import make_embedder
|
||||
|
||||
with pytest.raises(ValueError, match="unknown embedding provider: hash"):
|
||||
make_embedder("hash", "unused", dimensions=256, timeout=1)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue