refactor(filesystem): consolidate semantic projection modules

This commit is contained in:
BukeLy 2026-05-31 23:57:12 +08:00
parent 34fa8f7b42
commit e368562e03
10 changed files with 233 additions and 385 deletions

View file

@ -338,7 +338,7 @@ def configure_summary_projection_backend(
) -> None:
if not (filesystem.summary_projection_index_dir / "summary_only_vector.sqlite").exists():
return
filesystem.configure_hybrid_projection_retrieval(
filesystem.configure_semantic_projection_retrieval(
filesystem.summary_projection_index_dir,
embedding_provider=embedding_provider,
embedding_model=embedding_model,

View file

@ -13,8 +13,8 @@ from .metadata_generation import (
from .types import OpenResult, SearchResult
if TYPE_CHECKING:
from .hybrid_projection import HybridProjectionSearchBackend
from .projection_indexing import SummaryProjectionIndexer
from .semantic_projection import SemanticProjectionSearchBackend
from .semantic_projection import SummaryProjectionIndexer
from .semantic_index import (
RebuildableSemanticIndex,
SemanticIndexRecord,
@ -23,17 +23,17 @@ if TYPE_CHECKING:
)
_LAZY_EXPORTS = {
"HybridProjectionSearchBackend": (".hybrid_projection", "HybridProjectionSearchBackend"),
"SemanticProjectionSearchBackend": (".semantic_projection", "SemanticProjectionSearchBackend"),
"RebuildableSemanticIndex": (".semantic_index", "RebuildableSemanticIndex"),
"SemanticIndexRecord": (".semantic_index", "SemanticIndexRecord"),
"SemanticSearchResult": (".semantic_index", "SemanticSearchResult"),
"SQLiteVecSemanticIndex": (".semantic_index", "SQLiteVecSemanticIndex"),
"SummaryProjectionIndexer": (".projection_indexing", "SummaryProjectionIndexer"),
"SummaryProjectionIndexer": (".semantic_projection", "SummaryProjectionIndexer"),
}
__all__ = [
"OpenResult",
"HybridProjectionSearchBackend",
"SemanticProjectionSearchBackend",
"MetadataGenerationBackend",
"MetadataGenerationError",
"MetadataGenerationInput",

View file

@ -23,17 +23,11 @@ from .store import (
metadata_text,
normalize_path,
)
from .structural_read import (
flatten_pageindex_structure_nodes,
first_node_location,
find_pageindex_node,
strip_pageindex_text_fields,
)
from .types import OpenResult, SearchResult
if TYPE_CHECKING:
from ..client import PageIndexClient
from .projection_indexing import SummaryProjectionIndexer
from .semantic_projection import SummaryProjectionIndexer
DEFAULT_METADATA_GENERATION_FIELDS = {
"summary": True,
@ -94,6 +88,18 @@ ADD_FILE_CONTENT_TYPES = {
}
def strip_pageindex_text_fields(value: Any) -> Any:
if isinstance(value, list):
return [strip_pageindex_text_fields(item) for item in value]
if isinstance(value, dict):
return {
key: strip_pageindex_text_fields(item)
for key, item in value.items()
if key != "text"
}
return value
class PageIndexFileSystem:
def __init__(
self,
@ -325,9 +331,9 @@ class PageIndexFileSystem:
model=self.metadata_model,
base_url=self.metadata_base_url,
max_text_chars=self.metadata_max_text_chars,
)
)
if self.summary_projection_index and self.summary_projection_indexer is None:
from .projection_indexing import SummaryProjectionIndexer
from .semantic_projection import SummaryProjectionIndexer
self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
self.summary_projection_index_dir,
@ -337,7 +343,7 @@ class PageIndexFileSystem:
embedding_timeout=self.summary_projection_embedding_timeout,
)
if self.summary_projection_index and self.semantic_retrieval_backend is None:
self.configure_hybrid_projection_retrieval(
self.configure_semantic_projection_retrieval(
self.summary_projection_index_dir,
embedding_provider=self.summary_projection_embedding_provider,
embedding_model=self.summary_projection_embedding_model,
@ -352,9 +358,9 @@ class PageIndexFileSystem:
model=self.metadata_model,
base_url=self.metadata_base_url,
max_text_chars=self.metadata_max_text_chars,
)
)
if self.summary_projection_index and self.summary_projection_indexer is None:
from .projection_indexing import SummaryProjectionIndexer
from .semantic_projection import SummaryProjectionIndexer
self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
self.summary_projection_index_dir,
@ -368,12 +374,12 @@ class PageIndexFileSystem:
indexer = self.summary_projection_indexer
if indexer is None:
raise RuntimeError("pifs add requires a summary projection indexer")
from .hybrid_projection import HybridProjectionSearchBackend
from .semantic_projection import SemanticProjectionSearchBackend
index_dir = Path(getattr(indexer, "index_dir", self.summary_projection_index_dir))
embedder = getattr(indexer, "embedder", None)
if embedder is None:
self.configure_hybrid_projection_retrieval(
self.configure_semantic_projection_retrieval(
index_dir,
embedding_provider=str(
getattr(
@ -396,7 +402,7 @@ class PageIndexFileSystem:
)
else:
embedding_cache = getattr(indexer, "embedding_cache", None)
self.semantic_retrieval_backend = HybridProjectionSearchBackend(
self.semantic_retrieval_backend = SemanticProjectionSearchBackend(
index_dir,
embedder=embedder,
embedding_provider=str(
@ -458,7 +464,7 @@ class PageIndexFileSystem:
f"{self.summary_projection_embedding_dimensions}. Rebuild the "
"projection index or use a matching embedding configuration."
)
self.configure_hybrid_projection_retrieval(
self.configure_semantic_projection_retrieval(
self.summary_projection_index_dir,
embedding_provider=self.summary_projection_embedding_provider,
embedding_model=self.summary_projection_embedding_model,
@ -731,7 +737,7 @@ class PageIndexFileSystem:
)
return results
def configure_hybrid_projection_retrieval(
def configure_semantic_projection_retrieval(
self,
index_dir: Union[str, Path],
*,
@ -741,9 +747,9 @@ class PageIndexFileSystem:
embedding_timeout: float = 60,
fetch_multiplier: int = 100,
) -> Any:
from .hybrid_projection import HybridProjectionSearchBackend
from .semantic_projection import SemanticProjectionSearchBackend
self.semantic_retrieval_backend = HybridProjectionSearchBackend.from_provider(
self.semantic_retrieval_backend = SemanticProjectionSearchBackend.from_provider(
index_dir,
embedding_provider=embedding_provider,
embedding_model=embedding_model,
@ -795,7 +801,7 @@ class PageIndexFileSystem:
if self._file_format(entry) in {"pdf", "markdown", "pageindex"}:
raise ValueError(
"open() text artifact reads are not supported for PDF/Markdown PageIndex files; "
"use pageindex_structure(), pageindex_pages(), or pageindex_node()."
"use pageindex_structure() or pageindex_pages()."
)
if str(location).strip().lower() in {"all", "full", "*"}:
return self._open_all(file_ref)
@ -814,9 +820,6 @@ class PageIndexFileSystem:
def pageindex_structure(
self,
target: str,
*,
offset: int = 0,
limit: int = 25,
) -> dict[str, Any]:
file_ref = self._resolve_target(target)
entry = self.store.get_file(file_ref)
@ -838,12 +841,6 @@ class PageIndexFileSystem:
entry,
message=str(structure["error"]),
)
node_rows = flatten_pageindex_structure_nodes(structure)
offset = max(0, offset)
limit = max(0, limit)
window = node_rows[offset : offset + limit] if limit else []
next_offset = offset + len(window)
has_more = next_offset < len(node_rows)
return {
"mode": "structure",
"file_ref": file_ref,
@ -852,67 +849,7 @@ class PageIndexFileSystem:
"status": entry.pageindex_tree_status,
"available": True,
"pageindex_doc_id": doc_id,
"structure": window,
"structure_pagination": {
"offset": offset,
"limit": limit,
"returned_nodes": len(window),
"total_nodes": len(node_rows),
"has_more": has_more,
"next_offset": next_offset if has_more else None,
},
}
def pageindex_node(self, target: str, node_id: str) -> dict[str, Any]:
file_ref = self._resolve_target(target)
entry = self.store.get_file(file_ref)
self._require_pageindex_document_file(entry, "cat --node")
client, doc_id = self._pageindex_client_doc_for_entry(entry)
if doc_id is None:
return self._structural_unavailable(
"node",
entry,
node_id=node_id,
message=(
"PageIndex structure is not cached for this file in the "
"PageIndexClient workspace."
),
)
client._ensure_doc_loaded(doc_id)
doc = client.documents.get(doc_id, {})
node = find_pageindex_node(doc.get("structure", []), node_id)
if node is None:
return self._structural_unavailable(
"node",
entry,
node_id=node_id,
message="PageIndex node was not found in the cached structure.",
)
text = str(node.get("text") or "")
if not text:
location = first_node_location(node)
if location:
content = self._client_json(client.get_page_content(doc_id, location))
if isinstance(content, list):
text = "\n\n".join(str(page.get("content") or "") for page in content)
if not text:
return self._structural_unavailable(
"node",
entry,
node_id=node_id,
message="Cached PageIndex node has no text content.",
)
return {
"mode": "node",
"file_ref": file_ref,
"external_id": entry.external_id,
"source_path": entry.source_path,
"status": entry.pageindex_tree_status,
"available": True,
"pageindex_doc_id": doc_id,
"node_id": node_id,
"node": strip_pageindex_text_fields(node),
"text": text,
"structure": strip_pageindex_text_fields(structure),
}
def pageindex_pages(self, target: str, pages: str) -> dict[str, Any]:
@ -970,8 +907,7 @@ class PageIndexFileSystem:
f"{command} is only supported for txt/text files; "
f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
"Use cat <path|file_ref|document_id> --structure, "
"cat <path|file_ref|document_id> --page, or "
"cat <path|file_ref|document_id> --node for PDF/Markdown PageIndex files."
"or cat <path|file_ref|document_id> --page for PDF/Markdown PageIndex files."
)
def _require_pageindex_document_file(self, entry: Any, command: str) -> None:
@ -1748,7 +1684,6 @@ class PageIndexFileSystem:
entry: Any,
*,
message: str,
node_id: str | None = None,
pages: str | None = None,
) -> dict[str, Any]:
pageindex_tree_error = cls._pageindex_tree_failure_message(entry.metadata_status)
@ -1765,8 +1700,6 @@ class PageIndexFileSystem:
}
if pageindex_tree_error:
result["pageindex_tree_error"] = pageindex_tree_error
if node_id is not None:
result["node_id"] = node_id
if pages is not None:
result["pages"] = pages
return result

View file

@ -1,179 +0,0 @@
from __future__ import annotations
from pathlib import Path
from typing import Any
from .core import DEFAULT_EMBEDDING_DIMENSIONS
from .hybrid_projection import (
EmbeddingCache,
INDEX_BY_CHANNEL,
embedding_cache_model_key,
make_embedder,
)
from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexRecord
class SummaryProjectionIndexer:
"""Synchronous register-time summary projection indexer."""
def __init__(
self,
index_dir: str | Path,
*,
embedder: Any,
embedding_provider: str,
embedding_model: str,
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_cache_path: str | Path | None = None,
) -> None:
self.index_dir = Path(index_dir).expanduser()
self.index_dir.mkdir(parents=True, exist_ok=True)
self.embedder = embedder
self.embedding_provider = embedding_provider
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)
self.embedding_cache = EmbeddingCache(
Path(embedding_cache_path).expanduser()
if embedding_cache_path is not None
else self.index_dir / "embedding_cache.sqlite"
)
self.index = SQLiteVecSemanticIndex(
self.index_dir / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
)
self._ensure_index()
@classmethod
def from_provider(
cls,
index_dir: str | Path,
*,
embedding_provider: str = "openai",
embedding_model: str = "text-embedding-3-small",
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_timeout: float = 60,
**kwargs: Any,
) -> "SummaryProjectionIndexer":
cls._validate_existing_index_dimension(index_dir, embedding_dimensions)
return cls(
index_dir,
embedder=make_embedder(
embedding_provider,
embedding_model,
dimensions=embedding_dimensions,
timeout=embedding_timeout,
),
embedding_provider=embedding_provider,
embedding_model=embedding_model,
embedding_dimensions=embedding_dimensions,
**kwargs,
)
def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]:
summary = str((record.get("metadata") or {}).get("summary") or "").strip()
if not summary:
return {"status": "skipped", "reason": "missing_summary"}
vector = self.embedding_cache.embed_texts(
[summary],
provider=self.embedding_provider,
model=self.cache_model,
embedder=self.embedder,
batch_size=1,
)[0]
metadata = dict(record.get("metadata") or {})
count = self.index.upsert_many(
[
SemanticIndexRecord(
file_ref=str(record["file_ref"]),
vector=vector,
text=summary,
external_id=record.get("external_id"),
source_type=str(record.get("source_type") or ""),
source_path=str(record.get("source_path") or ""),
title=str(record.get("title") or ""),
metadata=metadata,
)
]
)
return {
"status": "ready",
"indexed_rows": count,
"index_path": str(self.index.db_path),
"embedding_provider": self.embedding_provider,
"embedding_model": self.embedding_model,
"embedding_dimensions": self.embedding_dimensions,
}
def delete_summary(self, file_ref: str) -> int:
return self.index.delete_file_refs([file_ref])
def _ensure_index(self) -> None:
if not self.index.db_path.exists():
self.index.reset(
dimension=self.embedding_dimensions,
metadata=self._index_metadata(),
)
return
try:
existing_dimension = self.index.dimension()
except Exception as exc:
raise RuntimeError(
"could not validate existing summary projection index config; "
f"refusing to reset {self.index.db_path}. Move the existing index "
"aside or rebuild it intentionally before changing embedding config."
) from exc
if existing_dimension != self.embedding_dimensions:
raise self._dimension_mismatch_error(
self.index.db_path,
existing_dimension,
self.embedding_dimensions,
)
def _index_metadata(self) -> dict[str, Any]:
return {
"channel": "summary",
"embedding_provider": self.embedding_provider,
"embedding_model": self.embedding_model,
"embedding_dimensions": self.embedding_dimensions,
}
@classmethod
def _validate_existing_index_dimension(
cls,
index_dir: str | Path,
embedding_dimensions: int,
) -> None:
index_path = (
Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
)
if not index_path.exists():
return
index = SQLiteVecSemanticIndex(index_path)
try:
existing_dimension = index.dimension()
except Exception as exc:
raise RuntimeError(
"could not validate existing summary projection index config; "
f"refusing to reset {index_path}. Move the existing index "
"aside or rebuild it intentionally before changing embedding config."
) from exc
if existing_dimension != embedding_dimensions:
raise cls._dimension_mismatch_error(
index_path,
existing_dimension,
embedding_dimensions,
)
@staticmethod
def _dimension_mismatch_error(
index_path: Path,
existing_dimension: int,
embedding_dimensions: int,
) -> RuntimeError:
return RuntimeError(
"summary projection index dimension mismatch: "
f"{index_path} was built with dimension {existing_dimension}, "
f"but configured embedding_dimensions is {embedding_dimensions}. "
"Use the matching embedding config, or rebuild the projection index "
"at a new path after preserving the existing data."
)

View file

@ -11,7 +11,12 @@ from pathlib import Path
from typing import Any
from .core import DEFAULT_EMBEDDING_DIMENSIONS
from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult
from .semantic_index import (
SQLiteVecSemanticIndex,
SemanticIndexError,
SemanticIndexRecord,
SemanticSearchResult,
)
INDEX_BY_CHANNEL = {
@ -29,7 +34,7 @@ class QueryProjection:
@dataclass(frozen=True)
class HybridProjectionCandidate:
class SemanticProjectionCandidate:
document_id: str
score: float
sources: list[dict[str, Any]]
@ -40,7 +45,7 @@ class HybridProjectionCandidate:
snippet: str
class HybridProjectionSearchBackend:
class SemanticProjectionSearchBackend:
"""Semantic channel retrieval over rebuildable projection indexes.
The SQLite catalog remains the source of truth. This backend only reads
@ -86,7 +91,7 @@ class HybridProjectionSearchBackend:
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_timeout: float = 60,
**kwargs: Any,
) -> "HybridProjectionSearchBackend":
) -> "SemanticProjectionSearchBackend":
return cls(
index_dir,
embedder=make_embedder(
@ -108,7 +113,7 @@ class HybridProjectionSearchBackend:
*,
limit: int = 10,
filters: dict[str, Any] | None = None,
) -> list[HybridProjectionCandidate]:
) -> list[SemanticProjectionCandidate]:
if channel not in SEMANTIC_TOOL_CHANNELS:
raise ValueError(f"unsupported semantic channel: {channel}")
if channel not in self.available_channels():
@ -180,6 +185,172 @@ class HybridProjectionSearchBackend:
return {**info, "available": int(info.get("document_count") or 0) > 0}
class SummaryProjectionIndexer:
"""Synchronous register-time summary projection indexer."""
def __init__(
self,
index_dir: str | Path,
*,
embedder: Any,
embedding_provider: str,
embedding_model: str,
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_cache_path: str | Path | None = None,
) -> None:
self.index_dir = Path(index_dir).expanduser()
self.index_dir.mkdir(parents=True, exist_ok=True)
self.embedder = embedder
self.embedding_provider = embedding_provider
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)
self.embedding_cache = EmbeddingCache(
Path(embedding_cache_path).expanduser()
if embedding_cache_path is not None
else self.index_dir / "embedding_cache.sqlite"
)
self.index = SQLiteVecSemanticIndex(
self.index_dir / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
)
self._ensure_index()
@classmethod
def from_provider(
cls,
index_dir: str | Path,
*,
embedding_provider: str = "openai",
embedding_model: str = "text-embedding-3-small",
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_timeout: float = 60,
**kwargs: Any,
) -> "SummaryProjectionIndexer":
cls._validate_existing_index_dimension(index_dir, embedding_dimensions)
return cls(
index_dir,
embedder=make_embedder(
embedding_provider,
embedding_model,
dimensions=embedding_dimensions,
timeout=embedding_timeout,
),
embedding_provider=embedding_provider,
embedding_model=embedding_model,
embedding_dimensions=embedding_dimensions,
**kwargs,
)
def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]:
summary = str((record.get("metadata") or {}).get("summary") or "").strip()
if not summary:
return {"status": "skipped", "reason": "missing_summary"}
vector = self.embedding_cache.embed_texts(
[summary],
provider=self.embedding_provider,
model=self.cache_model,
embedder=self.embedder,
batch_size=1,
)[0]
metadata = dict(record.get("metadata") or {})
count = self.index.upsert_many(
[
SemanticIndexRecord(
file_ref=str(record["file_ref"]),
vector=vector,
text=summary,
external_id=record.get("external_id"),
source_type=str(record.get("source_type") or ""),
source_path=str(record.get("source_path") or ""),
title=str(record.get("title") or ""),
metadata=metadata,
)
]
)
return {
"status": "ready",
"indexed_rows": count,
"index_path": str(self.index.db_path),
"embedding_provider": self.embedding_provider,
"embedding_model": self.embedding_model,
"embedding_dimensions": self.embedding_dimensions,
}
def delete_summary(self, file_ref: str) -> int:
return self.index.delete_file_refs([file_ref])
def _ensure_index(self) -> None:
if not self.index.db_path.exists():
self.index.reset(
dimension=self.embedding_dimensions,
metadata=self._index_metadata(),
)
return
try:
existing_dimension = self.index.dimension()
except Exception as exc:
raise RuntimeError(
"could not validate existing summary projection index config; "
f"refusing to reset {self.index.db_path}. Move the existing index "
"aside or rebuild it intentionally before changing embedding config."
) from exc
if existing_dimension != self.embedding_dimensions:
raise self._dimension_mismatch_error(
self.index.db_path,
existing_dimension,
self.embedding_dimensions,
)
def _index_metadata(self) -> dict[str, Any]:
return {
"channel": "summary",
"embedding_provider": self.embedding_provider,
"embedding_model": self.embedding_model,
"embedding_dimensions": self.embedding_dimensions,
}
@classmethod
def _validate_existing_index_dimension(
cls,
index_dir: str | Path,
embedding_dimensions: int,
) -> None:
index_path = (
Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
)
if not index_path.exists():
return
index = SQLiteVecSemanticIndex(index_path)
try:
existing_dimension = index.dimension()
except Exception as exc:
raise RuntimeError(
"could not validate existing summary projection index config; "
f"refusing to reset {index_path}. Move the existing index "
"aside or rebuild it intentionally before changing embedding config."
) from exc
if existing_dimension != embedding_dimensions:
raise cls._dimension_mismatch_error(
index_path,
existing_dimension,
embedding_dimensions,
)
@staticmethod
def _dimension_mismatch_error(
index_path: Path,
existing_dimension: int,
embedding_dimensions: int,
) -> RuntimeError:
return RuntimeError(
"summary projection index dimension mismatch: "
f"{index_path} was built with dimension {existing_dimension}, "
f"but configured embedding_dimensions is {embedding_dimensions}. "
"Use the matching embedding config, or rebuild the projection index "
"at a new path after preserving the existing data."
)
class EmbeddingCache:
def __init__(self, db_path: Path):
self.db_path = db_path
@ -308,8 +479,8 @@ def query_text_for_channel(channel: str, query: str, projection: QueryProjection
def rank_single_semantic_channel(
channel: str,
results: list[SemanticSearchResult],
) -> list[HybridProjectionCandidate]:
rows: list[HybridProjectionCandidate] = []
) -> list[SemanticProjectionCandidate]:
rows: list[SemanticProjectionCandidate] = []
seen: set[str] = set()
for rank, result in enumerate(results, 1):
doc_id = str(result.external_id or result.file_ref)
@ -317,7 +488,7 @@ def rank_single_semantic_channel(
continue
seen.add(doc_id)
rows.append(
HybridProjectionCandidate(
SemanticProjectionCandidate(
document_id=doc_id,
score=1 / (60 + rank),
sources=[{"channel": channel, "rank": rank, "distance": result.distance}],

View file

@ -1,77 +0,0 @@
from __future__ import annotations
from copy import deepcopy
from typing import Any
def strip_pageindex_text_fields(value: Any) -> Any:
if isinstance(value, list):
return [strip_pageindex_text_fields(item) for item in value]
if isinstance(value, dict):
return {
key: strip_pageindex_text_fields(item)
for key, item in value.items()
if key != "text"
}
return value
def flatten_pageindex_structure_nodes(structure: Any) -> list[dict[str, Any]]:
rows: list[dict[str, Any]] = []
def visit(value: Any, *, depth: int, parent_node_id: str | None) -> None:
if isinstance(value, list):
for item in value:
visit(item, depth=depth, parent_node_id=parent_node_id)
return
if not isinstance(value, dict):
return
node_id = value.get("node_id")
child_values: list[Any] = []
for child_key in ("nodes", "children"):
children = value.get(child_key)
if isinstance(children, list):
child_values.extend(children)
row = {
key: strip_pageindex_text_fields(item)
for key, item in value.items()
if key not in {"text", "nodes", "children"}
}
row["depth"] = depth
row["children_count"] = len(child_values)
if parent_node_id:
row["parent_node_id"] = parent_node_id
rows.append(row)
next_parent = str(node_id) if node_id is not None else parent_node_id
for child in child_values:
visit(child, depth=depth + 1, parent_node_id=next_parent)
visit(structure, depth=0, parent_node_id=None)
return rows
def find_pageindex_node(structure: Any, node_id: str) -> dict[str, Any] | None:
if isinstance(structure, dict):
if str(structure.get("node_id", "")) == str(node_id):
return deepcopy(structure)
for child_key in ("nodes", "children"):
found = find_pageindex_node(structure.get(child_key), node_id)
if found is not None:
return found
if isinstance(structure, list):
for item in structure:
found = find_pageindex_node(item, node_id)
if found is not None:
return found
return None
def first_node_location(node: dict[str, Any]) -> str | None:
for key in ("line_num", "physical_index", "start_index"):
value = node.get(key)
if value is not None and value != "":
return str(value)
return None

View file

@ -7,7 +7,7 @@ import pytest
def test_filesystem_lazy_exports_remain_public():
import pageindex.filesystem as filesystem
from pageindex.filesystem import (
HybridProjectionSearchBackend,
SemanticProjectionSearchBackend,
RebuildableSemanticIndex,
SemanticIndexRecord,
SemanticSearchResult,
@ -16,7 +16,7 @@ def test_filesystem_lazy_exports_remain_public():
)
for name in (
"HybridProjectionSearchBackend",
"SemanticProjectionSearchBackend",
"RebuildableSemanticIndex",
"SemanticIndexRecord",
"SemanticSearchResult",
@ -26,7 +26,7 @@ def test_filesystem_lazy_exports_remain_public():
assert name in filesystem.__all__
assert name in dir(filesystem)
assert HybridProjectionSearchBackend.__name__ == "HybridProjectionSearchBackend"
assert SemanticProjectionSearchBackend.__name__ == "SemanticProjectionSearchBackend"
assert RebuildableSemanticIndex.__name__ == "RebuildableSemanticIndex"
assert SemanticIndexRecord.__name__ == "SemanticIndexRecord"
assert SemanticSearchResult.__name__ == "SemanticSearchResult"
@ -819,7 +819,7 @@ def test_existing_summary_projection_index_uses_current_config_when_dimensions_m
monkeypatch.setattr(
filesystem,
"configure_hybrid_projection_retrieval",
"configure_semantic_projection_retrieval",
fake_configure,
)
@ -876,7 +876,7 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
monkeypatch.setattr(
filesystem,
"configure_hybrid_projection_retrieval",
"configure_semantic_projection_retrieval",
fail_configure,
)
@ -892,9 +892,9 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
def test_browse_semantic_files_uses_summary_projection_when_only_summary_available(tmp_path):
from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
from pageindex.filesystem.semantic_projection import SemanticProjectionSearchBackend
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
class FixedEmbedder:
def embed(self, texts):
@ -916,7 +916,7 @@ def test_browse_semantic_files_uses_summary_projection_when_only_summary_availab
embedding_model="fake",
embedding_dimensions=3,
)
backend = HybridProjectionSearchBackend(
backend = SemanticProjectionSearchBackend(
index_dir,
embedder=FixedEmbedder(),
embedding_provider="test",

View file

@ -25,7 +25,7 @@ class StaticEmbedder:
def make_summary_indexer(workspace: Path):
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
return SummaryProjectionIndexer(
workspace / "artifacts" / "projection_indexes",

View file

@ -37,7 +37,7 @@ def test_cli_workspace_without_projection_index_does_not_require_sqlite_vec(
workspace = tmp_path / "workspace"
real_import = builtins.__import__
monkeypatch.delitem(sys.modules, "pageindex.filesystem.hybrid_projection", raising=False)
monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_projection", raising=False)
monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_index", raising=False)
monkeypatch.delitem(sys.modules, "sqlite_vec", raising=False)

View file

@ -102,7 +102,7 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
indexer = SummaryProjectionIndexer(
tmp_path / "projection",
@ -134,7 +134,7 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
indexer = SummaryProjectionIndexer(
tmp_path / "projection",
@ -164,7 +164,7 @@ def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
indexer = SummaryProjectionIndexer(
tmp_path / "projection",
@ -188,7 +188,7 @@ def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_path):
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
index_dir = tmp_path / "projection"
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
@ -216,8 +216,8 @@ def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_pa
def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embedder(
tmp_path, monkeypatch
):
from pageindex.filesystem import projection_indexing
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
from pageindex.filesystem import semantic_projection
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
index_dir = tmp_path / "projection"
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
@ -234,14 +234,14 @@ def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embe
def fail_make_embedder(*args, **kwargs):
raise AssertionError("embedder should not be constructed before dimension validation")
monkeypatch.setattr(projection_indexing, "make_embedder", fail_make_embedder)
monkeypatch.setattr(semantic_projection, "make_embedder", fail_make_embedder)
with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"):
SummaryProjectionIndexer.from_provider(index_dir)
def test_embedding_cache_key_separates_model_dimensions(tmp_path):
from pageindex.filesystem.hybrid_projection import (
from pageindex.filesystem.semantic_projection import (
EmbeddingCache,
embedding_cache_model_key,
)
@ -285,7 +285,7 @@ def test_embedding_cache_key_separates_model_dimensions(tmp_path):
def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
index_dir = tmp_path / "projection"
index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
@ -328,7 +328,7 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path
def test_hash_embedding_provider_is_not_available():
from pageindex.filesystem.hybrid_projection import make_embedder
from pageindex.filesystem.semantic_projection import make_embedder
with pytest.raises(ValueError, match="unknown embedding provider: hash"):
make_embedder("hash", "unused", dimensions=256, timeout=1)