refactor(filesystem): consolidate semantic projection modules

2026-06-12 19:55:17 +02:00 · 2026-05-31 23:57:12 +08:00 · 2026-05-31 23:57:12 +08:00 · e368562e03
commit e368562e03
parent 34fa8f7b42
10 changed files with 233 additions and 385 deletions
--- a/examples/pifs_demo.py
+++ b/examples/pifs_demo.py
@ -338,7 +338,7 @@ def configure_summary_projection_backend(
 ) -> None:
    if not (filesystem.summary_projection_index_dir / "summary_only_vector.sqlite").exists():
        return
-    filesystem.configure_hybrid_projection_retrieval(
+    filesystem.configure_semantic_projection_retrieval(
        filesystem.summary_projection_index_dir,
        embedding_provider=embedding_provider,
        embedding_model=embedding_model,
--- a/pageindex/filesystem/init.py
+++ b/pageindex/filesystem/init.py
@ -13,8 +13,8 @@ from .metadata_generation import (
 from .types import OpenResult, SearchResult

 if TYPE_CHECKING:
-    from .hybrid_projection import HybridProjectionSearchBackend
-    from .projection_indexing import SummaryProjectionIndexer
+    from .semantic_projection import SemanticProjectionSearchBackend
+    from .semantic_projection import SummaryProjectionIndexer
    from .semantic_index import (
        RebuildableSemanticIndex,
        SemanticIndexRecord,
@ -23,17 +23,17 @@ if TYPE_CHECKING:
    )

 _LAZY_EXPORTS = {
-    "HybridProjectionSearchBackend": (".hybrid_projection", "HybridProjectionSearchBackend"),
+    "SemanticProjectionSearchBackend": (".semantic_projection", "SemanticProjectionSearchBackend"),
    "RebuildableSemanticIndex": (".semantic_index", "RebuildableSemanticIndex"),
    "SemanticIndexRecord": (".semantic_index", "SemanticIndexRecord"),
    "SemanticSearchResult": (".semantic_index", "SemanticSearchResult"),
    "SQLiteVecSemanticIndex": (".semantic_index", "SQLiteVecSemanticIndex"),
-    "SummaryProjectionIndexer": (".projection_indexing", "SummaryProjectionIndexer"),
+    "SummaryProjectionIndexer": (".semantic_projection", "SummaryProjectionIndexer"),
 }

 __all__ = [
    "OpenResult",
-    "HybridProjectionSearchBackend",
+    "SemanticProjectionSearchBackend",
    "MetadataGenerationBackend",
    "MetadataGenerationError",
    "MetadataGenerationInput",
--- a/pageindex/filesystem/core.py
+++ b/pageindex/filesystem/core.py
@ -23,17 +23,11 @@ from .store import (
    metadata_text,
    normalize_path,
 )
-from .structural_read import (
-    flatten_pageindex_structure_nodes,
-    first_node_location,
-    find_pageindex_node,
-    strip_pageindex_text_fields,
-)
 from .types import OpenResult, SearchResult

 if TYPE_CHECKING:
    from ..client import PageIndexClient
-    from .projection_indexing import SummaryProjectionIndexer
+    from .semantic_projection import SummaryProjectionIndexer

 DEFAULT_METADATA_GENERATION_FIELDS = {
    "summary": True,
@ -94,6 +88,18 @@ ADD_FILE_CONTENT_TYPES = {
 }


+def strip_pageindex_text_fields(value: Any) -> Any:
+    if isinstance(value, list):
+        return [strip_pageindex_text_fields(item) for item in value]
+    if isinstance(value, dict):
+        return {
+            key: strip_pageindex_text_fields(item)
+            for key, item in value.items()
+            if key != "text"
+        }
+    return value
+
+
 class PageIndexFileSystem:
    def __init__(
        self,
@ -325,9 +331,9 @@ class PageIndexFileSystem:
                model=self.metadata_model,
                base_url=self.metadata_base_url,
                max_text_chars=self.metadata_max_text_chars,
-            )
+        )
        if self.summary_projection_index and self.summary_projection_indexer is None:
-            from .projection_indexing import SummaryProjectionIndexer
+            from .semantic_projection import SummaryProjectionIndexer

            self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
                self.summary_projection_index_dir,
@ -337,7 +343,7 @@ class PageIndexFileSystem:
                embedding_timeout=self.summary_projection_embedding_timeout,
            )
        if self.summary_projection_index and self.semantic_retrieval_backend is None:
-            self.configure_hybrid_projection_retrieval(
+            self.configure_semantic_projection_retrieval(
                self.summary_projection_index_dir,
                embedding_provider=self.summary_projection_embedding_provider,
                embedding_model=self.summary_projection_embedding_model,
@ -352,9 +358,9 @@ class PageIndexFileSystem:
                model=self.metadata_model,
                base_url=self.metadata_base_url,
                max_text_chars=self.metadata_max_text_chars,
-            )
+        )
        if self.summary_projection_index and self.summary_projection_indexer is None:
-            from .projection_indexing import SummaryProjectionIndexer
+            from .semantic_projection import SummaryProjectionIndexer

            self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
                self.summary_projection_index_dir,
@ -368,12 +374,12 @@ class PageIndexFileSystem:
        indexer = self.summary_projection_indexer
        if indexer is None:
            raise RuntimeError("pifs add requires a summary projection indexer")
-        from .hybrid_projection import HybridProjectionSearchBackend
+        from .semantic_projection import SemanticProjectionSearchBackend

        index_dir = Path(getattr(indexer, "index_dir", self.summary_projection_index_dir))
        embedder = getattr(indexer, "embedder", None)
        if embedder is None:
-            self.configure_hybrid_projection_retrieval(
+            self.configure_semantic_projection_retrieval(
                index_dir,
                embedding_provider=str(
                    getattr(
@ -396,7 +402,7 @@ class PageIndexFileSystem:
            )
        else:
            embedding_cache = getattr(indexer, "embedding_cache", None)
-            self.semantic_retrieval_backend = HybridProjectionSearchBackend(
+            self.semantic_retrieval_backend = SemanticProjectionSearchBackend(
                index_dir,
                embedder=embedder,
                embedding_provider=str(
@ -458,7 +464,7 @@ class PageIndexFileSystem:
                f"{self.summary_projection_embedding_dimensions}. Rebuild the "
                "projection index or use a matching embedding configuration."
            )
-        self.configure_hybrid_projection_retrieval(
+        self.configure_semantic_projection_retrieval(
            self.summary_projection_index_dir,
            embedding_provider=self.summary_projection_embedding_provider,
            embedding_model=self.summary_projection_embedding_model,
@ -731,7 +737,7 @@ class PageIndexFileSystem:
            )
        return results

-    def configure_hybrid_projection_retrieval(
+    def configure_semantic_projection_retrieval(
        self,
        index_dir: Union[str, Path],
        *,
@ -741,9 +747,9 @@ class PageIndexFileSystem:
        embedding_timeout: float = 60,
        fetch_multiplier: int = 100,
    ) -> Any:
-        from .hybrid_projection import HybridProjectionSearchBackend
+        from .semantic_projection import SemanticProjectionSearchBackend

-        self.semantic_retrieval_backend = HybridProjectionSearchBackend.from_provider(
+        self.semantic_retrieval_backend = SemanticProjectionSearchBackend.from_provider(
            index_dir,
            embedding_provider=embedding_provider,
            embedding_model=embedding_model,
@ -795,7 +801,7 @@ class PageIndexFileSystem:
        if self._file_format(entry) in {"pdf", "markdown", "pageindex"}:
            raise ValueError(
                "open() text artifact reads are not supported for PDF/Markdown PageIndex files; "
-                "use pageindex_structure(), pageindex_pages(), or pageindex_node()."
+                "use pageindex_structure() or pageindex_pages()."
            )
        if str(location).strip().lower() in {"all", "full", "*"}:
            return self._open_all(file_ref)
@ -814,9 +820,6 @@ class PageIndexFileSystem:
    def pageindex_structure(
        self,
        target: str,
-        *,
-        offset: int = 0,
-        limit: int = 25,
    ) -> dict[str, Any]:
        file_ref = self._resolve_target(target)
        entry = self.store.get_file(file_ref)
@ -838,12 +841,6 @@ class PageIndexFileSystem:
                entry,
                message=str(structure["error"]),
            )
-        node_rows = flatten_pageindex_structure_nodes(structure)
-        offset = max(0, offset)
-        limit = max(0, limit)
-        window = node_rows[offset : offset + limit] if limit else []
-        next_offset = offset + len(window)
-        has_more = next_offset < len(node_rows)
        return {
            "mode": "structure",
            "file_ref": file_ref,
@ -852,67 +849,7 @@ class PageIndexFileSystem:
            "status": entry.pageindex_tree_status,
            "available": True,
            "pageindex_doc_id": doc_id,
-            "structure": window,
-            "structure_pagination": {
-                "offset": offset,
-                "limit": limit,
-                "returned_nodes": len(window),
-                "total_nodes": len(node_rows),
-                "has_more": has_more,
-                "next_offset": next_offset if has_more else None,
-            },
-        }
-
-    def pageindex_node(self, target: str, node_id: str) -> dict[str, Any]:
-        file_ref = self._resolve_target(target)
-        entry = self.store.get_file(file_ref)
-        self._require_pageindex_document_file(entry, "cat --node")
-        client, doc_id = self._pageindex_client_doc_for_entry(entry)
-        if doc_id is None:
-            return self._structural_unavailable(
-                "node",
-                entry,
-                node_id=node_id,
-                message=(
-                    "PageIndex structure is not cached for this file in the "
-                    "PageIndexClient workspace."
-                ),
-            )
-        client._ensure_doc_loaded(doc_id)
-        doc = client.documents.get(doc_id, {})
-        node = find_pageindex_node(doc.get("structure", []), node_id)
-        if node is None:
-            return self._structural_unavailable(
-                "node",
-                entry,
-                node_id=node_id,
-                message="PageIndex node was not found in the cached structure.",
-            )
-        text = str(node.get("text") or "")
-        if not text:
-            location = first_node_location(node)
-            if location:
-                content = self._client_json(client.get_page_content(doc_id, location))
-                if isinstance(content, list):
-                    text = "\n\n".join(str(page.get("content") or "") for page in content)
-        if not text:
-            return self._structural_unavailable(
-                "node",
-                entry,
-                node_id=node_id,
-                message="Cached PageIndex node has no text content.",
-            )
-        return {
-            "mode": "node",
-            "file_ref": file_ref,
-            "external_id": entry.external_id,
-            "source_path": entry.source_path,
-            "status": entry.pageindex_tree_status,
-            "available": True,
-            "pageindex_doc_id": doc_id,
-            "node_id": node_id,
-            "node": strip_pageindex_text_fields(node),
-            "text": text,
+            "structure": strip_pageindex_text_fields(structure),
        }

    def pageindex_pages(self, target: str, pages: str) -> dict[str, Any]:
@ -970,8 +907,7 @@ class PageIndexFileSystem:
            f"{command} is only supported for txt/text files; "
            f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
            "Use cat <path|file_ref|document_id> --structure, "
-            "cat <path|file_ref|document_id> --page, or "
-            "cat <path|file_ref|document_id> --node for PDF/Markdown PageIndex files."
+            "or cat <path|file_ref|document_id> --page for PDF/Markdown PageIndex files."
        )

    def _require_pageindex_document_file(self, entry: Any, command: str) -> None:
@ -1748,7 +1684,6 @@ class PageIndexFileSystem:
        entry: Any,
        *,
        message: str,
-        node_id: str | None = None,
        pages: str | None = None,
    ) -> dict[str, Any]:
        pageindex_tree_error = cls._pageindex_tree_failure_message(entry.metadata_status)
@ -1765,8 +1700,6 @@ class PageIndexFileSystem:
        }
        if pageindex_tree_error:
            result["pageindex_tree_error"] = pageindex_tree_error
-        if node_id is not None:
-            result["node_id"] = node_id
        if pages is not None:
            result["pages"] = pages
        return result
--- a/pageindex/filesystem/projection_indexing.py
+++ b/pageindex/filesystem/projection_indexing.py
@ -1,179 +0,0 @@
-from __future__ import annotations
-
-from pathlib import Path
-from typing import Any
-
-from .core import DEFAULT_EMBEDDING_DIMENSIONS
-from .hybrid_projection import (
-    EmbeddingCache,
-    INDEX_BY_CHANNEL,
-    embedding_cache_model_key,
-    make_embedder,
-)
-from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexRecord
-
-
-class SummaryProjectionIndexer:
-    """Synchronous register-time summary projection indexer."""
-
-    def __init__(
-        self,
-        index_dir: str | Path,
-        *,
-        embedder: Any,
-        embedding_provider: str,
-        embedding_model: str,
-        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
-        embedding_cache_path: str | Path | None = None,
-    ) -> None:
-        self.index_dir = Path(index_dir).expanduser()
-        self.index_dir.mkdir(parents=True, exist_ok=True)
-        self.embedder = embedder
-        self.embedding_provider = embedding_provider
-        self.embedding_model = embedding_model
-        self.embedding_dimensions = embedding_dimensions
-        self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)
-        self.embedding_cache = EmbeddingCache(
-            Path(embedding_cache_path).expanduser()
-            if embedding_cache_path is not None
-            else self.index_dir / "embedding_cache.sqlite"
-        )
-        self.index = SQLiteVecSemanticIndex(
-            self.index_dir / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
-        )
-        self._ensure_index()
-
-    @classmethod
-    def from_provider(
-        cls,
-        index_dir: str | Path,
-        *,
-        embedding_provider: str = "openai",
-        embedding_model: str = "text-embedding-3-small",
-        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
-        embedding_timeout: float = 60,
-        **kwargs: Any,
-    ) -> "SummaryProjectionIndexer":
-        cls._validate_existing_index_dimension(index_dir, embedding_dimensions)
-        return cls(
-            index_dir,
-            embedder=make_embedder(
-                embedding_provider,
-                embedding_model,
-                dimensions=embedding_dimensions,
-                timeout=embedding_timeout,
-            ),
-            embedding_provider=embedding_provider,
-            embedding_model=embedding_model,
-            embedding_dimensions=embedding_dimensions,
-            **kwargs,
-        )
-
-    def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]:
-        summary = str((record.get("metadata") or {}).get("summary") or "").strip()
-        if not summary:
-            return {"status": "skipped", "reason": "missing_summary"}
-        vector = self.embedding_cache.embed_texts(
-            [summary],
-            provider=self.embedding_provider,
-            model=self.cache_model,
-            embedder=self.embedder,
-            batch_size=1,
-        )[0]
-        metadata = dict(record.get("metadata") or {})
-        count = self.index.upsert_many(
-            [
-                SemanticIndexRecord(
-                    file_ref=str(record["file_ref"]),
-                    vector=vector,
-                    text=summary,
-                    external_id=record.get("external_id"),
-                    source_type=str(record.get("source_type") or ""),
-                    source_path=str(record.get("source_path") or ""),
-                    title=str(record.get("title") or ""),
-                    metadata=metadata,
-                )
-            ]
-        )
-        return {
-            "status": "ready",
-            "indexed_rows": count,
-            "index_path": str(self.index.db_path),
-            "embedding_provider": self.embedding_provider,
-            "embedding_model": self.embedding_model,
-            "embedding_dimensions": self.embedding_dimensions,
-        }
-
-    def delete_summary(self, file_ref: str) -> int:
-        return self.index.delete_file_refs([file_ref])
-
-    def _ensure_index(self) -> None:
-        if not self.index.db_path.exists():
-            self.index.reset(
-                dimension=self.embedding_dimensions,
-                metadata=self._index_metadata(),
-            )
-            return
-        try:
-            existing_dimension = self.index.dimension()
-        except Exception as exc:
-            raise RuntimeError(
-                "could not validate existing summary projection index config; "
-                f"refusing to reset {self.index.db_path}. Move the existing index "
-                "aside or rebuild it intentionally before changing embedding config."
-            ) from exc
-        if existing_dimension != self.embedding_dimensions:
-            raise self._dimension_mismatch_error(
-                self.index.db_path,
-                existing_dimension,
-                self.embedding_dimensions,
-            )
-
-    def _index_metadata(self) -> dict[str, Any]:
-        return {
-            "channel": "summary",
-            "embedding_provider": self.embedding_provider,
-            "embedding_model": self.embedding_model,
-            "embedding_dimensions": self.embedding_dimensions,
-        }
-
-    @classmethod
-    def _validate_existing_index_dimension(
-        cls,
-        index_dir: str | Path,
-        embedding_dimensions: int,
-    ) -> None:
-        index_path = (
-            Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
-        )
-        if not index_path.exists():
-            return
-        index = SQLiteVecSemanticIndex(index_path)
-        try:
-            existing_dimension = index.dimension()
-        except Exception as exc:
-            raise RuntimeError(
-                "could not validate existing summary projection index config; "
-                f"refusing to reset {index_path}. Move the existing index "
-                "aside or rebuild it intentionally before changing embedding config."
-            ) from exc
-        if existing_dimension != embedding_dimensions:
-            raise cls._dimension_mismatch_error(
-                index_path,
-                existing_dimension,
-                embedding_dimensions,
-            )
-
-    @staticmethod
-    def _dimension_mismatch_error(
-        index_path: Path,
-        existing_dimension: int,
-        embedding_dimensions: int,
-    ) -> RuntimeError:
-        return RuntimeError(
-            "summary projection index dimension mismatch: "
-            f"{index_path} was built with dimension {existing_dimension}, "
-            f"but configured embedding_dimensions is {embedding_dimensions}. "
-            "Use the matching embedding config, or rebuild the projection index "
-            "at a new path after preserving the existing data."
-        )
--- a/pageindex/filesystem/semantic_projection.py
+++ b/pageindex/filesystem/semantic_projection.py
@ -11,7 +11,12 @@ from pathlib import Path
 from typing import Any

 from .core import DEFAULT_EMBEDDING_DIMENSIONS
-from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult
+from .semantic_index import (
+    SQLiteVecSemanticIndex,
+    SemanticIndexError,
+    SemanticIndexRecord,
+    SemanticSearchResult,
+)


 INDEX_BY_CHANNEL = {
@ -29,7 +34,7 @@ class QueryProjection:


@dataclass(frozen=True)
-class HybridProjectionCandidate:
+class SemanticProjectionCandidate:
    document_id: str
    score: float
    sources: list[dict[str, Any]]
@ -40,7 +45,7 @@ class HybridProjectionCandidate:
    snippet: str


-class HybridProjectionSearchBackend:
+class SemanticProjectionSearchBackend:
    """Semantic channel retrieval over rebuildable projection indexes.

    The SQLite catalog remains the source of truth. This backend only reads
@ -86,7 +91,7 @@ class HybridProjectionSearchBackend:
        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
        embedding_timeout: float = 60,
        **kwargs: Any,
-    ) -> "HybridProjectionSearchBackend":
+    ) -> "SemanticProjectionSearchBackend":
        return cls(
            index_dir,
            embedder=make_embedder(
@ -108,7 +113,7 @@ class HybridProjectionSearchBackend:
        *,
        limit: int = 10,
        filters: dict[str, Any] | None = None,
-    ) -> list[HybridProjectionCandidate]:
+    ) -> list[SemanticProjectionCandidate]:
        if channel not in SEMANTIC_TOOL_CHANNELS:
            raise ValueError(f"unsupported semantic channel: {channel}")
        if channel not in self.available_channels():
@ -180,6 +185,172 @@ class HybridProjectionSearchBackend:
        return {**info, "available": int(info.get("document_count") or 0) > 0}


+class SummaryProjectionIndexer:
+    """Synchronous register-time summary projection indexer."""
+
+    def __init__(
+        self,
+        index_dir: str | Path,
+        *,
+        embedder: Any,
+        embedding_provider: str,
+        embedding_model: str,
+        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
+        embedding_cache_path: str | Path | None = None,
+    ) -> None:
+        self.index_dir = Path(index_dir).expanduser()
+        self.index_dir.mkdir(parents=True, exist_ok=True)
+        self.embedder = embedder
+        self.embedding_provider = embedding_provider
+        self.embedding_model = embedding_model
+        self.embedding_dimensions = embedding_dimensions
+        self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)
+        self.embedding_cache = EmbeddingCache(
+            Path(embedding_cache_path).expanduser()
+            if embedding_cache_path is not None
+            else self.index_dir / "embedding_cache.sqlite"
+        )
+        self.index = SQLiteVecSemanticIndex(
+            self.index_dir / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
+        )
+        self._ensure_index()
+
+    @classmethod
+    def from_provider(
+        cls,
+        index_dir: str | Path,
+        *,
+        embedding_provider: str = "openai",
+        embedding_model: str = "text-embedding-3-small",
+        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
+        embedding_timeout: float = 60,
+        **kwargs: Any,
+    ) -> "SummaryProjectionIndexer":
+        cls._validate_existing_index_dimension(index_dir, embedding_dimensions)
+        return cls(
+            index_dir,
+            embedder=make_embedder(
+                embedding_provider,
+                embedding_model,
+                dimensions=embedding_dimensions,
+                timeout=embedding_timeout,
+            ),
+            embedding_provider=embedding_provider,
+            embedding_model=embedding_model,
+            embedding_dimensions=embedding_dimensions,
+            **kwargs,
+        )
+
+    def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]:
+        summary = str((record.get("metadata") or {}).get("summary") or "").strip()
+        if not summary:
+            return {"status": "skipped", "reason": "missing_summary"}
+        vector = self.embedding_cache.embed_texts(
+            [summary],
+            provider=self.embedding_provider,
+            model=self.cache_model,
+            embedder=self.embedder,
+            batch_size=1,
+        )[0]
+        metadata = dict(record.get("metadata") or {})
+        count = self.index.upsert_many(
+            [
+                SemanticIndexRecord(
+                    file_ref=str(record["file_ref"]),
+                    vector=vector,
+                    text=summary,
+                    external_id=record.get("external_id"),
+                    source_type=str(record.get("source_type") or ""),
+                    source_path=str(record.get("source_path") or ""),
+                    title=str(record.get("title") or ""),
+                    metadata=metadata,
+                )
+            ]
+        )
+        return {
+            "status": "ready",
+            "indexed_rows": count,
+            "index_path": str(self.index.db_path),
+            "embedding_provider": self.embedding_provider,
+            "embedding_model": self.embedding_model,
+            "embedding_dimensions": self.embedding_dimensions,
+        }
+
+    def delete_summary(self, file_ref: str) -> int:
+        return self.index.delete_file_refs([file_ref])
+
+    def _ensure_index(self) -> None:
+        if not self.index.db_path.exists():
+            self.index.reset(
+                dimension=self.embedding_dimensions,
+                metadata=self._index_metadata(),
+            )
+            return
+        try:
+            existing_dimension = self.index.dimension()
+        except Exception as exc:
+            raise RuntimeError(
+                "could not validate existing summary projection index config; "
+                f"refusing to reset {self.index.db_path}. Move the existing index "
+                "aside or rebuild it intentionally before changing embedding config."
+            ) from exc
+        if existing_dimension != self.embedding_dimensions:
+            raise self._dimension_mismatch_error(
+                self.index.db_path,
+                existing_dimension,
+                self.embedding_dimensions,
+            )
+
+    def _index_metadata(self) -> dict[str, Any]:
+        return {
+            "channel": "summary",
+            "embedding_provider": self.embedding_provider,
+            "embedding_model": self.embedding_model,
+            "embedding_dimensions": self.embedding_dimensions,
+        }
+
+    @classmethod
+    def _validate_existing_index_dimension(
+        cls,
+        index_dir: str | Path,
+        embedding_dimensions: int,
+    ) -> None:
+        index_path = (
+            Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
+        )
+        if not index_path.exists():
+            return
+        index = SQLiteVecSemanticIndex(index_path)
+        try:
+            existing_dimension = index.dimension()
+        except Exception as exc:
+            raise RuntimeError(
+                "could not validate existing summary projection index config; "
+                f"refusing to reset {index_path}. Move the existing index "
+                "aside or rebuild it intentionally before changing embedding config."
+            ) from exc
+        if existing_dimension != embedding_dimensions:
+            raise cls._dimension_mismatch_error(
+                index_path,
+                existing_dimension,
+                embedding_dimensions,
+            )
+
+    @staticmethod
+    def _dimension_mismatch_error(
+        index_path: Path,
+        existing_dimension: int,
+        embedding_dimensions: int,
+    ) -> RuntimeError:
+        return RuntimeError(
+            "summary projection index dimension mismatch: "
+            f"{index_path} was built with dimension {existing_dimension}, "
+            f"but configured embedding_dimensions is {embedding_dimensions}. "
+            "Use the matching embedding config, or rebuild the projection index "
+            "at a new path after preserving the existing data."
+        )
+
+
 class EmbeddingCache:
    def __init__(self, db_path: Path):
        self.db_path = db_path
@ -308,8 +479,8 @@ def query_text_for_channel(channel: str, query: str, projection: QueryProjection
 def rank_single_semantic_channel(
    channel: str,
    results: list[SemanticSearchResult],
-) -> list[HybridProjectionCandidate]:
-    rows: list[HybridProjectionCandidate] = []
+) -> list[SemanticProjectionCandidate]:
+    rows: list[SemanticProjectionCandidate] = []
    seen: set[str] = set()
    for rank, result in enumerate(results, 1):
        doc_id = str(result.external_id or result.file_ref)
@ -317,7 +488,7 @@ def rank_single_semantic_channel(
            continue
        seen.add(doc_id)
        rows.append(
-            HybridProjectionCandidate(
+            SemanticProjectionCandidate(
                document_id=doc_id,
                score=1 / (60 + rank),
                sources=[{"channel": channel, "rank": rank, "distance": result.distance}],
--- a/pageindex/filesystem/structural_read.py
+++ b/pageindex/filesystem/structural_read.py
@ -1,77 +0,0 @@
-from __future__ import annotations
-
-from copy import deepcopy
-from typing import Any
-
-
-def strip_pageindex_text_fields(value: Any) -> Any:
-    if isinstance(value, list):
-        return [strip_pageindex_text_fields(item) for item in value]
-    if isinstance(value, dict):
-        return {
-            key: strip_pageindex_text_fields(item)
-            for key, item in value.items()
-            if key != "text"
-        }
-    return value
-
-
-def flatten_pageindex_structure_nodes(structure: Any) -> list[dict[str, Any]]:
-    rows: list[dict[str, Any]] = []
-
-    def visit(value: Any, *, depth: int, parent_node_id: str | None) -> None:
-        if isinstance(value, list):
-            for item in value:
-                visit(item, depth=depth, parent_node_id=parent_node_id)
-            return
-        if not isinstance(value, dict):
-            return
-
-        node_id = value.get("node_id")
-        child_values: list[Any] = []
-        for child_key in ("nodes", "children"):
-            children = value.get(child_key)
-            if isinstance(children, list):
-                child_values.extend(children)
-
-        row = {
-            key: strip_pageindex_text_fields(item)
-            for key, item in value.items()
-            if key not in {"text", "nodes", "children"}
-        }
-        row["depth"] = depth
-        row["children_count"] = len(child_values)
-        if parent_node_id:
-            row["parent_node_id"] = parent_node_id
-        rows.append(row)
-
-        next_parent = str(node_id) if node_id is not None else parent_node_id
-        for child in child_values:
-            visit(child, depth=depth + 1, parent_node_id=next_parent)
-
-    visit(structure, depth=0, parent_node_id=None)
-    return rows
-
-
-def find_pageindex_node(structure: Any, node_id: str) -> dict[str, Any] | None:
-    if isinstance(structure, dict):
-        if str(structure.get("node_id", "")) == str(node_id):
-            return deepcopy(structure)
-        for child_key in ("nodes", "children"):
-            found = find_pageindex_node(structure.get(child_key), node_id)
-            if found is not None:
-                return found
-    if isinstance(structure, list):
-        for item in structure:
-            found = find_pageindex_node(item, node_id)
-            if found is not None:
-                return found
-    return None
-
-
-def first_node_location(node: dict[str, Any]) -> str | None:
-    for key in ("line_num", "physical_index", "start_index"):
-        value = node.get(key)
-        if value is not None and value != "":
-            return str(value)
-    return None
--- a/tests/test_pageindex_filesystem_scope.py
+++ b/tests/test_pageindex_filesystem_scope.py
@ -7,7 +7,7 @@ import pytest
 def test_filesystem_lazy_exports_remain_public():
    import pageindex.filesystem as filesystem
    from pageindex.filesystem import (
-        HybridProjectionSearchBackend,
+        SemanticProjectionSearchBackend,
        RebuildableSemanticIndex,
        SemanticIndexRecord,
        SemanticSearchResult,
@ -16,7 +16,7 @@ def test_filesystem_lazy_exports_remain_public():
    )

    for name in (
-        "HybridProjectionSearchBackend",
+        "SemanticProjectionSearchBackend",
        "RebuildableSemanticIndex",
        "SemanticIndexRecord",
        "SemanticSearchResult",
@ -26,7 +26,7 @@ def test_filesystem_lazy_exports_remain_public():
        assert name in filesystem.__all__
        assert name in dir(filesystem)

-    assert HybridProjectionSearchBackend.__name__ == "HybridProjectionSearchBackend"
+    assert SemanticProjectionSearchBackend.__name__ == "SemanticProjectionSearchBackend"
    assert RebuildableSemanticIndex.__name__ == "RebuildableSemanticIndex"
    assert SemanticIndexRecord.__name__ == "SemanticIndexRecord"
    assert SemanticSearchResult.__name__ == "SemanticSearchResult"
@ -819,7 +819,7 @@ def test_existing_summary_projection_index_uses_current_config_when_dimensions_m

    monkeypatch.setattr(
        filesystem,
-        "configure_hybrid_projection_retrieval",
+        "configure_semantic_projection_retrieval",
        fake_configure,
    )

@ -876,7 +876,7 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(

    monkeypatch.setattr(
        filesystem,
-        "configure_hybrid_projection_retrieval",
+        "configure_semantic_projection_retrieval",
        fail_configure,
    )

@ -892,9 +892,9 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(

 def test_browse_semantic_files_uses_summary_projection_when_only_summary_available(tmp_path):
    from pageindex.filesystem import PageIndexFileSystem
-    from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
+    from pageindex.filesystem.semantic_projection import SemanticProjectionSearchBackend
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer

    class FixedEmbedder:
        def embed(self, texts):
@ -916,7 +916,7 @@ def test_browse_semantic_files_uses_summary_projection_when_only_summary_availab
        embedding_model="fake",
        embedding_dimensions=3,
    )
-    backend = HybridProjectionSearchBackend(
+    backend = SemanticProjectionSearchBackend(
        index_dir,
        embedder=FixedEmbedder(),
        embedding_provider="test",
--- a/tests/test_pifs_add_command.py
+++ b/tests/test_pifs_add_command.py
@ -25,7 +25,7 @@ class StaticEmbedder:


 def make_summary_indexer(workspace: Path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer

    return SummaryProjectionIndexer(
        workspace / "artifacts" / "projection_indexes",
--- a/tests/test_pifs_cli.py
+++ b/tests/test_pifs_cli.py
@ -37,7 +37,7 @@ def test_cli_workspace_without_projection_index_does_not_require_sqlite_vec(
    workspace = tmp_path / "workspace"
    real_import = builtins.__import__

-    monkeypatch.delitem(sys.modules, "pageindex.filesystem.hybrid_projection", raising=False)
+    monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_projection", raising=False)
    monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_index", raising=False)
    monkeypatch.delitem(sys.modules, "sqlite_vec", raising=False)

--- a/tests/test_semantic_index.py
+++ b/tests/test_semantic_index.py
@ -102,7 +102,7 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm


 def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer

    indexer = SummaryProjectionIndexer(
        tmp_path / "projection",
@ -134,7 +134,7 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):


 def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer

    indexer = SummaryProjectionIndexer(
        tmp_path / "projection",
@ -164,7 +164,7 @@ def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):


 def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer

    indexer = SummaryProjectionIndexer(
        tmp_path / "projection",
@ -188,7 +188,7 @@ def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):


 def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer

    index_dir = tmp_path / "projection"
    index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
@ -216,8 +216,8 @@ def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_pa
 def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embedder(
    tmp_path, monkeypatch
 ):
-    from pageindex.filesystem import projection_indexing
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem import semantic_projection
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer

    index_dir = tmp_path / "projection"
    index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
@ -234,14 +234,14 @@ def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embe
    def fail_make_embedder(*args, **kwargs):
        raise AssertionError("embedder should not be constructed before dimension validation")

-    monkeypatch.setattr(projection_indexing, "make_embedder", fail_make_embedder)
+    monkeypatch.setattr(semantic_projection, "make_embedder", fail_make_embedder)

    with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"):
        SummaryProjectionIndexer.from_provider(index_dir)


 def test_embedding_cache_key_separates_model_dimensions(tmp_path):
-    from pageindex.filesystem.hybrid_projection import (
+    from pageindex.filesystem.semantic_projection import (
        EmbeddingCache,
        embedding_cache_model_key,
    )
@ -285,7 +285,7 @@ def test_embedding_cache_key_separates_model_dimensions(tmp_path):


 def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer

    index_dir = tmp_path / "projection"
    index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
@ -328,7 +328,7 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path


 def test_hash_embedding_provider_is_not_available():
-    from pageindex.filesystem.hybrid_projection import make_embedder
+    from pageindex.filesystem.semantic_projection import make_embedder

    with pytest.raises(ValueError, match="unknown embedding provider: hash"):
        make_embedder("hash", "unused", dimensions=256, timeout=1)