From e368562e03d08b415f935f51cab146e5386ff208 Mon Sep 17 00:00:00 2001
From: BukeLy <bukely0119@foxmail.com>
Date: Sun, 31 May 2026 23:57:12 +0800
Subject: [PATCH] refactor(filesystem): consolidate semantic projection modules

---
 examples/pifs_demo.py                         |   2 +-
 pageindex/filesystem/__init__.py              |  10 +-
 pageindex/filesystem/core.py                  | 123 +++---------
 pageindex/filesystem/projection_indexing.py   | 179 -----------------
 ...d_projection.py => semantic_projection.py} | 187 +++++++++++++++++-
 pageindex/filesystem/structural_read.py       |  77 --------
 tests/test_pageindex_filesystem_scope.py      |  16 +-
 tests/test_pifs_add_command.py                |   2 +-
 tests/test_pifs_cli.py                        |   2 +-
 tests/test_semantic_index.py                  |  20 +-
 10 files changed, 233 insertions(+), 385 deletions(-)
 delete mode 100644 pageindex/filesystem/projection_indexing.py
 rename pageindex/filesystem/{hybrid_projection.py => semantic_projection.py} (68%)
 delete mode 100644 pageindex/filesystem/structural_read.py

diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py
index 87d2d45..d220654 100644
--- a/examples/pifs_demo.py
+++ b/examples/pifs_demo.py
@@ -338,7 +338,7 @@ def configure_summary_projection_backend(
 ) -> None:
     if not (filesystem.summary_projection_index_dir / "summary_only_vector.sqlite").exists():
         return
-    filesystem.configure_hybrid_projection_retrieval(
+    filesystem.configure_semantic_projection_retrieval(
         filesystem.summary_projection_index_dir,
         embedding_provider=embedding_provider,
         embedding_model=embedding_model,
diff --git a/pageindex/filesystem/__init__.py b/pageindex/filesystem/__init__.py
index 7908393..8cdf888 100644
--- a/pageindex/filesystem/__init__.py
+++ b/pageindex/filesystem/__init__.py
@@ -13,8 +13,8 @@ from .metadata_generation import (
 from .types import OpenResult, SearchResult
 
 if TYPE_CHECKING:
-    from .hybrid_projection import HybridProjectionSearchBackend
-    from .projection_indexing import SummaryProjectionIndexer
+    from .semantic_projection import SemanticProjectionSearchBackend
+    from .semantic_projection import SummaryProjectionIndexer
     from .semantic_index import (
         RebuildableSemanticIndex,
         SemanticIndexRecord,
@@ -23,17 +23,17 @@ if TYPE_CHECKING:
     )
 
 _LAZY_EXPORTS = {
-    "HybridProjectionSearchBackend": (".hybrid_projection", "HybridProjectionSearchBackend"),
+    "SemanticProjectionSearchBackend": (".semantic_projection", "SemanticProjectionSearchBackend"),
     "RebuildableSemanticIndex": (".semantic_index", "RebuildableSemanticIndex"),
     "SemanticIndexRecord": (".semantic_index", "SemanticIndexRecord"),
     "SemanticSearchResult": (".semantic_index", "SemanticSearchResult"),
     "SQLiteVecSemanticIndex": (".semantic_index", "SQLiteVecSemanticIndex"),
-    "SummaryProjectionIndexer": (".projection_indexing", "SummaryProjectionIndexer"),
+    "SummaryProjectionIndexer": (".semantic_projection", "SummaryProjectionIndexer"),
 }
 
 __all__ = [
     "OpenResult",
-    "HybridProjectionSearchBackend",
+    "SemanticProjectionSearchBackend",
     "MetadataGenerationBackend",
     "MetadataGenerationError",
     "MetadataGenerationInput",
diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py
index e7d0a91..5d2fc68 100644
--- a/pageindex/filesystem/core.py
+++ b/pageindex/filesystem/core.py
@@ -23,17 +23,11 @@ from .store import (
     metadata_text,
     normalize_path,
 )
-from .structural_read import (
-    flatten_pageindex_structure_nodes,
-    first_node_location,
-    find_pageindex_node,
-    strip_pageindex_text_fields,
-)
 from .types import OpenResult, SearchResult
 
 if TYPE_CHECKING:
     from ..client import PageIndexClient
-    from .projection_indexing import SummaryProjectionIndexer
+    from .semantic_projection import SummaryProjectionIndexer
 
 DEFAULT_METADATA_GENERATION_FIELDS = {
     "summary": True,
@@ -94,6 +88,18 @@ ADD_FILE_CONTENT_TYPES = {
 }
 
 
+def strip_pageindex_text_fields(value: Any) -> Any:
+    if isinstance(value, list):
+        return [strip_pageindex_text_fields(item) for item in value]
+    if isinstance(value, dict):
+        return {
+            key: strip_pageindex_text_fields(item)
+            for key, item in value.items()
+            if key != "text"
+        }
+    return value
+
+
 class PageIndexFileSystem:
     def __init__(
         self,
@@ -325,9 +331,9 @@ class PageIndexFileSystem:
                 model=self.metadata_model,
                 base_url=self.metadata_base_url,
                 max_text_chars=self.metadata_max_text_chars,
-            )
+        )
         if self.summary_projection_index and self.summary_projection_indexer is None:
-            from .projection_indexing import SummaryProjectionIndexer
+            from .semantic_projection import SummaryProjectionIndexer
 
             self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
                 self.summary_projection_index_dir,
@@ -337,7 +343,7 @@ class PageIndexFileSystem:
                 embedding_timeout=self.summary_projection_embedding_timeout,
             )
         if self.summary_projection_index and self.semantic_retrieval_backend is None:
-            self.configure_hybrid_projection_retrieval(
+            self.configure_semantic_projection_retrieval(
                 self.summary_projection_index_dir,
                 embedding_provider=self.summary_projection_embedding_provider,
                 embedding_model=self.summary_projection_embedding_model,
@@ -352,9 +358,9 @@ class PageIndexFileSystem:
                 model=self.metadata_model,
                 base_url=self.metadata_base_url,
                 max_text_chars=self.metadata_max_text_chars,
-            )
+        )
         if self.summary_projection_index and self.summary_projection_indexer is None:
-            from .projection_indexing import SummaryProjectionIndexer
+            from .semantic_projection import SummaryProjectionIndexer
 
             self.summary_projection_indexer = SummaryProjectionIndexer.from_provider(
                 self.summary_projection_index_dir,
@@ -368,12 +374,12 @@ class PageIndexFileSystem:
         indexer = self.summary_projection_indexer
         if indexer is None:
             raise RuntimeError("pifs add requires a summary projection indexer")
-        from .hybrid_projection import HybridProjectionSearchBackend
+        from .semantic_projection import SemanticProjectionSearchBackend
 
         index_dir = Path(getattr(indexer, "index_dir", self.summary_projection_index_dir))
         embedder = getattr(indexer, "embedder", None)
         if embedder is None:
-            self.configure_hybrid_projection_retrieval(
+            self.configure_semantic_projection_retrieval(
                 index_dir,
                 embedding_provider=str(
                     getattr(
@@ -396,7 +402,7 @@ class PageIndexFileSystem:
             )
         else:
             embedding_cache = getattr(indexer, "embedding_cache", None)
-            self.semantic_retrieval_backend = HybridProjectionSearchBackend(
+            self.semantic_retrieval_backend = SemanticProjectionSearchBackend(
                 index_dir,
                 embedder=embedder,
                 embedding_provider=str(
@@ -458,7 +464,7 @@ class PageIndexFileSystem:
                 f"{self.summary_projection_embedding_dimensions}. Rebuild the "
                 "projection index or use a matching embedding configuration."
             )
-        self.configure_hybrid_projection_retrieval(
+        self.configure_semantic_projection_retrieval(
             self.summary_projection_index_dir,
             embedding_provider=self.summary_projection_embedding_provider,
             embedding_model=self.summary_projection_embedding_model,
@@ -731,7 +737,7 @@ class PageIndexFileSystem:
             )
         return results
 
-    def configure_hybrid_projection_retrieval(
+    def configure_semantic_projection_retrieval(
         self,
         index_dir: Union[str, Path],
         *,
@@ -741,9 +747,9 @@ class PageIndexFileSystem:
         embedding_timeout: float = 60,
         fetch_multiplier: int = 100,
     ) -> Any:
-        from .hybrid_projection import HybridProjectionSearchBackend
+        from .semantic_projection import SemanticProjectionSearchBackend
 
-        self.semantic_retrieval_backend = HybridProjectionSearchBackend.from_provider(
+        self.semantic_retrieval_backend = SemanticProjectionSearchBackend.from_provider(
             index_dir,
             embedding_provider=embedding_provider,
             embedding_model=embedding_model,
@@ -795,7 +801,7 @@ class PageIndexFileSystem:
         if self._file_format(entry) in {"pdf", "markdown", "pageindex"}:
             raise ValueError(
                 "open() text artifact reads are not supported for PDF/Markdown PageIndex files; "
-                "use pageindex_structure(), pageindex_pages(), or pageindex_node()."
+                "use pageindex_structure() or pageindex_pages()."
             )
         if str(location).strip().lower() in {"all", "full", "*"}:
             return self._open_all(file_ref)
@@ -814,9 +820,6 @@ class PageIndexFileSystem:
     def pageindex_structure(
         self,
         target: str,
-        *,
-        offset: int = 0,
-        limit: int = 25,
     ) -> dict[str, Any]:
         file_ref = self._resolve_target(target)
         entry = self.store.get_file(file_ref)
@@ -838,12 +841,6 @@ class PageIndexFileSystem:
                 entry,
                 message=str(structure["error"]),
             )
-        node_rows = flatten_pageindex_structure_nodes(structure)
-        offset = max(0, offset)
-        limit = max(0, limit)
-        window = node_rows[offset : offset + limit] if limit else []
-        next_offset = offset + len(window)
-        has_more = next_offset < len(node_rows)
         return {
             "mode": "structure",
             "file_ref": file_ref,
@@ -852,67 +849,7 @@ class PageIndexFileSystem:
             "status": entry.pageindex_tree_status,
             "available": True,
             "pageindex_doc_id": doc_id,
-            "structure": window,
-            "structure_pagination": {
-                "offset": offset,
-                "limit": limit,
-                "returned_nodes": len(window),
-                "total_nodes": len(node_rows),
-                "has_more": has_more,
-                "next_offset": next_offset if has_more else None,
-            },
-        }
-
-    def pageindex_node(self, target: str, node_id: str) -> dict[str, Any]:
-        file_ref = self._resolve_target(target)
-        entry = self.store.get_file(file_ref)
-        self._require_pageindex_document_file(entry, "cat --node")
-        client, doc_id = self._pageindex_client_doc_for_entry(entry)
-        if doc_id is None:
-            return self._structural_unavailable(
-                "node",
-                entry,
-                node_id=node_id,
-                message=(
-                    "PageIndex structure is not cached for this file in the "
-                    "PageIndexClient workspace."
-                ),
-            )
-        client._ensure_doc_loaded(doc_id)
-        doc = client.documents.get(doc_id, {})
-        node = find_pageindex_node(doc.get("structure", []), node_id)
-        if node is None:
-            return self._structural_unavailable(
-                "node",
-                entry,
-                node_id=node_id,
-                message="PageIndex node was not found in the cached structure.",
-            )
-        text = str(node.get("text") or "")
-        if not text:
-            location = first_node_location(node)
-            if location:
-                content = self._client_json(client.get_page_content(doc_id, location))
-                if isinstance(content, list):
-                    text = "\n\n".join(str(page.get("content") or "") for page in content)
-        if not text:
-            return self._structural_unavailable(
-                "node",
-                entry,
-                node_id=node_id,
-                message="Cached PageIndex node has no text content.",
-            )
-        return {
-            "mode": "node",
-            "file_ref": file_ref,
-            "external_id": entry.external_id,
-            "source_path": entry.source_path,
-            "status": entry.pageindex_tree_status,
-            "available": True,
-            "pageindex_doc_id": doc_id,
-            "node_id": node_id,
-            "node": strip_pageindex_text_fields(node),
-            "text": text,
+            "structure": strip_pageindex_text_fields(structure),
         }
 
     def pageindex_pages(self, target: str, pages: str) -> dict[str, Any]:
@@ -970,8 +907,7 @@ class PageIndexFileSystem:
             f"{command} is only supported for txt/text files; "
             f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
             "Use cat <path|file_ref|document_id> --structure, "
-            "cat <path|file_ref|document_id> --page, or "
-            "cat <path|file_ref|document_id> --node for PDF/Markdown PageIndex files."
+            "or cat <path|file_ref|document_id> --page for PDF/Markdown PageIndex files."
         )
 
     def _require_pageindex_document_file(self, entry: Any, command: str) -> None:
@@ -1748,7 +1684,6 @@ class PageIndexFileSystem:
         entry: Any,
         *,
         message: str,
-        node_id: str | None = None,
         pages: str | None = None,
     ) -> dict[str, Any]:
         pageindex_tree_error = cls._pageindex_tree_failure_message(entry.metadata_status)
@@ -1765,8 +1700,6 @@ class PageIndexFileSystem:
         }
         if pageindex_tree_error:
             result["pageindex_tree_error"] = pageindex_tree_error
-        if node_id is not None:
-            result["node_id"] = node_id
         if pages is not None:
             result["pages"] = pages
         return result
diff --git a/pageindex/filesystem/projection_indexing.py b/pageindex/filesystem/projection_indexing.py
deleted file mode 100644
index 3375f41..0000000
--- a/pageindex/filesystem/projection_indexing.py
+++ /dev/null
@@ -1,179 +0,0 @@
-from __future__ import annotations
-
-from pathlib import Path
-from typing import Any
-
-from .core import DEFAULT_EMBEDDING_DIMENSIONS
-from .hybrid_projection import (
-    EmbeddingCache,
-    INDEX_BY_CHANNEL,
-    embedding_cache_model_key,
-    make_embedder,
-)
-from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexRecord
-
-
-class SummaryProjectionIndexer:
-    """Synchronous register-time summary projection indexer."""
-
-    def __init__(
-        self,
-        index_dir: str | Path,
-        *,
-        embedder: Any,
-        embedding_provider: str,
-        embedding_model: str,
-        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
-        embedding_cache_path: str | Path | None = None,
-    ) -> None:
-        self.index_dir = Path(index_dir).expanduser()
-        self.index_dir.mkdir(parents=True, exist_ok=True)
-        self.embedder = embedder
-        self.embedding_provider = embedding_provider
-        self.embedding_model = embedding_model
-        self.embedding_dimensions = embedding_dimensions
-        self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)
-        self.embedding_cache = EmbeddingCache(
-            Path(embedding_cache_path).expanduser()
-            if embedding_cache_path is not None
-            else self.index_dir / "embedding_cache.sqlite"
-        )
-        self.index = SQLiteVecSemanticIndex(
-            self.index_dir / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
-        )
-        self._ensure_index()
-
-    @classmethod
-    def from_provider(
-        cls,
-        index_dir: str | Path,
-        *,
-        embedding_provider: str = "openai",
-        embedding_model: str = "text-embedding-3-small",
-        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
-        embedding_timeout: float = 60,
-        **kwargs: Any,
-    ) -> "SummaryProjectionIndexer":
-        cls._validate_existing_index_dimension(index_dir, embedding_dimensions)
-        return cls(
-            index_dir,
-            embedder=make_embedder(
-                embedding_provider,
-                embedding_model,
-                dimensions=embedding_dimensions,
-                timeout=embedding_timeout,
-            ),
-            embedding_provider=embedding_provider,
-            embedding_model=embedding_model,
-            embedding_dimensions=embedding_dimensions,
-            **kwargs,
-        )
-
-    def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]:
-        summary = str((record.get("metadata") or {}).get("summary") or "").strip()
-        if not summary:
-            return {"status": "skipped", "reason": "missing_summary"}
-        vector = self.embedding_cache.embed_texts(
-            [summary],
-            provider=self.embedding_provider,
-            model=self.cache_model,
-            embedder=self.embedder,
-            batch_size=1,
-        )[0]
-        metadata = dict(record.get("metadata") or {})
-        count = self.index.upsert_many(
-            [
-                SemanticIndexRecord(
-                    file_ref=str(record["file_ref"]),
-                    vector=vector,
-                    text=summary,
-                    external_id=record.get("external_id"),
-                    source_type=str(record.get("source_type") or ""),
-                    source_path=str(record.get("source_path") or ""),
-                    title=str(record.get("title") or ""),
-                    metadata=metadata,
-                )
-            ]
-        )
-        return {
-            "status": "ready",
-            "indexed_rows": count,
-            "index_path": str(self.index.db_path),
-            "embedding_provider": self.embedding_provider,
-            "embedding_model": self.embedding_model,
-            "embedding_dimensions": self.embedding_dimensions,
-        }
-
-    def delete_summary(self, file_ref: str) -> int:
-        return self.index.delete_file_refs([file_ref])
-
-    def _ensure_index(self) -> None:
-        if not self.index.db_path.exists():
-            self.index.reset(
-                dimension=self.embedding_dimensions,
-                metadata=self._index_metadata(),
-            )
-            return
-        try:
-            existing_dimension = self.index.dimension()
-        except Exception as exc:
-            raise RuntimeError(
-                "could not validate existing summary projection index config; "
-                f"refusing to reset {self.index.db_path}. Move the existing index "
-                "aside or rebuild it intentionally before changing embedding config."
-            ) from exc
-        if existing_dimension != self.embedding_dimensions:
-            raise self._dimension_mismatch_error(
-                self.index.db_path,
-                existing_dimension,
-                self.embedding_dimensions,
-            )
-
-    def _index_metadata(self) -> dict[str, Any]:
-        return {
-            "channel": "summary",
-            "embedding_provider": self.embedding_provider,
-            "embedding_model": self.embedding_model,
-            "embedding_dimensions": self.embedding_dimensions,
-        }
-
-    @classmethod
-    def _validate_existing_index_dimension(
-        cls,
-        index_dir: str | Path,
-        embedding_dimensions: int,
-    ) -> None:
-        index_path = (
-            Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
-        )
-        if not index_path.exists():
-            return
-        index = SQLiteVecSemanticIndex(index_path)
-        try:
-            existing_dimension = index.dimension()
-        except Exception as exc:
-            raise RuntimeError(
-                "could not validate existing summary projection index config; "
-                f"refusing to reset {index_path}. Move the existing index "
-                "aside or rebuild it intentionally before changing embedding config."
-            ) from exc
-        if existing_dimension != embedding_dimensions:
-            raise cls._dimension_mismatch_error(
-                index_path,
-                existing_dimension,
-                embedding_dimensions,
-            )
-
-    @staticmethod
-    def _dimension_mismatch_error(
-        index_path: Path,
-        existing_dimension: int,
-        embedding_dimensions: int,
-    ) -> RuntimeError:
-        return RuntimeError(
-            "summary projection index dimension mismatch: "
-            f"{index_path} was built with dimension {existing_dimension}, "
-            f"but configured embedding_dimensions is {embedding_dimensions}. "
-            "Use the matching embedding config, or rebuild the projection index "
-            "at a new path after preserving the existing data."
-        )
diff --git a/pageindex/filesystem/hybrid_projection.py b/pageindex/filesystem/semantic_projection.py
similarity index 68%
rename from pageindex/filesystem/hybrid_projection.py
rename to pageindex/filesystem/semantic_projection.py
index 348aa20..6059a9b 100644
--- a/pageindex/filesystem/hybrid_projection.py
+++ b/pageindex/filesystem/semantic_projection.py
@@ -11,7 +11,12 @@ from pathlib import Path
 from typing import Any
 
 from .core import DEFAULT_EMBEDDING_DIMENSIONS
-from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult
+from .semantic_index import (
+    SQLiteVecSemanticIndex,
+    SemanticIndexError,
+    SemanticIndexRecord,
+    SemanticSearchResult,
+)
 
 
 INDEX_BY_CHANNEL = {
@@ -29,7 +34,7 @@ class QueryProjection:
 
 
 @dataclass(frozen=True)
-class HybridProjectionCandidate:
+class SemanticProjectionCandidate:
     document_id: str
     score: float
     sources: list[dict[str, Any]]
@@ -40,7 +45,7 @@ class HybridProjectionCandidate:
     snippet: str
 
 
-class HybridProjectionSearchBackend:
+class SemanticProjectionSearchBackend:
     """Semantic channel retrieval over rebuildable projection indexes.
 
     The SQLite catalog remains the source of truth. This backend only reads
@@ -86,7 +91,7 @@ class HybridProjectionSearchBackend:
         embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
         embedding_timeout: float = 60,
         **kwargs: Any,
-    ) -> "HybridProjectionSearchBackend":
+    ) -> "SemanticProjectionSearchBackend":
         return cls(
             index_dir,
             embedder=make_embedder(
@@ -108,7 +113,7 @@ class HybridProjectionSearchBackend:
         *,
         limit: int = 10,
         filters: dict[str, Any] | None = None,
-    ) -> list[HybridProjectionCandidate]:
+    ) -> list[SemanticProjectionCandidate]:
         if channel not in SEMANTIC_TOOL_CHANNELS:
             raise ValueError(f"unsupported semantic channel: {channel}")
         if channel not in self.available_channels():
@@ -180,6 +185,172 @@ class HybridProjectionSearchBackend:
         return {**info, "available": int(info.get("document_count") or 0) > 0}
 
 
+class SummaryProjectionIndexer:
+    """Synchronous register-time summary projection indexer."""
+
+    def __init__(
+        self,
+        index_dir: str | Path,
+        *,
+        embedder: Any,
+        embedding_provider: str,
+        embedding_model: str,
+        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
+        embedding_cache_path: str | Path | None = None,
+    ) -> None:
+        self.index_dir = Path(index_dir).expanduser()
+        self.index_dir.mkdir(parents=True, exist_ok=True)
+        self.embedder = embedder
+        self.embedding_provider = embedding_provider
+        self.embedding_model = embedding_model
+        self.embedding_dimensions = embedding_dimensions
+        self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions)
+        self.embedding_cache = EmbeddingCache(
+            Path(embedding_cache_path).expanduser()
+            if embedding_cache_path is not None
+            else self.index_dir / "embedding_cache.sqlite"
+        )
+        self.index = SQLiteVecSemanticIndex(
+            self.index_dir / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
+        )
+        self._ensure_index()
+
+    @classmethod
+    def from_provider(
+        cls,
+        index_dir: str | Path,
+        *,
+        embedding_provider: str = "openai",
+        embedding_model: str = "text-embedding-3-small",
+        embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
+        embedding_timeout: float = 60,
+        **kwargs: Any,
+    ) -> "SummaryProjectionIndexer":
+        cls._validate_existing_index_dimension(index_dir, embedding_dimensions)
+        return cls(
+            index_dir,
+            embedder=make_embedder(
+                embedding_provider,
+                embedding_model,
+                dimensions=embedding_dimensions,
+                timeout=embedding_timeout,
+            ),
+            embedding_provider=embedding_provider,
+            embedding_model=embedding_model,
+            embedding_dimensions=embedding_dimensions,
+            **kwargs,
+        )
+
+    def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]:
+        summary = str((record.get("metadata") or {}).get("summary") or "").strip()
+        if not summary:
+            return {"status": "skipped", "reason": "missing_summary"}
+        vector = self.embedding_cache.embed_texts(
+            [summary],
+            provider=self.embedding_provider,
+            model=self.cache_model,
+            embedder=self.embedder,
+            batch_size=1,
+        )[0]
+        metadata = dict(record.get("metadata") or {})
+        count = self.index.upsert_many(
+            [
+                SemanticIndexRecord(
+                    file_ref=str(record["file_ref"]),
+                    vector=vector,
+                    text=summary,
+                    external_id=record.get("external_id"),
+                    source_type=str(record.get("source_type") or ""),
+                    source_path=str(record.get("source_path") or ""),
+                    title=str(record.get("title") or ""),
+                    metadata=metadata,
+                )
+            ]
+        )
+        return {
+            "status": "ready",
+            "indexed_rows": count,
+            "index_path": str(self.index.db_path),
+            "embedding_provider": self.embedding_provider,
+            "embedding_model": self.embedding_model,
+            "embedding_dimensions": self.embedding_dimensions,
+        }
+
+    def delete_summary(self, file_ref: str) -> int:
+        return self.index.delete_file_refs([file_ref])
+
+    def _ensure_index(self) -> None:
+        if not self.index.db_path.exists():
+            self.index.reset(
+                dimension=self.embedding_dimensions,
+                metadata=self._index_metadata(),
+            )
+            return
+        try:
+            existing_dimension = self.index.dimension()
+        except Exception as exc:
+            raise RuntimeError(
+                "could not validate existing summary projection index config; "
+                f"refusing to reset {self.index.db_path}. Move the existing index "
+                "aside or rebuild it intentionally before changing embedding config."
+            ) from exc
+        if existing_dimension != self.embedding_dimensions:
+            raise self._dimension_mismatch_error(
+                self.index.db_path,
+                existing_dimension,
+                self.embedding_dimensions,
+            )
+
+    def _index_metadata(self) -> dict[str, Any]:
+        return {
+            "channel": "summary",
+            "embedding_provider": self.embedding_provider,
+            "embedding_model": self.embedding_model,
+            "embedding_dimensions": self.embedding_dimensions,
+        }
+
+    @classmethod
+    def _validate_existing_index_dimension(
+        cls,
+        index_dir: str | Path,
+        embedding_dimensions: int,
+    ) -> None:
+        index_path = (
+            Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite"
+        )
+        if not index_path.exists():
+            return
+        index = SQLiteVecSemanticIndex(index_path)
+        try:
+            existing_dimension = index.dimension()
+        except Exception as exc:
+            raise RuntimeError(
+                "could not validate existing summary projection index config; "
+                f"refusing to reset {index_path}. Move the existing index "
+                "aside or rebuild it intentionally before changing embedding config."
+            ) from exc
+        if existing_dimension != embedding_dimensions:
+            raise cls._dimension_mismatch_error(
+                index_path,
+                existing_dimension,
+                embedding_dimensions,
+            )
+
+    @staticmethod
+    def _dimension_mismatch_error(
+        index_path: Path,
+        existing_dimension: int,
+        embedding_dimensions: int,
+    ) -> RuntimeError:
+        return RuntimeError(
+            "summary projection index dimension mismatch: "
+            f"{index_path} was built with dimension {existing_dimension}, "
+            f"but configured embedding_dimensions is {embedding_dimensions}. "
+            "Use the matching embedding config, or rebuild the projection index "
+            "at a new path after preserving the existing data."
+        )
+
+
 class EmbeddingCache:
     def __init__(self, db_path: Path):
         self.db_path = db_path
@@ -308,8 +479,8 @@ def query_text_for_channel(channel: str, query: str, projection: QueryProjection
 def rank_single_semantic_channel(
     channel: str,
     results: list[SemanticSearchResult],
-) -> list[HybridProjectionCandidate]:
-    rows: list[HybridProjectionCandidate] = []
+) -> list[SemanticProjectionCandidate]:
+    rows: list[SemanticProjectionCandidate] = []
     seen: set[str] = set()
     for rank, result in enumerate(results, 1):
         doc_id = str(result.external_id or result.file_ref)
@@ -317,7 +488,7 @@ def rank_single_semantic_channel(
             continue
         seen.add(doc_id)
         rows.append(
-            HybridProjectionCandidate(
+            SemanticProjectionCandidate(
                 document_id=doc_id,
                 score=1 / (60 + rank),
                 sources=[{"channel": channel, "rank": rank, "distance": result.distance}],
diff --git a/pageindex/filesystem/structural_read.py b/pageindex/filesystem/structural_read.py
deleted file mode 100644
index aca2bcd..0000000
--- a/pageindex/filesystem/structural_read.py
+++ /dev/null
@@ -1,77 +0,0 @@
-from __future__ import annotations
-
-from copy import deepcopy
-from typing import Any
-
-
-def strip_pageindex_text_fields(value: Any) -> Any:
-    if isinstance(value, list):
-        return [strip_pageindex_text_fields(item) for item in value]
-    if isinstance(value, dict):
-        return {
-            key: strip_pageindex_text_fields(item)
-            for key, item in value.items()
-            if key != "text"
-        }
-    return value
-
-
-def flatten_pageindex_structure_nodes(structure: Any) -> list[dict[str, Any]]:
-    rows: list[dict[str, Any]] = []
-
-    def visit(value: Any, *, depth: int, parent_node_id: str | None) -> None:
-        if isinstance(value, list):
-            for item in value:
-                visit(item, depth=depth, parent_node_id=parent_node_id)
-            return
-        if not isinstance(value, dict):
-            return
-
-        node_id = value.get("node_id")
-        child_values: list[Any] = []
-        for child_key in ("nodes", "children"):
-            children = value.get(child_key)
-            if isinstance(children, list):
-                child_values.extend(children)
-
-        row = {
-            key: strip_pageindex_text_fields(item)
-            for key, item in value.items()
-            if key not in {"text", "nodes", "children"}
-        }
-        row["depth"] = depth
-        row["children_count"] = len(child_values)
-        if parent_node_id:
-            row["parent_node_id"] = parent_node_id
-        rows.append(row)
-
-        next_parent = str(node_id) if node_id is not None else parent_node_id
-        for child in child_values:
-            visit(child, depth=depth + 1, parent_node_id=next_parent)
-
-    visit(structure, depth=0, parent_node_id=None)
-    return rows
-
-
-def find_pageindex_node(structure: Any, node_id: str) -> dict[str, Any] | None:
-    if isinstance(structure, dict):
-        if str(structure.get("node_id", "")) == str(node_id):
-            return deepcopy(structure)
-        for child_key in ("nodes", "children"):
-            found = find_pageindex_node(structure.get(child_key), node_id)
-            if found is not None:
-                return found
-    if isinstance(structure, list):
-        for item in structure:
-            found = find_pageindex_node(item, node_id)
-            if found is not None:
-                return found
-    return None
-
-
-def first_node_location(node: dict[str, Any]) -> str | None:
-    for key in ("line_num", "physical_index", "start_index"):
-        value = node.get(key)
-        if value is not None and value != "":
-            return str(value)
-    return None
diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py
index b5b9491..0ce9f39 100644
--- a/tests/test_pageindex_filesystem_scope.py
+++ b/tests/test_pageindex_filesystem_scope.py
@@ -7,7 +7,7 @@ import pytest
 def test_filesystem_lazy_exports_remain_public():
     import pageindex.filesystem as filesystem
     from pageindex.filesystem import (
-        HybridProjectionSearchBackend,
+        SemanticProjectionSearchBackend,
         RebuildableSemanticIndex,
         SemanticIndexRecord,
         SemanticSearchResult,
@@ -16,7 +16,7 @@ def test_filesystem_lazy_exports_remain_public():
     )
 
     for name in (
-        "HybridProjectionSearchBackend",
+        "SemanticProjectionSearchBackend",
         "RebuildableSemanticIndex",
         "SemanticIndexRecord",
         "SemanticSearchResult",
@@ -26,7 +26,7 @@ def test_filesystem_lazy_exports_remain_public():
         assert name in filesystem.__all__
         assert name in dir(filesystem)
 
-    assert HybridProjectionSearchBackend.__name__ == "HybridProjectionSearchBackend"
+    assert SemanticProjectionSearchBackend.__name__ == "SemanticProjectionSearchBackend"
     assert RebuildableSemanticIndex.__name__ == "RebuildableSemanticIndex"
     assert SemanticIndexRecord.__name__ == "SemanticIndexRecord"
     assert SemanticSearchResult.__name__ == "SemanticSearchResult"
@@ -819,7 +819,7 @@ def test_existing_summary_projection_index_uses_current_config_when_dimensions_m
 
     monkeypatch.setattr(
         filesystem,
-        "configure_hybrid_projection_retrieval",
+        "configure_semantic_projection_retrieval",
         fake_configure,
     )
 
@@ -876,7 +876,7 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
 
     monkeypatch.setattr(
         filesystem,
-        "configure_hybrid_projection_retrieval",
+        "configure_semantic_projection_retrieval",
         fail_configure,
     )
 
@@ -892,9 +892,9 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
 
 def test_browse_semantic_files_uses_summary_projection_when_only_summary_available(tmp_path):
     from pageindex.filesystem import PageIndexFileSystem
-    from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
+    from pageindex.filesystem.semantic_projection import SemanticProjectionSearchBackend
     from pageindex.filesystem.metadata_generation import MetadataGenerationResult
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
 
     class FixedEmbedder:
         def embed(self, texts):
@@ -916,7 +916,7 @@ def test_browse_semantic_files_uses_summary_projection_when_only_summary_availab
         embedding_model="fake",
         embedding_dimensions=3,
     )
-    backend = HybridProjectionSearchBackend(
+    backend = SemanticProjectionSearchBackend(
         index_dir,
         embedder=FixedEmbedder(),
         embedding_provider="test",
diff --git a/tests/test_pifs_add_command.py b/tests/test_pifs_add_command.py
index 1679431..4161b80 100644
--- a/tests/test_pifs_add_command.py
+++ b/tests/test_pifs_add_command.py
@@ -25,7 +25,7 @@ class StaticEmbedder:
 
 
 def make_summary_indexer(workspace: Path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
 
     return SummaryProjectionIndexer(
         workspace / "artifacts" / "projection_indexes",
diff --git a/tests/test_pifs_cli.py b/tests/test_pifs_cli.py
index 3437a3c..405f5b9 100644
--- a/tests/test_pifs_cli.py
+++ b/tests/test_pifs_cli.py
@@ -37,7 +37,7 @@ def test_cli_workspace_without_projection_index_does_not_require_sqlite_vec(
     workspace = tmp_path / "workspace"
     real_import = builtins.__import__
 
-    monkeypatch.delitem(sys.modules, "pageindex.filesystem.hybrid_projection", raising=False)
+    monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_projection", raising=False)
     monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_index", raising=False)
     monkeypatch.delitem(sys.modules, "sqlite_vec", raising=False)
 
diff --git a/tests/test_semantic_index.py b/tests/test_semantic_index.py
index 6cdc0e1..c1da0dc 100644
--- a/tests/test_semantic_index.py
+++ b/tests/test_semantic_index.py
@@ -102,7 +102,7 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm
 
 
 def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
 
     indexer = SummaryProjectionIndexer(
         tmp_path / "projection",
@@ -134,7 +134,7 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
 
 
 def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
 
     indexer = SummaryProjectionIndexer(
         tmp_path / "projection",
@@ -164,7 +164,7 @@ def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
 
 
 def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
 
     indexer = SummaryProjectionIndexer(
         tmp_path / "projection",
@@ -188,7 +188,7 @@ def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
 
 
 def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
 
     index_dir = tmp_path / "projection"
     index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
@@ -216,8 +216,8 @@ def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_pa
 def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embedder(
     tmp_path, monkeypatch
 ):
-    from pageindex.filesystem import projection_indexing
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem import semantic_projection
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
 
     index_dir = tmp_path / "projection"
     index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
@@ -234,14 +234,14 @@ def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embe
     def fail_make_embedder(*args, **kwargs):
         raise AssertionError("embedder should not be constructed before dimension validation")
 
-    monkeypatch.setattr(projection_indexing, "make_embedder", fail_make_embedder)
+    monkeypatch.setattr(semantic_projection, "make_embedder", fail_make_embedder)
 
     with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"):
         SummaryProjectionIndexer.from_provider(index_dir)
 
 
 def test_embedding_cache_key_separates_model_dimensions(tmp_path):
-    from pageindex.filesystem.hybrid_projection import (
+    from pageindex.filesystem.semantic_projection import (
         EmbeddingCache,
         embedding_cache_model_key,
     )
@@ -285,7 +285,7 @@ def test_embedding_cache_key_separates_model_dimensions(tmp_path):
 
 
 def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path):
-    from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
+    from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer
 
     index_dir = tmp_path / "projection"
     index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite")
@@ -328,7 +328,7 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path
 
 
 def test_hash_embedding_provider_is_not_available():
-    from pageindex.filesystem.hybrid_projection import make_embedder
+    from pageindex.filesystem.semantic_projection import make_embedder
 
     with pytest.raises(ValueError, match="unknown embedding provider: hash"):
         make_embedder("hash", "unused", dimensions=256, timeout=1)