From e368562e03d08b415f935f51cab146e5386ff208 Mon Sep 17 00:00:00 2001 From: BukeLy Date: Sun, 31 May 2026 23:57:12 +0800 Subject: [PATCH] refactor(filesystem): consolidate semantic projection modules --- examples/pifs_demo.py | 2 +- pageindex/filesystem/__init__.py | 10 +- pageindex/filesystem/core.py | 123 +++--------- pageindex/filesystem/projection_indexing.py | 179 ----------------- ...d_projection.py => semantic_projection.py} | 187 +++++++++++++++++- pageindex/filesystem/structural_read.py | 77 -------- tests/test_pageindex_filesystem_scope.py | 16 +- tests/test_pifs_add_command.py | 2 +- tests/test_pifs_cli.py | 2 +- tests/test_semantic_index.py | 20 +- 10 files changed, 233 insertions(+), 385 deletions(-) delete mode 100644 pageindex/filesystem/projection_indexing.py rename pageindex/filesystem/{hybrid_projection.py => semantic_projection.py} (68%) delete mode 100644 pageindex/filesystem/structural_read.py diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py index 87d2d45..d220654 100644 --- a/examples/pifs_demo.py +++ b/examples/pifs_demo.py @@ -338,7 +338,7 @@ def configure_summary_projection_backend( ) -> None: if not (filesystem.summary_projection_index_dir / "summary_only_vector.sqlite").exists(): return - filesystem.configure_hybrid_projection_retrieval( + filesystem.configure_semantic_projection_retrieval( filesystem.summary_projection_index_dir, embedding_provider=embedding_provider, embedding_model=embedding_model, diff --git a/pageindex/filesystem/__init__.py b/pageindex/filesystem/__init__.py index 7908393..8cdf888 100644 --- a/pageindex/filesystem/__init__.py +++ b/pageindex/filesystem/__init__.py @@ -13,8 +13,8 @@ from .metadata_generation import ( from .types import OpenResult, SearchResult if TYPE_CHECKING: - from .hybrid_projection import HybridProjectionSearchBackend - from .projection_indexing import SummaryProjectionIndexer + from .semantic_projection import SemanticProjectionSearchBackend + from .semantic_projection import SummaryProjectionIndexer from .semantic_index import ( RebuildableSemanticIndex, SemanticIndexRecord, @@ -23,17 +23,17 @@ if TYPE_CHECKING: ) _LAZY_EXPORTS = { - "HybridProjectionSearchBackend": (".hybrid_projection", "HybridProjectionSearchBackend"), + "SemanticProjectionSearchBackend": (".semantic_projection", "SemanticProjectionSearchBackend"), "RebuildableSemanticIndex": (".semantic_index", "RebuildableSemanticIndex"), "SemanticIndexRecord": (".semantic_index", "SemanticIndexRecord"), "SemanticSearchResult": (".semantic_index", "SemanticSearchResult"), "SQLiteVecSemanticIndex": (".semantic_index", "SQLiteVecSemanticIndex"), - "SummaryProjectionIndexer": (".projection_indexing", "SummaryProjectionIndexer"), + "SummaryProjectionIndexer": (".semantic_projection", "SummaryProjectionIndexer"), } __all__ = [ "OpenResult", - "HybridProjectionSearchBackend", + "SemanticProjectionSearchBackend", "MetadataGenerationBackend", "MetadataGenerationError", "MetadataGenerationInput", diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index e7d0a91..5d2fc68 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -23,17 +23,11 @@ from .store import ( metadata_text, normalize_path, ) -from .structural_read import ( - flatten_pageindex_structure_nodes, - first_node_location, - find_pageindex_node, - strip_pageindex_text_fields, -) from .types import OpenResult, SearchResult if TYPE_CHECKING: from ..client import PageIndexClient - from .projection_indexing import SummaryProjectionIndexer + from .semantic_projection import SummaryProjectionIndexer DEFAULT_METADATA_GENERATION_FIELDS = { "summary": True, @@ -94,6 +88,18 @@ ADD_FILE_CONTENT_TYPES = { } +def strip_pageindex_text_fields(value: Any) -> Any: + if isinstance(value, list): + return [strip_pageindex_text_fields(item) for item in value] + if isinstance(value, dict): + return { + key: strip_pageindex_text_fields(item) + for key, item in value.items() + if key != "text" + } + return value + + class PageIndexFileSystem: def __init__( self, @@ -325,9 +331,9 @@ class PageIndexFileSystem: model=self.metadata_model, base_url=self.metadata_base_url, max_text_chars=self.metadata_max_text_chars, - ) + ) if self.summary_projection_index and self.summary_projection_indexer is None: - from .projection_indexing import SummaryProjectionIndexer + from .semantic_projection import SummaryProjectionIndexer self.summary_projection_indexer = SummaryProjectionIndexer.from_provider( self.summary_projection_index_dir, @@ -337,7 +343,7 @@ class PageIndexFileSystem: embedding_timeout=self.summary_projection_embedding_timeout, ) if self.summary_projection_index and self.semantic_retrieval_backend is None: - self.configure_hybrid_projection_retrieval( + self.configure_semantic_projection_retrieval( self.summary_projection_index_dir, embedding_provider=self.summary_projection_embedding_provider, embedding_model=self.summary_projection_embedding_model, @@ -352,9 +358,9 @@ class PageIndexFileSystem: model=self.metadata_model, base_url=self.metadata_base_url, max_text_chars=self.metadata_max_text_chars, - ) + ) if self.summary_projection_index and self.summary_projection_indexer is None: - from .projection_indexing import SummaryProjectionIndexer + from .semantic_projection import SummaryProjectionIndexer self.summary_projection_indexer = SummaryProjectionIndexer.from_provider( self.summary_projection_index_dir, @@ -368,12 +374,12 @@ class PageIndexFileSystem: indexer = self.summary_projection_indexer if indexer is None: raise RuntimeError("pifs add requires a summary projection indexer") - from .hybrid_projection import HybridProjectionSearchBackend + from .semantic_projection import SemanticProjectionSearchBackend index_dir = Path(getattr(indexer, "index_dir", self.summary_projection_index_dir)) embedder = getattr(indexer, "embedder", None) if embedder is None: - self.configure_hybrid_projection_retrieval( + self.configure_semantic_projection_retrieval( index_dir, embedding_provider=str( getattr( @@ -396,7 +402,7 @@ class PageIndexFileSystem: ) else: embedding_cache = getattr(indexer, "embedding_cache", None) - self.semantic_retrieval_backend = HybridProjectionSearchBackend( + self.semantic_retrieval_backend = SemanticProjectionSearchBackend( index_dir, embedder=embedder, embedding_provider=str( @@ -458,7 +464,7 @@ class PageIndexFileSystem: f"{self.summary_projection_embedding_dimensions}. Rebuild the " "projection index or use a matching embedding configuration." ) - self.configure_hybrid_projection_retrieval( + self.configure_semantic_projection_retrieval( self.summary_projection_index_dir, embedding_provider=self.summary_projection_embedding_provider, embedding_model=self.summary_projection_embedding_model, @@ -731,7 +737,7 @@ class PageIndexFileSystem: ) return results - def configure_hybrid_projection_retrieval( + def configure_semantic_projection_retrieval( self, index_dir: Union[str, Path], *, @@ -741,9 +747,9 @@ class PageIndexFileSystem: embedding_timeout: float = 60, fetch_multiplier: int = 100, ) -> Any: - from .hybrid_projection import HybridProjectionSearchBackend + from .semantic_projection import SemanticProjectionSearchBackend - self.semantic_retrieval_backend = HybridProjectionSearchBackend.from_provider( + self.semantic_retrieval_backend = SemanticProjectionSearchBackend.from_provider( index_dir, embedding_provider=embedding_provider, embedding_model=embedding_model, @@ -795,7 +801,7 @@ class PageIndexFileSystem: if self._file_format(entry) in {"pdf", "markdown", "pageindex"}: raise ValueError( "open() text artifact reads are not supported for PDF/Markdown PageIndex files; " - "use pageindex_structure(), pageindex_pages(), or pageindex_node()." + "use pageindex_structure() or pageindex_pages()." ) if str(location).strip().lower() in {"all", "full", "*"}: return self._open_all(file_ref) @@ -814,9 +820,6 @@ class PageIndexFileSystem: def pageindex_structure( self, target: str, - *, - offset: int = 0, - limit: int = 25, ) -> dict[str, Any]: file_ref = self._resolve_target(target) entry = self.store.get_file(file_ref) @@ -838,12 +841,6 @@ class PageIndexFileSystem: entry, message=str(structure["error"]), ) - node_rows = flatten_pageindex_structure_nodes(structure) - offset = max(0, offset) - limit = max(0, limit) - window = node_rows[offset : offset + limit] if limit else [] - next_offset = offset + len(window) - has_more = next_offset < len(node_rows) return { "mode": "structure", "file_ref": file_ref, @@ -852,67 +849,7 @@ class PageIndexFileSystem: "status": entry.pageindex_tree_status, "available": True, "pageindex_doc_id": doc_id, - "structure": window, - "structure_pagination": { - "offset": offset, - "limit": limit, - "returned_nodes": len(window), - "total_nodes": len(node_rows), - "has_more": has_more, - "next_offset": next_offset if has_more else None, - }, - } - - def pageindex_node(self, target: str, node_id: str) -> dict[str, Any]: - file_ref = self._resolve_target(target) - entry = self.store.get_file(file_ref) - self._require_pageindex_document_file(entry, "cat --node") - client, doc_id = self._pageindex_client_doc_for_entry(entry) - if doc_id is None: - return self._structural_unavailable( - "node", - entry, - node_id=node_id, - message=( - "PageIndex structure is not cached for this file in the " - "PageIndexClient workspace." - ), - ) - client._ensure_doc_loaded(doc_id) - doc = client.documents.get(doc_id, {}) - node = find_pageindex_node(doc.get("structure", []), node_id) - if node is None: - return self._structural_unavailable( - "node", - entry, - node_id=node_id, - message="PageIndex node was not found in the cached structure.", - ) - text = str(node.get("text") or "") - if not text: - location = first_node_location(node) - if location: - content = self._client_json(client.get_page_content(doc_id, location)) - if isinstance(content, list): - text = "\n\n".join(str(page.get("content") or "") for page in content) - if not text: - return self._structural_unavailable( - "node", - entry, - node_id=node_id, - message="Cached PageIndex node has no text content.", - ) - return { - "mode": "node", - "file_ref": file_ref, - "external_id": entry.external_id, - "source_path": entry.source_path, - "status": entry.pageindex_tree_status, - "available": True, - "pageindex_doc_id": doc_id, - "node_id": node_id, - "node": strip_pageindex_text_fields(node), - "text": text, + "structure": strip_pageindex_text_fields(structure), } def pageindex_pages(self, target: str, pages: str) -> dict[str, Any]: @@ -970,8 +907,7 @@ class PageIndexFileSystem: f"{command} is only supported for txt/text files; " f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. " "Use cat --structure, " - "cat --page, or " - "cat --node for PDF/Markdown PageIndex files." + "or cat --page for PDF/Markdown PageIndex files." ) def _require_pageindex_document_file(self, entry: Any, command: str) -> None: @@ -1748,7 +1684,6 @@ class PageIndexFileSystem: entry: Any, *, message: str, - node_id: str | None = None, pages: str | None = None, ) -> dict[str, Any]: pageindex_tree_error = cls._pageindex_tree_failure_message(entry.metadata_status) @@ -1765,8 +1700,6 @@ class PageIndexFileSystem: } if pageindex_tree_error: result["pageindex_tree_error"] = pageindex_tree_error - if node_id is not None: - result["node_id"] = node_id if pages is not None: result["pages"] = pages return result diff --git a/pageindex/filesystem/projection_indexing.py b/pageindex/filesystem/projection_indexing.py deleted file mode 100644 index 3375f41..0000000 --- a/pageindex/filesystem/projection_indexing.py +++ /dev/null @@ -1,179 +0,0 @@ -from __future__ import annotations - -from pathlib import Path -from typing import Any - -from .core import DEFAULT_EMBEDDING_DIMENSIONS -from .hybrid_projection import ( - EmbeddingCache, - INDEX_BY_CHANNEL, - embedding_cache_model_key, - make_embedder, -) -from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexRecord - - -class SummaryProjectionIndexer: - """Synchronous register-time summary projection indexer.""" - - def __init__( - self, - index_dir: str | Path, - *, - embedder: Any, - embedding_provider: str, - embedding_model: str, - embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, - embedding_cache_path: str | Path | None = None, - ) -> None: - self.index_dir = Path(index_dir).expanduser() - self.index_dir.mkdir(parents=True, exist_ok=True) - self.embedder = embedder - self.embedding_provider = embedding_provider - self.embedding_model = embedding_model - self.embedding_dimensions = embedding_dimensions - self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions) - self.embedding_cache = EmbeddingCache( - Path(embedding_cache_path).expanduser() - if embedding_cache_path is not None - else self.index_dir / "embedding_cache.sqlite" - ) - self.index = SQLiteVecSemanticIndex( - self.index_dir / f"{INDEX_BY_CHANNEL['summary']}.sqlite" - ) - self._ensure_index() - - @classmethod - def from_provider( - cls, - index_dir: str | Path, - *, - embedding_provider: str = "openai", - embedding_model: str = "text-embedding-3-small", - embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, - embedding_timeout: float = 60, - **kwargs: Any, - ) -> "SummaryProjectionIndexer": - cls._validate_existing_index_dimension(index_dir, embedding_dimensions) - return cls( - index_dir, - embedder=make_embedder( - embedding_provider, - embedding_model, - dimensions=embedding_dimensions, - timeout=embedding_timeout, - ), - embedding_provider=embedding_provider, - embedding_model=embedding_model, - embedding_dimensions=embedding_dimensions, - **kwargs, - ) - - def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]: - summary = str((record.get("metadata") or {}).get("summary") or "").strip() - if not summary: - return {"status": "skipped", "reason": "missing_summary"} - vector = self.embedding_cache.embed_texts( - [summary], - provider=self.embedding_provider, - model=self.cache_model, - embedder=self.embedder, - batch_size=1, - )[0] - metadata = dict(record.get("metadata") or {}) - count = self.index.upsert_many( - [ - SemanticIndexRecord( - file_ref=str(record["file_ref"]), - vector=vector, - text=summary, - external_id=record.get("external_id"), - source_type=str(record.get("source_type") or ""), - source_path=str(record.get("source_path") or ""), - title=str(record.get("title") or ""), - metadata=metadata, - ) - ] - ) - return { - "status": "ready", - "indexed_rows": count, - "index_path": str(self.index.db_path), - "embedding_provider": self.embedding_provider, - "embedding_model": self.embedding_model, - "embedding_dimensions": self.embedding_dimensions, - } - - def delete_summary(self, file_ref: str) -> int: - return self.index.delete_file_refs([file_ref]) - - def _ensure_index(self) -> None: - if not self.index.db_path.exists(): - self.index.reset( - dimension=self.embedding_dimensions, - metadata=self._index_metadata(), - ) - return - try: - existing_dimension = self.index.dimension() - except Exception as exc: - raise RuntimeError( - "could not validate existing summary projection index config; " - f"refusing to reset {self.index.db_path}. Move the existing index " - "aside or rebuild it intentionally before changing embedding config." - ) from exc - if existing_dimension != self.embedding_dimensions: - raise self._dimension_mismatch_error( - self.index.db_path, - existing_dimension, - self.embedding_dimensions, - ) - - def _index_metadata(self) -> dict[str, Any]: - return { - "channel": "summary", - "embedding_provider": self.embedding_provider, - "embedding_model": self.embedding_model, - "embedding_dimensions": self.embedding_dimensions, - } - - @classmethod - def _validate_existing_index_dimension( - cls, - index_dir: str | Path, - embedding_dimensions: int, - ) -> None: - index_path = ( - Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite" - ) - if not index_path.exists(): - return - index = SQLiteVecSemanticIndex(index_path) - try: - existing_dimension = index.dimension() - except Exception as exc: - raise RuntimeError( - "could not validate existing summary projection index config; " - f"refusing to reset {index_path}. Move the existing index " - "aside or rebuild it intentionally before changing embedding config." - ) from exc - if existing_dimension != embedding_dimensions: - raise cls._dimension_mismatch_error( - index_path, - existing_dimension, - embedding_dimensions, - ) - - @staticmethod - def _dimension_mismatch_error( - index_path: Path, - existing_dimension: int, - embedding_dimensions: int, - ) -> RuntimeError: - return RuntimeError( - "summary projection index dimension mismatch: " - f"{index_path} was built with dimension {existing_dimension}, " - f"but configured embedding_dimensions is {embedding_dimensions}. " - "Use the matching embedding config, or rebuild the projection index " - "at a new path after preserving the existing data." - ) diff --git a/pageindex/filesystem/hybrid_projection.py b/pageindex/filesystem/semantic_projection.py similarity index 68% rename from pageindex/filesystem/hybrid_projection.py rename to pageindex/filesystem/semantic_projection.py index 348aa20..6059a9b 100644 --- a/pageindex/filesystem/hybrid_projection.py +++ b/pageindex/filesystem/semantic_projection.py @@ -11,7 +11,12 @@ from pathlib import Path from typing import Any from .core import DEFAULT_EMBEDDING_DIMENSIONS -from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, SemanticSearchResult +from .semantic_index import ( + SQLiteVecSemanticIndex, + SemanticIndexError, + SemanticIndexRecord, + SemanticSearchResult, +) INDEX_BY_CHANNEL = { @@ -29,7 +34,7 @@ class QueryProjection: @dataclass(frozen=True) -class HybridProjectionCandidate: +class SemanticProjectionCandidate: document_id: str score: float sources: list[dict[str, Any]] @@ -40,7 +45,7 @@ class HybridProjectionCandidate: snippet: str -class HybridProjectionSearchBackend: +class SemanticProjectionSearchBackend: """Semantic channel retrieval over rebuildable projection indexes. The SQLite catalog remains the source of truth. This backend only reads @@ -86,7 +91,7 @@ class HybridProjectionSearchBackend: embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_timeout: float = 60, **kwargs: Any, - ) -> "HybridProjectionSearchBackend": + ) -> "SemanticProjectionSearchBackend": return cls( index_dir, embedder=make_embedder( @@ -108,7 +113,7 @@ class HybridProjectionSearchBackend: *, limit: int = 10, filters: dict[str, Any] | None = None, - ) -> list[HybridProjectionCandidate]: + ) -> list[SemanticProjectionCandidate]: if channel not in SEMANTIC_TOOL_CHANNELS: raise ValueError(f"unsupported semantic channel: {channel}") if channel not in self.available_channels(): @@ -180,6 +185,172 @@ class HybridProjectionSearchBackend: return {**info, "available": int(info.get("document_count") or 0) > 0} +class SummaryProjectionIndexer: + """Synchronous register-time summary projection indexer.""" + + def __init__( + self, + index_dir: str | Path, + *, + embedder: Any, + embedding_provider: str, + embedding_model: str, + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, + embedding_cache_path: str | Path | None = None, + ) -> None: + self.index_dir = Path(index_dir).expanduser() + self.index_dir.mkdir(parents=True, exist_ok=True) + self.embedder = embedder + self.embedding_provider = embedding_provider + self.embedding_model = embedding_model + self.embedding_dimensions = embedding_dimensions + self.cache_model = embedding_cache_model_key(embedding_model, embedding_dimensions) + self.embedding_cache = EmbeddingCache( + Path(embedding_cache_path).expanduser() + if embedding_cache_path is not None + else self.index_dir / "embedding_cache.sqlite" + ) + self.index = SQLiteVecSemanticIndex( + self.index_dir / f"{INDEX_BY_CHANNEL['summary']}.sqlite" + ) + self._ensure_index() + + @classmethod + def from_provider( + cls, + index_dir: str | Path, + *, + embedding_provider: str = "openai", + embedding_model: str = "text-embedding-3-small", + embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, + embedding_timeout: float = 60, + **kwargs: Any, + ) -> "SummaryProjectionIndexer": + cls._validate_existing_index_dimension(index_dir, embedding_dimensions) + return cls( + index_dir, + embedder=make_embedder( + embedding_provider, + embedding_model, + dimensions=embedding_dimensions, + timeout=embedding_timeout, + ), + embedding_provider=embedding_provider, + embedding_model=embedding_model, + embedding_dimensions=embedding_dimensions, + **kwargs, + ) + + def upsert_summary(self, record: dict[str, Any]) -> dict[str, Any]: + summary = str((record.get("metadata") or {}).get("summary") or "").strip() + if not summary: + return {"status": "skipped", "reason": "missing_summary"} + vector = self.embedding_cache.embed_texts( + [summary], + provider=self.embedding_provider, + model=self.cache_model, + embedder=self.embedder, + batch_size=1, + )[0] + metadata = dict(record.get("metadata") or {}) + count = self.index.upsert_many( + [ + SemanticIndexRecord( + file_ref=str(record["file_ref"]), + vector=vector, + text=summary, + external_id=record.get("external_id"), + source_type=str(record.get("source_type") or ""), + source_path=str(record.get("source_path") or ""), + title=str(record.get("title") or ""), + metadata=metadata, + ) + ] + ) + return { + "status": "ready", + "indexed_rows": count, + "index_path": str(self.index.db_path), + "embedding_provider": self.embedding_provider, + "embedding_model": self.embedding_model, + "embedding_dimensions": self.embedding_dimensions, + } + + def delete_summary(self, file_ref: str) -> int: + return self.index.delete_file_refs([file_ref]) + + def _ensure_index(self) -> None: + if not self.index.db_path.exists(): + self.index.reset( + dimension=self.embedding_dimensions, + metadata=self._index_metadata(), + ) + return + try: + existing_dimension = self.index.dimension() + except Exception as exc: + raise RuntimeError( + "could not validate existing summary projection index config; " + f"refusing to reset {self.index.db_path}. Move the existing index " + "aside or rebuild it intentionally before changing embedding config." + ) from exc + if existing_dimension != self.embedding_dimensions: + raise self._dimension_mismatch_error( + self.index.db_path, + existing_dimension, + self.embedding_dimensions, + ) + + def _index_metadata(self) -> dict[str, Any]: + return { + "channel": "summary", + "embedding_provider": self.embedding_provider, + "embedding_model": self.embedding_model, + "embedding_dimensions": self.embedding_dimensions, + } + + @classmethod + def _validate_existing_index_dimension( + cls, + index_dir: str | Path, + embedding_dimensions: int, + ) -> None: + index_path = ( + Path(index_dir).expanduser() / f"{INDEX_BY_CHANNEL['summary']}.sqlite" + ) + if not index_path.exists(): + return + index = SQLiteVecSemanticIndex(index_path) + try: + existing_dimension = index.dimension() + except Exception as exc: + raise RuntimeError( + "could not validate existing summary projection index config; " + f"refusing to reset {index_path}. Move the existing index " + "aside or rebuild it intentionally before changing embedding config." + ) from exc + if existing_dimension != embedding_dimensions: + raise cls._dimension_mismatch_error( + index_path, + existing_dimension, + embedding_dimensions, + ) + + @staticmethod + def _dimension_mismatch_error( + index_path: Path, + existing_dimension: int, + embedding_dimensions: int, + ) -> RuntimeError: + return RuntimeError( + "summary projection index dimension mismatch: " + f"{index_path} was built with dimension {existing_dimension}, " + f"but configured embedding_dimensions is {embedding_dimensions}. " + "Use the matching embedding config, or rebuild the projection index " + "at a new path after preserving the existing data." + ) + + class EmbeddingCache: def __init__(self, db_path: Path): self.db_path = db_path @@ -308,8 +479,8 @@ def query_text_for_channel(channel: str, query: str, projection: QueryProjection def rank_single_semantic_channel( channel: str, results: list[SemanticSearchResult], -) -> list[HybridProjectionCandidate]: - rows: list[HybridProjectionCandidate] = [] +) -> list[SemanticProjectionCandidate]: + rows: list[SemanticProjectionCandidate] = [] seen: set[str] = set() for rank, result in enumerate(results, 1): doc_id = str(result.external_id or result.file_ref) @@ -317,7 +488,7 @@ def rank_single_semantic_channel( continue seen.add(doc_id) rows.append( - HybridProjectionCandidate( + SemanticProjectionCandidate( document_id=doc_id, score=1 / (60 + rank), sources=[{"channel": channel, "rank": rank, "distance": result.distance}], diff --git a/pageindex/filesystem/structural_read.py b/pageindex/filesystem/structural_read.py deleted file mode 100644 index aca2bcd..0000000 --- a/pageindex/filesystem/structural_read.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import annotations - -from copy import deepcopy -from typing import Any - - -def strip_pageindex_text_fields(value: Any) -> Any: - if isinstance(value, list): - return [strip_pageindex_text_fields(item) for item in value] - if isinstance(value, dict): - return { - key: strip_pageindex_text_fields(item) - for key, item in value.items() - if key != "text" - } - return value - - -def flatten_pageindex_structure_nodes(structure: Any) -> list[dict[str, Any]]: - rows: list[dict[str, Any]] = [] - - def visit(value: Any, *, depth: int, parent_node_id: str | None) -> None: - if isinstance(value, list): - for item in value: - visit(item, depth=depth, parent_node_id=parent_node_id) - return - if not isinstance(value, dict): - return - - node_id = value.get("node_id") - child_values: list[Any] = [] - for child_key in ("nodes", "children"): - children = value.get(child_key) - if isinstance(children, list): - child_values.extend(children) - - row = { - key: strip_pageindex_text_fields(item) - for key, item in value.items() - if key not in {"text", "nodes", "children"} - } - row["depth"] = depth - row["children_count"] = len(child_values) - if parent_node_id: - row["parent_node_id"] = parent_node_id - rows.append(row) - - next_parent = str(node_id) if node_id is not None else parent_node_id - for child in child_values: - visit(child, depth=depth + 1, parent_node_id=next_parent) - - visit(structure, depth=0, parent_node_id=None) - return rows - - -def find_pageindex_node(structure: Any, node_id: str) -> dict[str, Any] | None: - if isinstance(structure, dict): - if str(structure.get("node_id", "")) == str(node_id): - return deepcopy(structure) - for child_key in ("nodes", "children"): - found = find_pageindex_node(structure.get(child_key), node_id) - if found is not None: - return found - if isinstance(structure, list): - for item in structure: - found = find_pageindex_node(item, node_id) - if found is not None: - return found - return None - - -def first_node_location(node: dict[str, Any]) -> str | None: - for key in ("line_num", "physical_index", "start_index"): - value = node.get(key) - if value is not None and value != "": - return str(value) - return None diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py index b5b9491..0ce9f39 100644 --- a/tests/test_pageindex_filesystem_scope.py +++ b/tests/test_pageindex_filesystem_scope.py @@ -7,7 +7,7 @@ import pytest def test_filesystem_lazy_exports_remain_public(): import pageindex.filesystem as filesystem from pageindex.filesystem import ( - HybridProjectionSearchBackend, + SemanticProjectionSearchBackend, RebuildableSemanticIndex, SemanticIndexRecord, SemanticSearchResult, @@ -16,7 +16,7 @@ def test_filesystem_lazy_exports_remain_public(): ) for name in ( - "HybridProjectionSearchBackend", + "SemanticProjectionSearchBackend", "RebuildableSemanticIndex", "SemanticIndexRecord", "SemanticSearchResult", @@ -26,7 +26,7 @@ def test_filesystem_lazy_exports_remain_public(): assert name in filesystem.__all__ assert name in dir(filesystem) - assert HybridProjectionSearchBackend.__name__ == "HybridProjectionSearchBackend" + assert SemanticProjectionSearchBackend.__name__ == "SemanticProjectionSearchBackend" assert RebuildableSemanticIndex.__name__ == "RebuildableSemanticIndex" assert SemanticIndexRecord.__name__ == "SemanticIndexRecord" assert SemanticSearchResult.__name__ == "SemanticSearchResult" @@ -819,7 +819,7 @@ def test_existing_summary_projection_index_uses_current_config_when_dimensions_m monkeypatch.setattr( filesystem, - "configure_hybrid_projection_retrieval", + "configure_semantic_projection_retrieval", fake_configure, ) @@ -876,7 +876,7 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval( monkeypatch.setattr( filesystem, - "configure_hybrid_projection_retrieval", + "configure_semantic_projection_retrieval", fail_configure, ) @@ -892,9 +892,9 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval( def test_browse_semantic_files_uses_summary_projection_when_only_summary_available(tmp_path): from pageindex.filesystem import PageIndexFileSystem - from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend + from pageindex.filesystem.semantic_projection import SemanticProjectionSearchBackend from pageindex.filesystem.metadata_generation import MetadataGenerationResult - from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer class FixedEmbedder: def embed(self, texts): @@ -916,7 +916,7 @@ def test_browse_semantic_files_uses_summary_projection_when_only_summary_availab embedding_model="fake", embedding_dimensions=3, ) - backend = HybridProjectionSearchBackend( + backend = SemanticProjectionSearchBackend( index_dir, embedder=FixedEmbedder(), embedding_provider="test", diff --git a/tests/test_pifs_add_command.py b/tests/test_pifs_add_command.py index 1679431..4161b80 100644 --- a/tests/test_pifs_add_command.py +++ b/tests/test_pifs_add_command.py @@ -25,7 +25,7 @@ class StaticEmbedder: def make_summary_indexer(workspace: Path): - from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer return SummaryProjectionIndexer( workspace / "artifacts" / "projection_indexes", diff --git a/tests/test_pifs_cli.py b/tests/test_pifs_cli.py index 3437a3c..405f5b9 100644 --- a/tests/test_pifs_cli.py +++ b/tests/test_pifs_cli.py @@ -37,7 +37,7 @@ def test_cli_workspace_without_projection_index_does_not_require_sqlite_vec( workspace = tmp_path / "workspace" real_import = builtins.__import__ - monkeypatch.delitem(sys.modules, "pageindex.filesystem.hybrid_projection", raising=False) + monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_projection", raising=False) monkeypatch.delitem(sys.modules, "pageindex.filesystem.semantic_index", raising=False) monkeypatch.delitem(sys.modules, "sqlite_vec", raising=False) diff --git a/tests/test_semantic_index.py b/tests/test_semantic_index.py index 6cdc0e1..c1da0dc 100644 --- a/tests/test_semantic_index.py +++ b/tests/test_semantic_index.py @@ -102,7 +102,7 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm def test_summary_projection_indexes_unified_metadata_summary(tmp_path): - from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer indexer = SummaryProjectionIndexer( tmp_path / "projection", @@ -134,7 +134,7 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path): def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path): - from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer indexer = SummaryProjectionIndexer( tmp_path / "projection", @@ -164,7 +164,7 @@ def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path): def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path): - from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer indexer = SummaryProjectionIndexer( tmp_path / "projection", @@ -188,7 +188,7 @@ def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path): def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_path): - from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer index_dir = tmp_path / "projection" index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") @@ -216,8 +216,8 @@ def test_summary_projection_default_rejects_existing_256_index_for_writes(tmp_pa def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embedder( tmp_path, monkeypatch ): - from pageindex.filesystem import projection_indexing - from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + from pageindex.filesystem import semantic_projection + from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer index_dir = tmp_path / "projection" index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") @@ -234,14 +234,14 @@ def test_summary_projection_from_provider_rejects_dimension_mismatch_before_embe def fail_make_embedder(*args, **kwargs): raise AssertionError("embedder should not be constructed before dimension validation") - monkeypatch.setattr(projection_indexing, "make_embedder", fail_make_embedder) + monkeypatch.setattr(semantic_projection, "make_embedder", fail_make_embedder) with pytest.raises(RuntimeError, match="configured embedding_dimensions is 1024"): SummaryProjectionIndexer.from_provider(index_dir) def test_embedding_cache_key_separates_model_dimensions(tmp_path): - from pageindex.filesystem.hybrid_projection import ( + from pageindex.filesystem.semantic_projection import ( EmbeddingCache, embedding_cache_model_key, ) @@ -285,7 +285,7 @@ def test_embedding_cache_key_separates_model_dimensions(tmp_path): def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path): - from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer + from pageindex.filesystem.semantic_projection import SummaryProjectionIndexer index_dir = tmp_path / "projection" index = SQLiteVecSemanticIndex(index_dir / "summary_only_vector.sqlite") @@ -328,7 +328,7 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path def test_hash_embedding_provider_is_not_available(): - from pageindex.filesystem.hybrid_projection import make_embedder + from pageindex.filesystem.semantic_projection import make_embedder with pytest.raises(ValueError, match="unknown embedding provider: hash"): make_embedder("hash", "unused", dimensions=256, timeout=1)