refactor(pifs): remove dead semantic retrieval code (#33)

This commit is contained in:
Bukely_ 2026-05-31 22:22:05 +08:00 committed by GitHub
parent d3034fa1b9
commit 0f71da3bc1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 24 additions and 691 deletions

View file

@ -18,7 +18,7 @@ class PIFSCommandError(ValueError):
class PIFSCommandExecutor: class PIFSCommandExecutor:
FORBIDDEN_SUBSTRINGS = (";", "`", "$(", "||", "\n", "\r") FORBIDDEN_SUBSTRINGS = (";", "`", "$(", "||", "\n", "\r")
FORBIDDEN_TOKENS = {"|", ">", "<", ">>", "<<", "&"} FORBIDDEN_TOKENS = {"|", ">", "<", ">>", "<<", "&"}
BASE_ALLOWED_COMMANDS = { COMMAND_NAMES = {
"ls", "ls",
"tree", "tree",
"find", "find",
@ -30,9 +30,7 @@ class PIFSCommandExecutor:
"tail", "tail",
"sed", "sed",
} }
ALLOWED_COMMANDS = BASE_ALLOWED_COMMANDS
ALLOWED_PIPE_FILTERS = {"head", "tail", "grep", "sed"} ALLOWED_PIPE_FILTERS = {"head", "tail", "grep", "sed"}
COMMAND_METHODS = {}
MAX_CHAINED_COMMANDS = 3 MAX_CHAINED_COMMANDS = 3
MAX_PIPE_COMMANDS = 3 MAX_PIPE_COMMANDS = 3
MAX_LS_LIMIT = 100 MAX_LS_LIMIT = 100
@ -65,7 +63,7 @@ class PIFSCommandExecutor:
self.query_context = query_context self.query_context = query_context
def allowed_commands(self) -> set[str]: def allowed_commands(self) -> set[str]:
return set(self.BASE_ALLOWED_COMMANDS) return set(self.COMMAND_NAMES)
def command_capabilities(self) -> dict[str, Any]: def command_capabilities(self) -> dict[str, Any]:
return { return {
@ -149,8 +147,7 @@ class PIFSCommandExecutor:
name = tokens[0] name = tokens[0]
if name not in self.allowed_commands(): if name not in self.allowed_commands():
raise PIFSCommandError(f"Unsupported command: {name}") raise PIFSCommandError(f"Unsupported command: {name}")
method_name = self.COMMAND_METHODS.get(name, f"_cmd_{name}") data = getattr(self, f"_cmd_{name}")(tokens[1:])
data = getattr(self, method_name)(tokens[1:])
return self._render(data, json_output=json_output, command_name=name) return self._render(data, json_output=json_output, command_name=name)
def _execute_pipe_filter(self, input_text: str, command: str) -> str: def _execute_pipe_filter(self, input_text: str, command: str) -> str:
@ -375,7 +372,6 @@ class PIFSCommandExecutor:
scope=scope, scope=scope,
metadata_filter=where, metadata_filter=where,
limit=limit, limit=limit,
semantic=False,
) )
def _cmd_grep(self, args: list[str]) -> Any: def _cmd_grep(self, args: list[str]) -> Any:
@ -423,7 +419,6 @@ class PIFSCommandExecutor:
scope={"folder_path": normalized, "recursive": False}, scope={"folder_path": normalized, "recursive": False},
metadata_filter=where, metadata_filter=where,
limit=limit, limit=limit,
semantic=False,
) )
if direct_results: if direct_results:
return { return {
@ -471,7 +466,6 @@ class PIFSCommandExecutor:
scope={"folder_path": normalized, "recursive": recursive}, scope={"folder_path": normalized, "recursive": recursive},
metadata_filter=where, metadata_filter=where,
limit=limit, limit=limit,
semantic=False,
) )
if not results and where is None: if not results and where is None:
source_hits = self._grep_source_file_hits(normalized, query, limit=limit) source_hits = self._grep_source_file_hits(normalized, query, limit=limit)
@ -1240,23 +1234,6 @@ class PIFSCommandExecutor:
return f"{folder}/{title}" if folder else f"/{title}" return f"{folder}/{title}" if folder else f"/{title}"
return str(item.get("source_path") or item.get("external_id") or file_ref or "-") return str(item.get("source_path") or item.get("external_id") or file_ref or "-")
def _stable_file_target_path(self, item: dict[str, Any]) -> str:
file_ref = str(item.get("file_ref") or "").strip()
source_path = str(item.get("source_path") or "").strip()
if source_path:
target = "/" + source_path.strip("/")
try:
if not file_ref or self.filesystem.store.resolve_file_ref(target) == file_ref:
return target
except KeyError:
pass
external_id = str(item.get("external_id") or "").strip()
if external_id:
return external_id
if file_ref:
return file_ref
return str(item.get("external_id") or item.get("file_ref") or "-")
def _semantic_retrieval_query(self, query: str) -> str: def _semantic_retrieval_query(self, query: str) -> str:
query = str(query or "").strip() query = str(query or "").strip()
context = str(self.query_context or "").strip() context = str(self.query_context or "").strip()
@ -1326,7 +1303,6 @@ class PIFSCommandExecutor:
scope={"folder_path": child["path"], "recursive": True}, scope={"folder_path": child["path"], "recursive": True},
metadata_filter=metadata_filter, metadata_filter=metadata_filter,
limit=max(limit, 50), limit=max(limit, 50),
semantic=False,
) )
if not results: if not results:
continue continue

View file

@ -17,14 +17,6 @@ from .metadata_generation import (
MetadataGenerator, MetadataGenerator,
) )
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
from .semantic_folder_policy import (
SEMANTIC_FOLDER_BASE_FIELDS,
SEMANTIC_FOLDER_ROOT,
SEMANTIC_FOLDER_SYSTEM_FIELDS,
canonical_semantic_folder_field_name,
is_semantic_folder_forbidden_field,
semantic_folder_allowed_extension_fields,
)
from .store import ( from .store import (
SQLiteFileSystemStore, SQLiteFileSystemStore,
fingerprint, fingerprint,
@ -571,8 +563,7 @@ class PageIndexFileSystem:
) )
offset = (page - 1) * page_size offset = (page - 1) * page_size
needed = offset + page_size + 1 needed = offset + page_size + 1
semantic_filters = self._semantic_filters_for_scope(scope) semantic_filters = {"file_ref": scope_file_refs}
semantic_filters["file_ref"] = scope_file_refs
candidates = ( candidates = (
search_channel( search_channel(
space, space,
@ -695,95 +686,14 @@ class PageIndexFileSystem:
def attach_files_to_folders(self, items: list[dict[str, Any]]) -> None: def attach_files_to_folders(self, items: list[dict[str, Any]]) -> None:
self.store.attach_files_to_folders(items) self.store.attach_files_to_folders(items)
def apply_semantic_folder_projection(
    self,
    projection_plan: dict[str, Any],
    *,
    file_ref_by_document_id: Optional[dict[str, str]] = None,
) -> dict[str, Any]:
    """Mount already-registered files under a Semantic Folder Projection.

    Registration stays the explicit folder placement step; this call only
    adds the derived `/semantic/...` memberships described by
    ``projection_plan`` (its ``folders``, ``memberships`` and optional
    ``policy`` entries). Every folder and membership is validated before
    any folder is created. Returns a small summary with the number of
    folders applied and memberships attached.
    """
    plan_folders = list(projection_plan.get("folders") or [])
    plan_memberships = list(projection_plan.get("memberships") or [])
    raw_policy = projection_plan.get("policy")
    if not isinstance(raw_policy, dict):
        raw_policy = {}
    extension_fields = semantic_folder_allowed_extension_fields(
        raw_policy.get("allowed_extension_fields", [])
    )

    # Validate the whole plan first so a bad entry creates nothing.
    for plan_item in [*plan_folders, *plan_memberships]:
        self._validate_semantic_folder_projection_item(plan_item, extension_fields)

    for spec in plan_folders:
        spec_metadata = spec.get("metadata")
        if not isinstance(spec_metadata, dict):
            spec_metadata = {}
        self.create_folder(
            self._validate_semantic_folder_projection_path(str(spec["path"])),
            kind=str(spec.get("kind") or "semantic_projection"),
            description=str(spec.get("description") or ""),
            metadata=spec_metadata,
        )

    known_refs = file_ref_by_document_id or {}
    attachments: list[dict[str, Any]] = []
    for member in plan_memberships:
        document_id = self._semantic_folder_projection_document_id(member)
        # Prefer the caller-supplied mapping; otherwise ask the store.
        file_ref = known_refs.get(document_id) or self.store.resolve_file_ref(document_id)
        inherited = member.get("folder_metadata")
        mount_metadata = dict(inherited) if isinstance(inherited, dict) else {}
        mount_metadata["projection"] = "Semantic Folder Projection"
        mount_metadata["field"] = member.get("field", "")
        mount_metadata["value"] = member.get("value", "")
        mount_metadata["mount_kind"] = member.get(
            "mount_kind",
            "semantic_folder_projection",
        )
        attachments.append(
            {
                "file_ref": file_ref,
                "folder": self._validate_semantic_folder_projection_path(
                    str(member["folder_path"])
                ),
                "metadata": mount_metadata,
            }
        )

    self.attach_files_to_folders(attachments)
    return {
        "projection": "Semantic Folder Projection",
        "folders_applied": len(plan_folders),
        "memberships_attached": len(attachments),
    }
def search( def search(
self, self,
query: Union[str, list[str], None] = None, query: Union[str, list[str], None] = None,
scope: Optional[dict[str, Any]] = None, scope: Optional[dict[str, Any]] = None,
metadata_filter: Optional[dict[str, Any] | str] = None, metadata_filter: Optional[dict[str, Any] | str] = None,
limit: int = 10, limit: int = 10,
semantic: bool = True,
) -> list[SearchResult]: ) -> list[SearchResult]:
parsed_filter = self.metadata.parse_filter(metadata_filter) parsed_filter = self.metadata.parse_filter(metadata_filter)
if semantic and self._should_use_semantic_retrieval(query, scope):
semantic_results = self._semantic_search(
query,
scope=scope,
metadata_filter=parsed_filter,
limit=limit,
)
if semantic_results:
return semantic_results
rows = self.store.search_files( rows = self.store.search_files(
query, query,
scope=scope, scope=scope,
@ -821,30 +731,6 @@ class PageIndexFileSystem:
) )
return results return results
def search_semantic_channel(
    self,
    channel: str,
    query: Union[str, list[str], None],
    *,
    scope: Optional[dict[str, Any]] = None,
    metadata_filter: Optional[dict[str, Any] | str] = None,
    limit: int = 10,
) -> list[SearchResult]:
    """Run a semantic search restricted to one named channel.

    The metadata filter is always parsed first (so a malformed filter is
    reported even when no search runs). An empty list is returned when no
    semantic backend is configured, the channel is not available, or the
    query has no text.
    """
    parsed_filter = self.metadata.parse_filter(metadata_filter)
    searchable = (
        self.semantic_retrieval_backend is not None
        and self.has_semantic_channel(channel)
        and self._query_text(query)
    )
    if searchable:
        return self._semantic_search(
            query,
            scope=scope,
            metadata_filter=parsed_filter,
            limit=limit,
            channel=channel,
        )
    return []
def configure_hybrid_projection_retrieval( def configure_hybrid_projection_retrieval(
self, self,
index_dir: Union[str, Path], index_dir: Union[str, Path],
@ -853,7 +739,6 @@ class PageIndexFileSystem:
embedding_model: str = "text-embedding-3-small", embedding_model: str = "text-embedding-3-small",
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_timeout: float = 60, embedding_timeout: float = 60,
per_channel_limit: int = 100,
fetch_multiplier: int = 100, fetch_multiplier: int = 100,
) -> Any: ) -> Any:
from .hybrid_projection import HybridProjectionSearchBackend from .hybrid_projection import HybridProjectionSearchBackend
@ -864,7 +749,6 @@ class PageIndexFileSystem:
embedding_model=embedding_model, embedding_model=embedding_model,
embedding_dimensions=embedding_dimensions, embedding_dimensions=embedding_dimensions,
embedding_timeout=embedding_timeout, embedding_timeout=embedding_timeout,
per_channel_limit=per_channel_limit,
fetch_multiplier=fetch_multiplier, fetch_multiplier=fetch_multiplier,
) )
return self.semantic_retrieval_backend return self.semantic_retrieval_backend
@ -905,30 +789,6 @@ class PageIndexFileSystem:
}, },
} }
def find(
    self,
    target: str,
    patterns: Union[str, list[str]],
    limit: int = 20,
) -> list[OpenResult]:
    """Return context windows around lines of ``target`` matching a pattern.

    Matching is a case-insensitive substring test; a line matches when it
    contains any of the non-empty ``patterns``. Each match yields the
    result of ``_open_lines`` for the matching line plus one line of
    context on either side (clamped to the file). At most ``limit``
    windows are returned, and a non-positive ``limit`` yields no results.

    The target is resolved before anything else, so an unknown target
    still raises whatever ``_resolve_target`` raises.
    """
    file_ref = self._resolve_target(target)
    pattern_list = [patterns] if isinstance(patterns, str) else list(patterns)
    needles = [pattern.lower() for pattern in pattern_list if pattern]
    # Checking the limit only after appending would leak one match for
    # limit <= 0, so bail out before reading the file.
    if not needles or limit <= 0:
        return []
    lines = self.store.read_text(file_ref).splitlines()
    last_line = len(lines)
    matches = []
    for number, line in enumerate(lines, 1):
        haystack = line.lower()
        if not any(needle in haystack for needle in needles):
            continue
        matches.append(
            self._open_lines(file_ref, max(1, number - 1), min(last_line, number + 1))
        )
        if len(matches) >= limit:
            break
    return matches
def open(self, target: str, location: str = "all") -> OpenResult: def open(self, target: str, location: str = "all") -> OpenResult:
file_ref = self._resolve_target(target) file_ref = self._resolve_target(target)
entry = self.store.get_file(file_ref) entry = self.store.get_file(file_ref)
@ -1387,15 +1247,6 @@ class PageIndexFileSystem:
metadata = file.get("metadata") or {} metadata = file.get("metadata") or {}
if not isinstance(metadata, dict): if not isinstance(metadata, dict):
raise ValueError("metadata must be a JSON object") raise ValueError("metadata must be a JSON object")
legacy_value_key = "derived_" + "metadata"
legacy_policy_key = "metadata_" + "generation_policy"
legacy_status_key = "metadata_" + "generation_status"
if legacy_value_key in file:
raise ValueError("legacy generated metadata map has been removed; put values in metadata")
if legacy_policy_key in file:
raise ValueError("legacy metadata policy key has been renamed to metadata_policy")
if legacy_status_key in file:
raise ValueError("legacy metadata status key has been renamed to metadata_status")
self._validate_register_metadata(metadata) self._validate_register_metadata(metadata)
external_id = file.get("external_id") external_id = file.get("external_id")
content = file.get("content") or "" content = file.get("content") or ""
@ -1946,93 +1797,6 @@ class PageIndexFileSystem:
def _resolve_target(self, target: str) -> str: def _resolve_target(self, target: str) -> str:
return self.store.resolve_file_ref(target) return self.store.resolve_file_ref(target)
def _should_use_semantic_retrieval(
self,
query: Union[str, list[str], None],
scope: Optional[dict[str, Any]],
) -> bool:
if self.semantic_retrieval_backend is None:
return False
if not self._query_text(query):
return False
if not scope:
return True
return bool(scope.get("recursive", True))
def _semantic_search(
self,
query: Union[str, list[str], None],
*,
scope: Optional[dict[str, Any]],
metadata_filter: Optional[dict[str, Any]],
limit: int,
channel: str | None = None,
) -> list[SearchResult]:
if self.semantic_retrieval_backend is None:
return []
filters = self._semantic_filters_for_scope(scope)
fetch_limit = max(limit * 10, 50)
query_text = self._query_text(query)
if channel:
search_channel = getattr(self.semantic_retrieval_backend, "search_channel", None)
if search_channel is None:
return []
candidates = search_channel(
channel,
query_text,
limit=fetch_limit,
filters=filters,
)
else:
candidates = self.semantic_retrieval_backend.search(
query_text,
limit=fetch_limit,
filters=filters,
)
results: list[SearchResult] = []
seen: set[str] = set()
scope_path = self._scope_folder_path(scope)
for candidate in candidates:
try:
file_ref = self.store.resolve_file_ref(candidate.document_id)
except KeyError:
continue
if file_ref in seen:
continue
if not self.store.file_matches(file_ref, scope=scope, metadata_filter=metadata_filter):
continue
seen.add(file_ref)
entry = self.store.get_file(file_ref)
folder_paths = [
folder["path"]
for folder in self.store.folder_memberships(file_ref)
]
folder_path = self._preferred_folder_path(folder_paths, scope_path, entry.folder_path)
results.append(
SearchResult(
file_ref=file_ref,
external_id=entry.external_id,
title=entry.title,
snippet=candidate.snippet or entry.descriptor,
folder_path=folder_path,
folder_paths=folder_paths,
metadata=entry.metadata,
metadata_status=entry.metadata_status,
source_path=entry.source_path,
id=entry.external_id or file_ref,
document_id=entry.external_id,
name=entry.title,
description=entry.descriptor,
status=entry.pageindex_tree_status,
pageNum=None,
createdAt=None,
folderId=None,
)
)
if len(results) >= limit:
break
return results
@staticmethod @staticmethod
def _semantic_candidate_score(candidate: Any) -> float | None: def _semantic_candidate_score(candidate: Any) -> float | None:
try: try:
@ -2348,135 +2112,6 @@ class PageIndexFileSystem:
path = scope.get("folder_path") or scope.get("path") path = scope.get("folder_path") or scope.get("path")
return normalize_path(path) if path else None return normalize_path(path) if path else None
@classmethod
def _semantic_filters_for_scope(cls, scope: Optional[dict[str, Any]]) -> dict[str, Any]:
    """Translate a search scope into backend filters.

    Only a non-root scope folder that yields a source type produces a
    filter (``{"source_type": ...}``); every other scope yields ``{}``.
    """
    folder = cls._scope_folder_path(scope)
    if folder and folder != "/":
        source_type = cls._source_type_filter_from_path(folder)
        if source_type:
            return {"source_type": source_type}
    return {}
@staticmethod
def _source_type_filter_from_path(path: str) -> str:
    """Extract a source-type facet from the leading segment of ``path``.

    A leading semantic-root segment is skipped. If the next segment has
    the form ``source_type=<value>``, the value is returned with ``-``
    replaced by ``_``; in every other case the result is ``""``.
    """
    segments = [segment for segment in path.strip("/").split("/") if segment]
    if segments and segments[0] == SEMANTIC_FOLDER_ROOT.strip("/"):
        segments = segments[1:]
    if not segments:
        return ""
    first_segment = segments[0]
    if first_segment.startswith("source_type="):
        return first_segment.split("=", 1)[1].replace("-", "_")
    # Any other path, inside or outside the semantic root, carries no
    # source-type facet.
    return ""
@classmethod
def _validate_semantic_folder_projection_item(
    cls,
    item: dict[str, Any],
    allowed_extension_fields: set[str],
) -> None:
    """Reject a projection folder/membership that breaks the field policy.

    Checks, in order: a folder path is present and sits under the semantic
    root; ``dataset_doc_uuid`` is absent; the ``metadata`` and
    ``folder_metadata`` payloads are clean; and every field named by the
    item (explicitly or through ``field=value`` path segments) is allowed
    and not forbidden. Raises ``ValueError`` on the first violation.
    """
    folder_path = item.get("folder_path") or item.get("path")
    if not folder_path:
        raise ValueError("Semantic Folder Projection items must include a folder path")
    cls._validate_semantic_folder_projection_path(str(folder_path))

    permitted = (
        SEMANTIC_FOLDER_BASE_FIELDS
        | SEMANTIC_FOLDER_SYSTEM_FIELDS
        | allowed_extension_fields
    )
    if item.get("dataset_doc_uuid"):
        raise ValueError(
            "dataset_doc_uuid is not allowed in Semantic Folder Projection memberships; "
            "use file_key or file_ref"
        )

    declared = cls._canonical_semantic_folder_field_name(item.get("field"))
    named_fields = [declared] if declared else []
    named_fields += cls._semantic_folder_projection_fields_from_path(str(folder_path))

    for payload_key in ("metadata", "folder_metadata"):
        cls._validate_semantic_folder_projection_metadata_payload(
            item.get(payload_key),
            permitted,
        )

    for field in named_fields:
        if field not in permitted or is_semantic_folder_forbidden_field(field):
            # Re-check in the original precedence so the forbidden test
            # still runs for fields that are absent from the allow-list.
            raise ValueError(f"Field is not allowed for Semantic Folder Projection: {field}")
@staticmethod
def _validate_semantic_folder_projection_path(path: str) -> str:
    """Normalize ``path`` and require it to be the semantic root or below.

    Returns the normalized path; raises ``ValueError`` otherwise.
    """
    normalized = normalize_path(path)
    inside_root = normalized == SEMANTIC_FOLDER_ROOT or normalized.startswith(
        f"{SEMANTIC_FOLDER_ROOT}/"
    )
    if not inside_root:
        raise ValueError("Semantic Folder Projection paths must be under /semantic")
    return normalized
@classmethod
def _semantic_folder_projection_fields_from_path(cls, path: str) -> list[str]:
    """List the canonical field names used by ``field=value`` path segments.

    The path is validated first; its first segment (the root) is skipped,
    as are segments without ``=`` and names that canonicalize to ``""``.
    """
    normalized = cls._validate_semantic_folder_projection_path(path)
    below_root = normalized.strip("/").split("/")[1:]
    canonical_names = (
        cls._canonical_semantic_folder_field_name(segment.split("=", 1)[0])
        for segment in below_root
        if "=" in segment
    )
    return [name for name in canonical_names if name]
@classmethod
def _validate_semantic_folder_projection_metadata_payload(
    cls,
    payload: Any,
    allowed_fields: set[str],
) -> None:
    """Recursively scan a metadata payload for disallowed field references.

    * dict: every key must not be a forbidden field; values stored under
      ``field`` / ``source_field`` / ``metadata_field`` must name an
      allowed, non-forbidden field; every value is scanned recursively.
    * list: every element is scanned recursively.
    * str: the string itself must not be a forbidden field label.

    Other payload types are accepted as-is. Raises ``ValueError`` on the
    first violation.
    """
    if isinstance(payload, dict):
        for raw_key, raw_value in payload.items():
            key_label = str(raw_key)
            key_name = cls._canonical_semantic_folder_field_name(raw_key)
            if is_semantic_folder_forbidden_field(key_name):
                raise ValueError(
                    "Forbidden metadata field in Semantic Folder Projection payload: "
                    f"{key_label}"
                )
            if key_name in {"field", "source_field", "metadata_field"}:
                referenced = cls._canonical_semantic_folder_field_name(raw_value)
                if referenced and (
                    is_semantic_folder_forbidden_field(referenced)
                    or referenced not in allowed_fields
                ):
                    raise ValueError(
                        f"Field is not allowed for Semantic Folder Projection: {referenced}"
                    )
            cls._validate_semantic_folder_projection_metadata_payload(raw_value, allowed_fields)
        return

    if isinstance(payload, list):
        for element in payload:
            cls._validate_semantic_folder_projection_metadata_payload(element, allowed_fields)
        return

    if isinstance(payload, str):
        label = cls._canonical_semantic_folder_field_name(payload)
        if is_semantic_folder_forbidden_field(label):
            raise ValueError(
                "Forbidden metadata field label in Semantic Folder Projection payload: "
                f"{payload}"
            )
@staticmethod
def _canonical_semantic_folder_field_name(value: Any) -> str:
    """Return the canonical form of a field name.

    Thin class-level alias for the module-level
    ``canonical_semantic_folder_field_name`` helper.
    """
    return canonical_semantic_folder_field_name(value)
@staticmethod
def _semantic_folder_projection_document_id(membership: dict[str, Any]) -> str:
for key in ("file_key", "file_ref", "document_ref"):
value = str(membership.get(key) or "").strip()
if value:
return value
raise ValueError("Semantic Folder Projection membership is missing file_key or file_ref")
@staticmethod @staticmethod
def _query_text(query: Union[str, list[str], None]) -> str: def _query_text(query: Union[str, list[str], None]) -> str:
if query is None: if query is None:

View file

@ -15,28 +15,17 @@ from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, Semantic
INDEX_BY_CHANNEL = { INDEX_BY_CHANNEL = {
"metadata": "metadata_composite_vector",
"summary": "summary_only_vector", "summary": "summary_only_vector",
"entity": "entity_vectors", "entity": "entity_vectors",
"constraint": "constraint_vectors",
"relation": "relation_vectors", "relation": "relation_vectors",
} }
HYBRID_ENTITY_RELATION_CHANNELS = ("metadata", "entity", "constraint", "relation")
SEMANTIC_TOOL_CHANNELS = ("summary", "entity", "relation") SEMANTIC_TOOL_CHANNELS = ("summary", "entity", "relation")
HYBRID_ENTITY_RELATION_WEIGHTS = {
"metadata": 0.25,
"entity": 0.25,
"relation": 0.30,
"constraint": 0.20,
}
@dataclass(frozen=True) @dataclass(frozen=True)
class QueryProjection: class QueryProjection:
entities: list[str] entities: list[str]
relations: list[str] relations: list[str]
constraints: list[str]
expected_answer_type: str = ""
@dataclass(frozen=True) @dataclass(frozen=True)
@ -52,7 +41,7 @@ class HybridProjectionCandidate:
class HybridProjectionSearchBackend: class HybridProjectionSearchBackend:
"""Hybrid entity/relation/vector retrieval over rebuildable projection indexes. """Semantic channel retrieval over rebuildable projection indexes.
The SQLite catalog remains the source of truth. This backend only reads The SQLite catalog remains the source of truth. This backend only reads
external sqlite-vec projection indexes and returns candidate document ids external sqlite-vec projection indexes and returns candidate document ids
@ -68,7 +57,6 @@ class HybridProjectionSearchBackend:
embedding_model: str, embedding_model: str,
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_cache_path: str | Path | None = None, embedding_cache_path: str | Path | None = None,
per_channel_limit: int = 100,
fetch_multiplier: int = 100, fetch_multiplier: int = 100,
) -> None: ) -> None:
self.index_dir = Path(index_dir).expanduser() self.index_dir = Path(index_dir).expanduser()
@ -82,7 +70,6 @@ class HybridProjectionSearchBackend:
if embedding_cache_path is not None if embedding_cache_path is not None
else self.index_dir / "embedding_cache.sqlite" else self.index_dir / "embedding_cache.sqlite"
) )
self.per_channel_limit = per_channel_limit
self.fetch_multiplier = fetch_multiplier self.fetch_multiplier = fetch_multiplier
self.indexes = { self.indexes = {
channel: SQLiteVecSemanticIndex(self.index_dir / f"{index_name}.sqlite") channel: SQLiteVecSemanticIndex(self.index_dir / f"{index_name}.sqlite")
@ -114,35 +101,6 @@ class HybridProjectionSearchBackend:
**kwargs, **kwargs,
) )
def search(
    self,
    query: str,
    *,
    limit: int = 10,
    filters: dict[str, Any] | None = None,
) -> list[HybridProjectionCandidate]:
    """Rank documents by fusing the populated hybrid channels.

    The query is normalized first; an empty query returns no candidates.
    Only hybrid channels that currently hold documents are searched. If
    none do, the search degrades to the ``summary`` channel alone when
    that one has documents, and otherwise returns an empty list.
    """
    query = normalize_text(query)
    if not query:
        return []

    projection = heuristic_query_projection(query)
    populated = tuple(
        name
        for name in HYBRID_ENTITY_RELATION_CHANNELS
        if self._channel_document_count(name) > 0
    )
    if populated:
        per_channel_hits = self._search_channels(
            query=query,
            projection=projection,
            # Never fetch fewer rows per channel than the caller asked for.
            limit=max(limit, self.per_channel_limit),
            filters=filters,
            channels=populated,
        )
        return aggregate_hybrid_entity_relation(per_channel_hits, projection)[:limit]

    if self._channel_document_count("summary") > 0:
        return self.search_channel("summary", query, limit=limit, filters=filters)
    return []
def search_channel( def search_channel(
self, self,
channel: str, channel: str,
@ -187,7 +145,7 @@ class HybridProjectionSearchBackend:
"embedding_provider": self.embedding_provider, "embedding_provider": self.embedding_provider,
"embedding_model": self.embedding_model, "embedding_model": self.embedding_model,
"embedding_dimensions": self.embedding_dimensions, "embedding_dimensions": self.embedding_dimensions,
"strategy": "hybrid_entity_relation_vector", "strategy": "semantic_channel_vector",
"available_channels": list(self.available_channels()), "available_channels": list(self.available_channels()),
"channels": { "channels": {
channel: self._safe_channel_info(channel) channel: self._safe_channel_info(channel)
@ -221,36 +179,6 @@ class HybridProjectionSearchBackend:
} }
return {**info, "available": int(info.get("document_count") or 0) > 0} return {**info, "available": int(info.get("document_count") or 0) > 0}
def _search_channels(
    self,
    *,
    query: str,
    projection: QueryProjection,
    limit: int,
    filters: dict[str, Any] | None,
    channels: tuple[str, ...],
) -> dict[str, list[SemanticSearchResult]]:
    """Embed one query text per channel and search each channel's index.

    Returns a mapping from channel name to that channel's raw index hits.
    """
    channel_texts = [
        query_text_for_channel(name, query, projection) for name in channels
    ]
    # A single embed call for all channels; vectors come back in the same
    # order as ``channels``.
    vectors = self.embedding_cache.embed_texts(
        channel_texts,
        provider=self.embedding_provider,
        model=self.cache_model,
        embedder=self.embedder,
        batch_size=1,
    )
    hits_by_channel: dict[str, list[SemanticSearchResult]] = {}
    for name, vector in zip(channels, vectors):
        hits_by_channel[name] = self.indexes[name].search(
            vector,
            limit=limit,
            filters=filters,
            fetch_multiplier=self.fetch_multiplier,
        )
    return hits_by_channel
class EmbeddingCache: class EmbeddingCache:
def __init__(self, db_path: Path): def __init__(self, db_path: Path):
@ -368,12 +296,10 @@ def make_embedder(provider: str, model: str, *, dimensions: int, timeout: float)
def query_text_for_channel(channel: str, query: str, projection: QueryProjection) -> str: def query_text_for_channel(channel: str, query: str, projection: QueryProjection) -> str:
if channel in {"metadata", "summary"}: if channel == "summary":
return query return query
if channel == "entity": if channel == "entity":
return compact_join(projection.entities, limit=24) or query return compact_join(projection.entities, limit=24) or query
if channel == "constraint":
return compact_join(projection.constraints, limit=24) or query
if channel == "relation": if channel == "relation":
return "\n".join(projection.relations) or query return "\n".join(projection.relations) or query
raise ValueError(f"unknown semantic channel: {channel}") raise ValueError(f"unknown semantic channel: {channel}")
@ -405,87 +331,6 @@ def rank_single_semantic_channel(
return rows return rows
def aggregate_hybrid_entity_relation(
    channel_hits: dict[str, list[SemanticSearchResult]],
    projection: QueryProjection,
) -> list[HybridProjectionCandidate]:
    """Fuse per-channel hits into one ranked candidate list.

    Each document scores ``weight / (60 + rank)`` for every channel it
    appears in (only its first occurrence per channel counts, but ranks
    keep counting skipped duplicates), plus a small exact-match bonus.
    Candidates are ordered by descending score, then best channel rank,
    then document id.
    """
    merged: dict[str, dict[str, Any]] = {}
    for channel, results in channel_hits.items():
        weight = HYBRID_ENTITY_RELATION_WEIGHTS[channel]
        counted: set[str] = set()
        for rank, result in enumerate(results, 1):
            doc_id = str(result.external_id or result.file_ref)
            if doc_id in counted:
                continue
            counted.add(doc_id)
            if doc_id not in merged:
                # Descriptive fields come from the first hit seen for a doc.
                merged[doc_id] = {
                    "document_id": doc_id,
                    "score": 0.0,
                    "sources": [],
                    "source_type": result.source_type,
                    "source_path": result.source_path,
                    "title": result.title,
                    "metadata": result.metadata,
                }
            entry = merged[doc_id]
            entry["score"] += weight * (1 / (60 + rank))
            entry["sources"].append(
                {"channel": channel, "rank": rank, "distance": result.distance}
            )

    candidates = []
    for entry in merged.values():
        entry["score"] += exact_match_bonus(entry, projection)
        candidates.append(
            HybridProjectionCandidate(
                document_id=entry["document_id"],
                score=float(entry["score"]),
                sources=entry["sources"],
                source_type=entry["source_type"],
                source_path=entry["source_path"],
                title=entry["title"],
                metadata=entry["metadata"],
                snippet=hybrid_snippet(entry),
            )
        )

    def ranking_key(candidate):
        best_rank = min(source["rank"] for source in candidate.sources)
        return (-candidate.score, best_rank, candidate.document_id)

    candidates.sort(key=ranking_key)
    return candidates
def exact_match_bonus(item: dict[str, Any], projection: QueryProjection) -> float:
    """Score literal occurrences of query terms in a document's descriptors.

    The document's title, source path and metadata are serialized to one
    lower-cased JSON string. Each of the first 8 entities and first 6
    constraints that is at least 3 characters long and occurs in that
    string adds 0.004; the total is capped at 0.02.
    """
    descriptor = {
        "title": item.get("title", ""),
        "source_path": item.get("source_path", ""),
        "metadata": item.get("metadata", {}),
    }
    haystack = json.dumps(descriptor, ensure_ascii=False).lower()
    needles = (
        str(term).lower().strip()
        for term in (*projection.entities[:8], *projection.constraints[:6])
    )
    hit_count = sum(
        1 for needle in needles if len(needle) >= 3 and needle in haystack
    )
    return min(0.02, hit_count * 0.004)
def hybrid_snippet(item: dict[str, Any]) -> str:
    """Describe which channels (and ranks) surfaced a document.

    Lists up to four ``channel@rank`` pairs after the strategy name and
    appends the metadata topic when one is set.
    """
    leading_sources = item.get("sources", [])[:4]
    channels = ", ".join(
        f"{source['channel']}@{source['rank']}" for source in leading_sources
    )
    summary = f"hybrid_entity_relation_vector {channels}"
    topic = str((item.get("metadata") or {}).get("topic") or "").strip()
    if not topic:
        return summary
    return "; ".join([summary, f"topic: {topic}"])
def heuristic_query_projection(question: str) -> QueryProjection: def heuristic_query_projection(question: str) -> QueryProjection:
entities = dedupe( entities = dedupe(
[ [
@ -493,19 +338,11 @@ def heuristic_query_projection(question: str) -> QueryProjection:
*keyword_terms(question)[:16], *keyword_terms(question)[:16],
] ]
)[:16] )[:16]
constraints = dedupe(
[
*extract_constraint_terms(question),
*numeric_terms(question),
]
)[:12]
predicate = infer_query_predicate(question) predicate = infer_query_predicate(question)
subject = entities[0] if entities else "question" subject = entities[0] if entities else "question"
return QueryProjection( return QueryProjection(
entities=entities, entities=entities,
relations=[f"{subject} | {predicate} | {question}"], relations=[f"{subject} | {predicate} | {question}"],
constraints=constraints,
expected_answer_type=infer_answer_type(question),
) )
@ -554,24 +391,6 @@ def keyword_terms(text: str) -> list[str]:
return dedupe(terms) return dedupe(terms)
def extract_constraint_terms(text: str) -> list[str]:
    """Collect constraint-like phrases from ``text``.

    Two case-insensitive patterns are applied in order: a modal or limit
    keyword followed by up to 120 characters of the same sentence, and
    ``name = value`` / ``name: value`` assignments. Matches are stripped
    and passed through ``dedupe``.
    """
    keyword_phrase = (
        r"\b(?:must|should|required|requires?|default(?:s)?|limit(?:s)?|maximum|minimum)\b[^.!?\n]{0,120}"
    )
    assignment = r"\b[A-Za-z_][A-Za-z0-9_]{2,}\s*(?:=|:)\s*[A-Za-z0-9_.:/-]+"
    found = [
        match.strip()
        for pattern in (keyword_phrase, assignment)
        for match in re.findall(pattern, text, flags=re.IGNORECASE)
    ]
    return dedupe(found)
def numeric_terms(text: str) -> list[str]:
    """Return quantities with a recognized unit, in order of appearance.

    A quantity is an integer or decimal number, optional whitespace, and
    one of the listed units (matched case-insensitively). The unit must
    not run straight into another word character, so ``"5 msec"`` is not
    reported as ``"5 ms"``.
    """
    # A trailing ``\b`` would never match after "%" when a space or the end
    # of the string follows (both sides are non-word characters), silently
    # dropping percentages. A negative lookahead expresses the intended
    # "unit ends here" rule for word and non-word units alike.
    return re.findall(
        r"\b\d+(?:\.\d+)?\s*(?:MiB|GiB|MB|GB|ms|sec|seconds|minutes|hours|days|%|tokens?|req/s|rps)(?!\w)",
        text,
        flags=re.IGNORECASE,
    )
def infer_query_predicate(question: str) -> str: def infer_query_predicate(question: str) -> str:
lowered = question.lower() lowered = question.lower()
rules = [ rules = [
@ -589,19 +408,6 @@ def infer_query_predicate(question: str) -> str:
return "asks_about" return "asks_about"
def infer_answer_type(question: str) -> str:
    """Guess the kind of answer a question expects.

    Rules are checked in order and the first match wins:

    * contains "how many", "limit" or "size" -> ``"number_or_limit"``
    * starts with "who" -> ``"person_or_team"``
    * starts with "when" -> ``"date_or_time"``
    * contains "why" or "caused" -> ``"cause"``
    * otherwise -> ``"fact"``

    Matching is case-insensitive and ignores surrounding whitespace.
    """
    # Strip first: leading whitespace or a newline would otherwise defeat
    # the ``startswith`` checks below.
    lowered = question.lower().strip()
    if "how many" in lowered or "limit" in lowered or "size" in lowered:
        return "number_or_limit"
    if lowered.startswith("who"):
        return "person_or_team"
    if lowered.startswith("when"):
        return "date_or_time"
    if "why" in lowered or "caused" in lowered:
        return "cause"
    return "fact"
def dedupe(values: Any) -> list[str]: def dedupe(values: Any) -> list[str]:
seen = set() seen = set()
result = [] result = []

View file

@ -1,72 +0,0 @@
from __future__ import annotations
import re
from typing import Any, Iterable
# Root folder path for semantic folder projections.
SEMANTIC_FOLDER_ROOT = "/semantic"
# Field-name sets for the projection policy.
# NOTE(review): "base" vs "system" presumably separates content-derived
# facets from ingestion-derived ones; confirm with the policy owner.
SEMANTIC_FOLDER_BASE_FIELDS = {"doc_type", "domain", "topic"}
SEMANTIC_FOLDER_SYSTEM_FIELDS = {"source_type"}
# Field names that is_semantic_folder_forbidden_field() rejects (matched
# through their canonical and compact spellings).
SEMANTIC_FOLDER_FORBIDDEN_FIELDS = {
    "summary",
    "entities",
    "relations",
    "constraints",
    "retrieval_cues",
    "dataset_doc_uuid",
    "path",
    "uri",
    "source_path",
    "storage_uri",
    "title",
    "content_type",
    "created_at",
    "updated_at",
}
def canonical_semantic_folder_field_name(value: Any) -> str:
    """Normalize a field label to lower-case ``snake_case``.

    Falsy or blank input yields ``""``. Otherwise camel-case word
    boundaries get an underscore, every run of non-alphanumeric
    characters collapses to a single underscore, outer underscores are
    trimmed and the result is case-folded.
    """
    cleaned = str(value or "").strip()
    if cleaned:
        # Two passes: "FooBar" style words first, then lower/digit -> upper
        # transitions such as "fooB".
        for boundary in (r"(.)([A-Z][a-z]+)", r"([a-z0-9])([A-Z])"):
            cleaned = re.sub(boundary, r"\1_\2", cleaned)
        cleaned = re.sub(r"[^A-Za-z0-9]+", "_", cleaned).strip("_").casefold()
    return cleaned
def compact_semantic_folder_field_name(value: Any) -> str:
    """Return the canonical field name with every non ``[a-z0-9]`` run removed.

    This drops the underscores of the canonical form, so spellings such as
    ``source_type`` and ``sourcetype`` compact to the same key.
    """
    return re.sub(r"[^a-z0-9]+", "", canonical_semantic_folder_field_name(value))
def semantic_folder_field_identity_keys(value: Any) -> frozenset[str]:
    """Return the non-empty canonical and compact spellings of a field name."""
    spellings = (
        canonical_semantic_folder_field_name(value),
        compact_semantic_folder_field_name(value),
    )
    return frozenset(filter(None, spellings))
def semantic_folder_field_identity_set(fields: Iterable[Any]) -> frozenset[str]:
    """Return the union of identity keys over all ``fields``."""
    return frozenset().union(
        *(semantic_folder_field_identity_keys(field) for field in fields)
    )
# Precomputed identity keys (canonical and compact spellings) of every
# forbidden field, built once at import time.
SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES = semantic_folder_field_identity_set(
    SEMANTIC_FOLDER_FORBIDDEN_FIELDS
)
def is_semantic_folder_forbidden_field(value: Any) -> bool:
    """Report whether ``value`` names a forbidden semantic folder field.

    A field is forbidden when any of its identity keys also appears in
    ``SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES``. A value with no identity
    keys is never forbidden.
    """
    keys = semantic_folder_field_identity_keys(value)
    return not keys.isdisjoint(SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES)
def semantic_folder_allowed_extension_fields(fields: Iterable[Any]) -> set[str]:
    """Collect the canonical names of the fields that may be used as extensions.

    Fields whose canonical name is empty, and fields rejected by
    ``is_semantic_folder_forbidden_field``, are dropped.

    Args:
        fields: Candidate field names.

    Returns:
        A set of canonical names for the accepted fields.
    """
    accepted: set[str] = set()
    for candidate in fields:
        canonical = canonical_semantic_folder_field_name(candidate)
        if not canonical:
            continue
        if is_semantic_folder_forbidden_field(candidate):
            continue
        accepted.add(canonical)
    return accepted

View file

@ -308,7 +308,7 @@ def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp
assert filtered["data"][0]["summary"] == "summary for doc_10" assert filtered["data"][0]["summary"] == "summary for doc_10"
def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path): def test_browse_scopes_channel_candidates_before_candidate_limit(tmp_path):
import json import json
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
@ -738,20 +738,6 @@ def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_
assert "semantic-grep" not in rendered assert "semantic-grep" not in rendered
def test_semantic_search_scope_filters_explicit_source_type_facets():
    """Check the filters ``_semantic_filters_for_scope`` derives from a folder path.

    A ``source_type=google-drive`` path segment, with or without the
    ``/semantic`` prefix, is expected to produce ``{"source_type":
    "google_drive"}``; a plain ``/documents`` path is expected to produce no
    filters.
    """
    from pageindex.filesystem import PageIndexFileSystem

    expectations = (
        ("/source_type=google-drive", {"source_type": "google_drive"}),
        ("/semantic/source_type=google-drive", {"source_type": "google_drive"}),
        ("/documents", {}),
    )
    for folder_path, expected_filters in expectations:
        scope = {"folder_path": folder_path}
        assert PageIndexFileSystem._semantic_filters_for_scope(scope) == expected_filters
def test_grep_source_file_requires_terms_on_same_line(tmp_path): def test_grep_source_file_requires_terms_on_same_line(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
@ -904,7 +890,7 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
filesystem.configure_existing_projection_retrieval() filesystem.configure_existing_projection_retrieval()
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path): def test_browse_semantic_files_uses_summary_projection_when_only_summary_available(tmp_path):
from pageindex.filesystem import PageIndexFileSystem from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
from pageindex.filesystem.metadata_generation import MetadataGenerationResult from pageindex.filesystem.metadata_generation import MetadataGenerationResult
@ -961,9 +947,14 @@ def test_default_semantic_search_uses_summary_projection_when_only_summary_avail
}, },
) )
assert filesystem.search("purchase order exposure", semantic=False) == [] assert filesystem.search("purchase order exposure") == []
results = filesystem.search("purchase order exposure", semantic=True) results = filesystem.browse_semantic_files(
"/documents",
"purchase order exposure",
recursive=True,
page_size=5,
)
assert [result.external_id for result in results] == ["doc_summary_only"] assert [item["external_id"] for item in results["data"]] == ["doc_summary_only"]
assert results[0].snippet == "summary_vector rank=1" assert results["data"][0]["snippet"] == "summary_vector rank=1"

View file

@ -158,13 +158,13 @@ def test_add_configures_semantic_retrieval_in_same_filesystem_instance(tmp_path)
filesystem.add_file(source, "/documents") filesystem.add_file(source, "/documents")
assert filesystem.semantic_retrieval_channels() == ("summary",) assert filesystem.semantic_retrieval_channels() == ("summary",)
results = filesystem.search_semantic_channel( results = filesystem.browse_semantic_files(
"summary", "/documents",
"semantic recall", "semantic recall",
scope={"folder_path": "/documents", "recursive": True}, recursive=True,
limit=5, page_size=5,
) )
assert [result.source_path for result in results] == ["documents/semantic.txt"] assert [item["source_path"] for item in results["data"]] == ["documents/semantic.txt"]
def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monkeypatch): def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monkeypatch):

View file

@ -263,7 +263,7 @@ class PIFSAgentStreamTest(unittest.TestCase):
): ):
self.assertNotIn(old_command, prompt_surface) self.assertNotIn(old_command, prompt_surface)
def test_demo_prompt_uses_browse_strategy_and_not_legacy_semantic_search(self): def test_demo_prompt_uses_browse_strategy_and_not_old_vector_commands(self):
demo_prompt = load_demo_agent_prompt() demo_prompt = load_demo_agent_prompt()
self.assertIn("Start with ls or tree", demo_prompt) self.assertIn("Start with ls or tree", demo_prompt)

View file

@ -46,7 +46,6 @@ def test_descendant_folder_filter_treats_underscore_literally(tmp_path):
folder_id = filesystem.folder_info("/proj_1")["folder_id"] folder_id = filesystem.folder_info("/proj_1")["folder_id"]
scoped_results = filesystem.search( scoped_results = filesystem.search(
scope={"folder_id": folder_id, "recursive": True}, scope={"folder_id": folder_id, "recursive": True},
semantic=False,
limit=10, limit=10,
) )
ranked_folders = { ranked_folders = {
@ -102,12 +101,10 @@ def test_metadata_contains_treats_percent_and_underscore_literally(tmp_path):
percent_results = filesystem.search( percent_results = filesystem.search(
metadata_filter={"status": {"$contains": "100% done"}}, metadata_filter={"status": {"$contains": "100% done"}},
semantic=False,
limit=10, limit=10,
) )
underscore_results = filesystem.search( underscore_results = filesystem.search(
metadata_filter={"status": {"$contains": "build_alpha"}}, metadata_filter={"status": {"$contains": "build_alpha"}},
semantic=False,
limit=10, limit=10,
) )