refactor(pifs): remove dead semantic retrieval code (#33)

This commit is contained in:
Bukely_ 2026-05-31 22:22:05 +08:00 committed by GitHub
parent d3034fa1b9
commit 0f71da3bc1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 24 additions and 691 deletions

View file

@ -18,7 +18,7 @@ class PIFSCommandError(ValueError):
class PIFSCommandExecutor:
FORBIDDEN_SUBSTRINGS = (";", "`", "$(", "||", "\n", "\r")
FORBIDDEN_TOKENS = {"|", ">", "<", ">>", "<<", "&"}
BASE_ALLOWED_COMMANDS = {
COMMAND_NAMES = {
"ls",
"tree",
"find",
@ -30,9 +30,7 @@ class PIFSCommandExecutor:
"tail",
"sed",
}
ALLOWED_COMMANDS = BASE_ALLOWED_COMMANDS
ALLOWED_PIPE_FILTERS = {"head", "tail", "grep", "sed"}
COMMAND_METHODS = {}
MAX_CHAINED_COMMANDS = 3
MAX_PIPE_COMMANDS = 3
MAX_LS_LIMIT = 100
@ -65,7 +63,7 @@ class PIFSCommandExecutor:
self.query_context = query_context
def allowed_commands(self) -> set[str]:
return set(self.BASE_ALLOWED_COMMANDS)
return set(self.COMMAND_NAMES)
def command_capabilities(self) -> dict[str, Any]:
return {
@ -149,8 +147,7 @@ class PIFSCommandExecutor:
name = tokens[0]
if name not in self.allowed_commands():
raise PIFSCommandError(f"Unsupported command: {name}")
method_name = self.COMMAND_METHODS.get(name, f"_cmd_{name}")
data = getattr(self, method_name)(tokens[1:])
data = getattr(self, f"_cmd_{name}")(tokens[1:])
return self._render(data, json_output=json_output, command_name=name)
def _execute_pipe_filter(self, input_text: str, command: str) -> str:
@ -375,7 +372,6 @@ class PIFSCommandExecutor:
scope=scope,
metadata_filter=where,
limit=limit,
semantic=False,
)
def _cmd_grep(self, args: list[str]) -> Any:
@ -423,7 +419,6 @@ class PIFSCommandExecutor:
scope={"folder_path": normalized, "recursive": False},
metadata_filter=where,
limit=limit,
semantic=False,
)
if direct_results:
return {
@ -471,7 +466,6 @@ class PIFSCommandExecutor:
scope={"folder_path": normalized, "recursive": recursive},
metadata_filter=where,
limit=limit,
semantic=False,
)
if not results and where is None:
source_hits = self._grep_source_file_hits(normalized, query, limit=limit)
@ -1240,23 +1234,6 @@ class PIFSCommandExecutor:
return f"{folder}/{title}" if folder else f"/{title}"
return str(item.get("source_path") or item.get("external_id") or file_ref or "-")
def _stable_file_target_path(self, item: dict[str, Any]) -> str:
file_ref = str(item.get("file_ref") or "").strip()
source_path = str(item.get("source_path") or "").strip()
if source_path:
target = "/" + source_path.strip("/")
try:
if not file_ref or self.filesystem.store.resolve_file_ref(target) == file_ref:
return target
except KeyError:
pass
external_id = str(item.get("external_id") or "").strip()
if external_id:
return external_id
if file_ref:
return file_ref
return str(item.get("external_id") or item.get("file_ref") or "-")
def _semantic_retrieval_query(self, query: str) -> str:
query = str(query or "").strip()
context = str(self.query_context or "").strip()
@ -1326,7 +1303,6 @@ class PIFSCommandExecutor:
scope={"folder_path": child["path"], "recursive": True},
metadata_filter=metadata_filter,
limit=max(limit, 50),
semantic=False,
)
if not results:
continue

View file

@ -17,14 +17,6 @@ from .metadata_generation import (
MetadataGenerator,
)
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
from .semantic_folder_policy import (
SEMANTIC_FOLDER_BASE_FIELDS,
SEMANTIC_FOLDER_ROOT,
SEMANTIC_FOLDER_SYSTEM_FIELDS,
canonical_semantic_folder_field_name,
is_semantic_folder_forbidden_field,
semantic_folder_allowed_extension_fields,
)
from .store import (
SQLiteFileSystemStore,
fingerprint,
@ -571,8 +563,7 @@ class PageIndexFileSystem:
)
offset = (page - 1) * page_size
needed = offset + page_size + 1
semantic_filters = self._semantic_filters_for_scope(scope)
semantic_filters["file_ref"] = scope_file_refs
semantic_filters = {"file_ref": scope_file_refs}
candidates = (
search_channel(
space,
@ -695,95 +686,14 @@ class PageIndexFileSystem:
def attach_files_to_folders(self, items: list[dict[str, Any]]) -> None:
    """Forward the attachment items unchanged to the backing store."""
    self.store.attach_files_to_folders(items)
def apply_semantic_folder_projection(
    self,
    projection_plan: dict[str, Any],
    *,
    file_ref_by_document_id: Optional[dict[str, str]] = None,
) -> dict[str, Any]:
    """Create a plan's projection folders and attach its file memberships.

    Registration stays the explicit folder-placement step; this is the
    separate API that adds derived `/semantic/...` memberships.

    Args:
        projection_plan: Mapping with optional ``folders``, ``memberships``
            and ``policy`` entries.
        file_ref_by_document_id: Optional lookup from document id to file
            ref; ids missing from it are resolved through the store.

    Returns:
        A summary with the projection label and the number of folders
        applied and memberships attached.

    Raises:
        ValueError: If any folder or membership fails validation.
    """
    planned_folders = list(projection_plan.get("folders") or [])
    planned_memberships = list(projection_plan.get("memberships") or [])
    raw_policy = projection_plan.get("policy")
    policy = raw_policy if isinstance(raw_policy, dict) else {}
    extension_fields = semantic_folder_allowed_extension_fields(
        policy.get("allowed_extension_fields", [])
    )

    # Validate every item (folders first, then memberships) before any
    # folder is created.
    for entry in [*planned_folders, *planned_memberships]:
        self._validate_semantic_folder_projection_item(entry, extension_fields)

    for folder in planned_folders:
        raw_metadata = folder.get("metadata")
        self.create_folder(
            self._validate_semantic_folder_projection_path(str(folder["path"])),
            kind=str(folder.get("kind") or "semantic_projection"),
            description=str(folder.get("description") or ""),
            metadata=raw_metadata if isinstance(raw_metadata, dict) else {},
        )

    known_refs = file_ref_by_document_id or {}
    attachments: list[dict[str, Any]] = []
    for membership in planned_memberships:
        document_id = self._semantic_folder_projection_document_id(membership)
        file_ref = known_refs.get(document_id) or self.store.resolve_file_ref(document_id)

        raw_folder_metadata = membership.get("folder_metadata")
        metadata = dict(raw_folder_metadata) if isinstance(raw_folder_metadata, dict) else {}
        metadata["projection"] = "Semantic Folder Projection"
        metadata["field"] = membership.get("field", "")
        metadata["value"] = membership.get("value", "")
        metadata["mount_kind"] = membership.get("mount_kind", "semantic_folder_projection")

        target_folder = self._validate_semantic_folder_projection_path(
            str(membership["folder_path"])
        )
        attachments.append(
            {"file_ref": file_ref, "folder": target_folder, "metadata": metadata}
        )

    self.attach_files_to_folders(attachments)
    return {
        "projection": "Semantic Folder Projection",
        "folders_applied": len(planned_folders),
        "memberships_attached": len(attachments),
    }
def search(
self,
query: Union[str, list[str], None] = None,
scope: Optional[dict[str, Any]] = None,
metadata_filter: Optional[dict[str, Any] | str] = None,
limit: int = 10,
semantic: bool = True,
) -> list[SearchResult]:
parsed_filter = self.metadata.parse_filter(metadata_filter)
if semantic and self._should_use_semantic_retrieval(query, scope):
semantic_results = self._semantic_search(
query,
scope=scope,
metadata_filter=parsed_filter,
limit=limit,
)
if semantic_results:
return semantic_results
rows = self.store.search_files(
query,
scope=scope,
@ -821,30 +731,6 @@ class PageIndexFileSystem:
)
return results
def search_semantic_channel(
    self,
    channel: str,
    query: Union[str, list[str], None],
    *,
    scope: Optional[dict[str, Any]] = None,
    metadata_filter: Optional[dict[str, Any] | str] = None,
    limit: int = 10,
) -> list[SearchResult]:
    """Search a single semantic channel.

    The metadata filter is parsed first, so an invalid filter is reported
    even when the search itself would be skipped. Returns an empty list
    when no backend is configured, the channel is unavailable, or the
    query has no text.
    """
    parsed_filter = self.metadata.parse_filter(metadata_filter)
    searchable = (
        self.semantic_retrieval_backend is not None
        and self.has_semantic_channel(channel)
        and self._query_text(query)
    )
    if not searchable:
        return []
    return self._semantic_search(
        query,
        scope=scope,
        metadata_filter=parsed_filter,
        limit=limit,
        channel=channel,
    )
def configure_hybrid_projection_retrieval(
self,
index_dir: Union[str, Path],
@ -853,7 +739,6 @@ class PageIndexFileSystem:
embedding_model: str = "text-embedding-3-small",
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_timeout: float = 60,
per_channel_limit: int = 100,
fetch_multiplier: int = 100,
) -> Any:
from .hybrid_projection import HybridProjectionSearchBackend
@ -864,7 +749,6 @@ class PageIndexFileSystem:
embedding_model=embedding_model,
embedding_dimensions=embedding_dimensions,
embedding_timeout=embedding_timeout,
per_channel_limit=per_channel_limit,
fetch_multiplier=fetch_multiplier,
)
return self.semantic_retrieval_backend
@ -905,30 +789,6 @@ class PageIndexFileSystem:
},
}
def find(
    self,
    target: str,
    patterns: Union[str, list[str]],
    limit: int = 20,
) -> list[OpenResult]:
    """Return line windows around case-insensitive substring matches.

    Each matching line yields one window covering the line before and the
    line after it, clamped to the file bounds. Scanning stops once the
    number of collected windows reaches ``limit``.
    """
    file_ref = self._resolve_target(target)
    raw_patterns = [patterns] if isinstance(patterns, str) else list(patterns)
    needles = [needle.lower() for needle in raw_patterns if needle]
    if not needles:
        return []

    lines = self.store.read_text(file_ref).splitlines()
    last_line = len(lines)
    windows = []
    for line_no, line in enumerate(lines, 1):
        lowered = line.lower()
        if not any(needle in lowered for needle in needles):
            continue
        first = max(1, line_no - 1)
        final = min(last_line, line_no + 1)
        windows.append(self._open_lines(file_ref, first, final))
        if len(windows) >= limit:
            break
    return windows
def open(self, target: str, location: str = "all") -> OpenResult:
file_ref = self._resolve_target(target)
entry = self.store.get_file(file_ref)
@ -1387,15 +1247,6 @@ class PageIndexFileSystem:
metadata = file.get("metadata") or {}
if not isinstance(metadata, dict):
raise ValueError("metadata must be a JSON object")
legacy_value_key = "derived_" + "metadata"
legacy_policy_key = "metadata_" + "generation_policy"
legacy_status_key = "metadata_" + "generation_status"
if legacy_value_key in file:
raise ValueError("legacy generated metadata map has been removed; put values in metadata")
if legacy_policy_key in file:
raise ValueError("legacy metadata policy key has been renamed to metadata_policy")
if legacy_status_key in file:
raise ValueError("legacy metadata status key has been renamed to metadata_status")
self._validate_register_metadata(metadata)
external_id = file.get("external_id")
content = file.get("content") or ""
@ -1946,93 +1797,6 @@ class PageIndexFileSystem:
def _resolve_target(self, target: str) -> str:
return self.store.resolve_file_ref(target)
def _should_use_semantic_retrieval(
self,
query: Union[str, list[str], None],
scope: Optional[dict[str, Any]],
) -> bool:
if self.semantic_retrieval_backend is None:
return False
if not self._query_text(query):
return False
if not scope:
return True
return bool(scope.get("recursive", True))
def _semantic_search(
    self,
    query: Union[str, list[str], None],
    *,
    scope: Optional[dict[str, Any]],
    metadata_filter: Optional[dict[str, Any]],
    limit: int,
    channel: str | None = None,
) -> list[SearchResult]:
    """Fetch backend candidates and turn catalog matches into results.

    The backend is asked for ``max(limit * 10, 50)`` candidates, either
    from one channel (when ``channel`` is given and the backend exposes
    ``search_channel``) or from its default ``search``. Candidates whose
    document id is unknown to the store, that repeat a file already seen,
    or that fail the scope/metadata check are dropped.
    """
    backend = self.semantic_retrieval_backend
    if backend is None:
        return []

    filters = self._semantic_filters_for_scope(scope)
    fetch_limit = max(limit * 10, 50)
    query_text = self._query_text(query)
    if channel:
        channel_search = getattr(backend, "search_channel", None)
        if channel_search is None:
            return []
        candidates = channel_search(
            channel,
            query_text,
            limit=fetch_limit,
            filters=filters,
        )
    else:
        candidates = backend.search(
            query_text,
            limit=fetch_limit,
            filters=filters,
        )

    collected: list[SearchResult] = []
    seen_refs: set[str] = set()
    scope_path = self._scope_folder_path(scope)
    for candidate in candidates:
        try:
            file_ref = self.store.resolve_file_ref(candidate.document_id)
        except KeyError:
            # The candidate is not in the catalog.
            continue
        if file_ref in seen_refs:
            continue
        if not self.store.file_matches(
            file_ref, scope=scope, metadata_filter=metadata_filter
        ):
            continue
        seen_refs.add(file_ref)

        entry = self.store.get_file(file_ref)
        folder_paths = [
            membership["path"]
            for membership in self.store.folder_memberships(file_ref)
        ]
        preferred_folder = self._preferred_folder_path(
            folder_paths, scope_path, entry.folder_path
        )
        collected.append(
            SearchResult(
                file_ref=file_ref,
                external_id=entry.external_id,
                title=entry.title,
                snippet=candidate.snippet or entry.descriptor,
                folder_path=preferred_folder,
                folder_paths=folder_paths,
                metadata=entry.metadata,
                metadata_status=entry.metadata_status,
                source_path=entry.source_path,
                id=entry.external_id or file_ref,
                document_id=entry.external_id,
                name=entry.title,
                description=entry.descriptor,
                status=entry.pageindex_tree_status,
                pageNum=None,
                createdAt=None,
                folderId=None,
            )
        )
        if len(collected) >= limit:
            break
    return collected
@staticmethod
def _semantic_candidate_score(candidate: Any) -> float | None:
try:
@ -2348,135 +2112,6 @@ class PageIndexFileSystem:
path = scope.get("folder_path") or scope.get("path")
return normalize_path(path) if path else None
@classmethod
def _semantic_filters_for_scope(cls, scope: Optional[dict[str, Any]]) -> dict[str, Any]:
    """Build backend filters from the scope's folder path.

    Returns ``{"source_type": ...}`` when the path encodes a source type,
    and an empty dict for a missing path, the root path, or any other path.
    """
    folder = cls._scope_folder_path(scope)
    if folder and folder != "/":
        source_type = cls._source_type_filter_from_path(folder)
        if source_type:
            return {"source_type": source_type}
    return {}
@staticmethod
def _source_type_filter_from_path(path: str) -> str:
    """Extract a source type from a ``source_type=<value>`` path segment.

    Only the first segment is inspected, after skipping a leading
    semantic-root segment if present. Dashes in the value are turned into
    underscores. Returns ``""`` when no such segment is found.
    """
    segments = [segment for segment in path.strip("/").split("/") if segment]
    if not segments:
        return ""
    if segments[0] == SEMANTIC_FOLDER_ROOT.strip("/"):
        segments = segments[1:]
    if not segments:
        return ""
    key, separator, value = segments[0].partition("=")
    if key == "source_type" and separator:
        return value.replace("-", "_")
    # The original ended with a semantic-root check whose branch returned
    # the same "" as the fallthrough; that dead distinction is dropped.
    return ""
@classmethod
def _validate_semantic_folder_projection_item(
    cls,
    item: dict[str, Any],
    allowed_extension_fields: set[str],
) -> None:
    """Validate one projection folder or membership item.

    Checks that the item has a folder path under the semantic root, does
    not carry ``dataset_doc_uuid``, and only uses allowed fields in its
    explicit ``field``, its path segments, and its metadata payloads.

    Raises:
        ValueError: On the first violation found.
    """
    path = item.get("folder_path") or item.get("path")
    if not path:
        raise ValueError("Semantic Folder Projection items must include a folder path")
    path_text = str(path)
    cls._validate_semantic_folder_projection_path(path_text)

    allowed_fields = (
        SEMANTIC_FOLDER_BASE_FIELDS
        | SEMANTIC_FOLDER_SYSTEM_FIELDS
        | allowed_extension_fields
    )
    if item.get("dataset_doc_uuid"):
        raise ValueError(
            "dataset_doc_uuid is not allowed in Semantic Folder Projection memberships; "
            "use file_key or file_ref"
        )

    explicit_field = cls._canonical_semantic_folder_field_name(item.get("field"))
    candidate_fields = [explicit_field] if explicit_field else []
    candidate_fields += cls._semantic_folder_projection_fields_from_path(path_text)

    # Metadata payloads are checked before the collected field names.
    for payload_key in ("metadata", "folder_metadata"):
        cls._validate_semantic_folder_projection_metadata_payload(
            item.get(payload_key),
            allowed_fields,
        )
    for field in candidate_fields:
        if is_semantic_folder_forbidden_field(field) or field not in allowed_fields:
            raise ValueError(f"Field is not allowed for Semantic Folder Projection: {field}")
@staticmethod
def _validate_semantic_folder_projection_path(path: str) -> str:
    """Normalize ``path`` and require it to be the semantic root or below it.

    Returns the normalized path.

    Raises:
        ValueError: If the normalized path is outside the semantic root.
    """
    normalized = normalize_path(path)
    is_root = normalized == SEMANTIC_FOLDER_ROOT
    if is_root or normalized.startswith(f"{SEMANTIC_FOLDER_ROOT}/"):
        return normalized
    raise ValueError("Semantic Folder Projection paths must be under /semantic")
@classmethod
def _semantic_folder_projection_fields_from_path(cls, path: str) -> list[str]:
    """List the canonical field names encoded as ``field=value`` segments.

    The first path segment is skipped; segments without ``=`` and names
    that canonicalize to an empty string are ignored.
    """
    normalized = cls._validate_semantic_folder_projection_path(path)
    tail_segments = normalized.strip("/").split("/")[1:]
    canonical_names = (
        cls._canonical_semantic_folder_field_name(segment.split("=", 1)[0])
        for segment in tail_segments
        if "=" in segment
    )
    return [name for name in canonical_names if name]
@classmethod
def _validate_semantic_folder_projection_metadata_payload(
    cls,
    payload: Any,
    allowed_fields: set[str],
) -> None:
    """Recursively reject forbidden field names inside a metadata payload.

    Dict keys, the values of ``field``/``source_field``/``metadata_field``
    keys, and bare strings are all checked; lists and nested dicts are
    walked. Other payload types are accepted as-is.

    Raises:
        ValueError: On the first forbidden or disallowed field found.
    """
    canonical = cls._canonical_semantic_folder_field_name
    recurse = cls._validate_semantic_folder_projection_metadata_payload
    field_reference_keys = {"field", "source_field", "metadata_field"}

    if isinstance(payload, dict):
        for key, value in payload.items():
            key_text = str(key)
            key_field = canonical(key)
            if is_semantic_folder_forbidden_field(key_field):
                raise ValueError(
                    "Forbidden metadata field in Semantic Folder Projection payload: "
                    f"{key_text}"
                )
            if key_field in field_reference_keys:
                referenced = canonical(value)
                if referenced and (
                    is_semantic_folder_forbidden_field(referenced)
                    or referenced not in allowed_fields
                ):
                    raise ValueError(
                        f"Field is not allowed for Semantic Folder Projection: {referenced}"
                    )
            recurse(value, allowed_fields)
        return

    if isinstance(payload, list):
        for element in payload:
            recurse(element, allowed_fields)
        return

    if isinstance(payload, str):
        if is_semantic_folder_forbidden_field(canonical(payload)):
            raise ValueError(
                "Forbidden metadata field label in Semantic Folder Projection payload: "
                f"{payload}"
            )
@staticmethod
def _canonical_semantic_folder_field_name(value: Any) -> str:
    """Delegate to the module-level ``canonical_semantic_folder_field_name``."""
    canonical = canonical_semantic_folder_field_name(value)
    return canonical
@staticmethod
def _semantic_folder_projection_document_id(membership: dict[str, Any]) -> str:
for key in ("file_key", "file_ref", "document_ref"):
value = str(membership.get(key) or "").strip()
if value:
return value
raise ValueError("Semantic Folder Projection membership is missing file_key or file_ref")
@staticmethod
def _query_text(query: Union[str, list[str], None]) -> str:
if query is None:

View file

@ -15,28 +15,17 @@ from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, Semantic
INDEX_BY_CHANNEL = {
"metadata": "metadata_composite_vector",
"summary": "summary_only_vector",
"entity": "entity_vectors",
"constraint": "constraint_vectors",
"relation": "relation_vectors",
}
HYBRID_ENTITY_RELATION_CHANNELS = ("metadata", "entity", "constraint", "relation")
SEMANTIC_TOOL_CHANNELS = ("summary", "entity", "relation")
HYBRID_ENTITY_RELATION_WEIGHTS = {
"metadata": 0.25,
"entity": 0.25,
"relation": 0.30,
"constraint": 0.20,
}
@dataclass(frozen=True)
class QueryProjection:
    """Immutable per-channel view of a query used to build channel query texts."""

    # Terms joined into the query text for the "entity" channel.
    entities: list[str]
    # Lines joined with newlines into the query text for the "relation" channel.
    relations: list[str]
    # Terms joined into the query text for the "constraint" channel.
    constraints: list[str]
    # Coarse label for the kind of answer expected; empty when not set.
    expected_answer_type: str = ""
@dataclass(frozen=True)
@ -52,7 +41,7 @@ class HybridProjectionCandidate:
class HybridProjectionSearchBackend:
"""Hybrid entity/relation/vector retrieval over rebuildable projection indexes.
"""Semantic channel retrieval over rebuildable projection indexes.
The SQLite catalog remains the source of truth. This backend only reads
external sqlite-vec projection indexes and returns candidate document ids
@ -68,7 +57,6 @@ class HybridProjectionSearchBackend:
embedding_model: str,
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
embedding_cache_path: str | Path | None = None,
per_channel_limit: int = 100,
fetch_multiplier: int = 100,
) -> None:
self.index_dir = Path(index_dir).expanduser()
@ -82,7 +70,6 @@ class HybridProjectionSearchBackend:
if embedding_cache_path is not None
else self.index_dir / "embedding_cache.sqlite"
)
self.per_channel_limit = per_channel_limit
self.fetch_multiplier = fetch_multiplier
self.indexes = {
channel: SQLiteVecSemanticIndex(self.index_dir / f"{index_name}.sqlite")
@ -114,35 +101,6 @@ class HybridProjectionSearchBackend:
**kwargs,
)
def search(
    self,
    query: str,
    *,
    limit: int = 10,
    filters: dict[str, Any] | None = None,
) -> list[HybridProjectionCandidate]:
    """Run the hybrid search across every populated hybrid channel.

    When none of the hybrid channels has documents, falls back to the
    ``summary`` channel if it is populated. Returns an empty list for an
    empty query or when no channel has documents.
    """
    query = normalize_text(query)
    if not query:
        return []

    projection = heuristic_query_projection(query)
    populated = tuple(
        name
        for name in HYBRID_ENTITY_RELATION_CHANNELS
        if self._channel_document_count(name) > 0
    )
    if populated:
        hits_by_channel = self._search_channels(
            query=query,
            projection=projection,
            # Over-fetch per channel so aggregation has enough to rank.
            limit=max(limit, self.per_channel_limit),
            filters=filters,
            channels=populated,
        )
        ranked = aggregate_hybrid_entity_relation(hits_by_channel, projection)
        return ranked[:limit]

    if self._channel_document_count("summary") > 0:
        return self.search_channel("summary", query, limit=limit, filters=filters)
    return []
def search_channel(
self,
channel: str,
@ -187,7 +145,7 @@ class HybridProjectionSearchBackend:
"embedding_provider": self.embedding_provider,
"embedding_model": self.embedding_model,
"embedding_dimensions": self.embedding_dimensions,
"strategy": "hybrid_entity_relation_vector",
"strategy": "semantic_channel_vector",
"available_channels": list(self.available_channels()),
"channels": {
channel: self._safe_channel_info(channel)
@ -221,36 +179,6 @@ class HybridProjectionSearchBackend:
}
return {**info, "available": int(info.get("document_count") or 0) > 0}
def _search_channels(
    self,
    *,
    query: str,
    projection: QueryProjection,
    limit: int,
    filters: dict[str, Any] | None,
    channels: tuple[str, ...],
) -> dict[str, list[SemanticSearchResult]]:
    """Embed one query text per channel and search each channel's index.

    Returns a mapping from channel name to that channel's search results.
    """
    channel_texts = [
        query_text_for_channel(name, query, projection) for name in channels
    ]
    vectors = self.embedding_cache.embed_texts(
        channel_texts,
        provider=self.embedding_provider,
        model=self.cache_model,
        embedder=self.embedder,
        batch_size=1,
    )
    hits: dict[str, list[SemanticSearchResult]] = {}
    for name, vector in zip(channels, vectors):
        hits[name] = self.indexes[name].search(
            vector,
            limit=limit,
            filters=filters,
            fetch_multiplier=self.fetch_multiplier,
        )
    return hits
class EmbeddingCache:
def __init__(self, db_path: Path):
@ -368,12 +296,10 @@ def make_embedder(provider: str, model: str, *, dimensions: int, timeout: float)
def query_text_for_channel(channel: str, query: str, projection: QueryProjection) -> str:
if channel in {"metadata", "summary"}:
if channel == "summary":
return query
if channel == "entity":
return compact_join(projection.entities, limit=24) or query
if channel == "constraint":
return compact_join(projection.constraints, limit=24) or query
if channel == "relation":
return "\n".join(projection.relations) or query
raise ValueError(f"unknown semantic channel: {channel}")
@ -405,87 +331,6 @@ def rank_single_semantic_channel(
return rows
def aggregate_hybrid_entity_relation(
    channel_hits: dict[str, list[SemanticSearchResult]],
    projection: QueryProjection,
) -> list[HybridProjectionCandidate]:
    """Fuse per-channel rankings into one scored candidate list.

    Each document earns ``weight * 1 / (60 + rank)`` from every channel it
    appears in (first occurrence per channel only), plus an exact-match
    bonus. Candidates are ordered by descending score, then best rank,
    then document id.
    """
    merged: dict[str, dict[str, Any]] = {}
    for channel, results in channel_hits.items():
        weight = HYBRID_ENTITY_RELATION_WEIGHTS[channel]
        counted: set[str] = set()
        for rank, result in enumerate(results, 1):
            doc_id = str(result.external_id or result.file_ref)
            if doc_id in counted:
                continue
            counted.add(doc_id)
            if doc_id not in merged:
                # Descriptive fields come from the first hit seen for the document.
                merged[doc_id] = {
                    "document_id": doc_id,
                    "score": 0.0,
                    "sources": [],
                    "source_type": result.source_type,
                    "source_path": result.source_path,
                    "title": result.title,
                    "metadata": result.metadata,
                }
            entry = merged[doc_id]
            entry["score"] += weight * (1 / (60 + rank))
            entry["sources"].append(
                {"channel": channel, "rank": rank, "distance": result.distance}
            )

    candidates = []
    for entry in merged.values():
        entry["score"] += exact_match_bonus(entry, projection)
        candidates.append(
            HybridProjectionCandidate(
                document_id=entry["document_id"],
                score=float(entry["score"]),
                sources=entry["sources"],
                source_type=entry["source_type"],
                source_path=entry["source_path"],
                title=entry["title"],
                metadata=entry["metadata"],
                snippet=hybrid_snippet(entry),
            )
        )

    def _ordering(candidate: HybridProjectionCandidate) -> tuple:
        best_rank = min(source["rank"] for source in candidate.sources)
        return (-candidate.score, best_rank, candidate.document_id)

    candidates.sort(key=_ordering)
    return candidates
def exact_match_bonus(item: dict[str, Any], projection: QueryProjection) -> float:
    """Score a small bonus for projection terms found verbatim in the item.

    The haystack is the lower-cased JSON dump of the item's title, source
    path and metadata. The first 8 entities and first 6 constraints are
    tried; terms shorter than 3 characters are ignored. Each hit adds
    0.004, capped at 0.02.
    """
    searchable = {
        "title": item.get("title", ""),
        "source_path": item.get("source_path", ""),
        "metadata": item.get("metadata", {}),
    }
    haystack = json.dumps(searchable, ensure_ascii=False).lower()
    needles = (
        str(term).lower().strip()
        for term in [*projection.entities[:8], *projection.constraints[:6]]
    )
    matched = sum(1 for needle in needles if len(needle) >= 3 and needle in haystack)
    return min(0.02, matched * 0.004)
def hybrid_snippet(item: dict[str, Any]) -> str:
    """Render a short provenance snippet for an aggregated candidate.

    Lists up to four ``channel@rank`` sources and appends the metadata
    topic when one is present.
    """
    labels = [
        f"{source['channel']}@{source['rank']}"
        for source in item.get("sources", [])[:4]
    ]
    header = "hybrid_entity_relation_vector " + ", ".join(labels)
    topic = str((item.get("metadata") or {}).get("topic") or "").strip()
    return f"{header}; topic: {topic}" if topic else header
def heuristic_query_projection(question: str) -> QueryProjection:
entities = dedupe(
[
@ -493,19 +338,11 @@ def heuristic_query_projection(question: str) -> QueryProjection:
*keyword_terms(question)[:16],
]
)[:16]
constraints = dedupe(
[
*extract_constraint_terms(question),
*numeric_terms(question),
]
)[:12]
predicate = infer_query_predicate(question)
subject = entities[0] if entities else "question"
return QueryProjection(
entities=entities,
relations=[f"{subject} | {predicate} | {question}"],
constraints=constraints,
expected_answer_type=infer_answer_type(question),
)
@ -554,24 +391,6 @@ def keyword_terms(text: str) -> list[str]:
return dedupe(terms)
def extract_constraint_terms(text: str) -> list[str]:
    """Collect constraint-like phrases from ``text``, deduplicated.

    Two shapes are matched case-insensitively: a modal/limit keyword with
    up to 120 following characters of the same sentence, and
    ``name = value`` / ``name: value`` assignments.
    """
    patterns = (
        r"\b(?:must|should|required|requires?|default(?:s)?|limit(?:s)?|maximum|minimum)\b[^.!?\n]{0,120}",
        r"\b[A-Za-z_][A-Za-z0-9_]{2,}\s*(?:=|:)\s*[A-Za-z0-9_.:/-]+",
    )
    found = [
        hit.strip()
        for pattern in patterns
        for hit in re.findall(pattern, text, flags=re.IGNORECASE)
    ]
    return dedupe(found)
def numeric_terms(text: str) -> list[str]:
    """Return number-with-unit phrases found in ``text``, in order of appearance.

    Recognizes an integer or decimal followed by optional whitespace and a
    size, time, token or rate unit, or a percent sign. Matching is
    case-insensitive.
    """
    # The word boundary is required only after alphabetic units. "%" is not
    # a word character, so a trailing \b after it would demand that a word
    # character follow, and ordinary text such as "50% of" or a final "50%"
    # would never match.
    return re.findall(
        r"\b\d+(?:\.\d+)?\s*"
        r"(?:(?:MiB|GiB|MB|GB|ms|sec|seconds|minutes|hours|days|tokens?|req/s|rps)\b|%)",
        text,
        flags=re.IGNORECASE,
    )
def infer_query_predicate(question: str) -> str:
lowered = question.lower()
rules = [
@ -589,19 +408,6 @@ def infer_query_predicate(question: str) -> str:
return "asks_about"
def infer_answer_type(question: str) -> str:
    """Classify the expected answer type using simple keyword rules.

    Rules are applied in order: quantity/limit keywords anywhere, then a
    leading "who" or "when", then cause keywords anywhere; anything else
    is ``"fact"``.
    """
    lowered = question.lower()
    if any(marker in lowered for marker in ("how many", "limit", "size")):
        return "number_or_limit"
    for prefix, answer_type in (("who", "person_or_team"), ("when", "date_or_time")):
        if lowered.startswith(prefix):
            return answer_type
    if any(marker in lowered for marker in ("why", "caused")):
        return "cause"
    return "fact"
def dedupe(values: Any) -> list[str]:
seen = set()
result = []

View file

@ -1,72 +0,0 @@
from __future__ import annotations
import re
from typing import Any, Iterable
# Root folder path for Semantic Folder Projection paths.
SEMANTIC_FOLDER_ROOT = "/semantic"
# Base metadata field names for semantic folders.
SEMANTIC_FOLDER_BASE_FIELDS = {"doc_type", "domain", "topic"}
# System-level field names for semantic folders.
SEMANTIC_FOLDER_SYSTEM_FIELDS = {"source_type"}
# Field names that may never be used as semantic folder fields. Matching is
# done on canonical/compact identities (see
# is_semantic_folder_forbidden_field), so spelling variants of these names
# are rejected as well.
SEMANTIC_FOLDER_FORBIDDEN_FIELDS = {
"summary",
"entities",
"relations",
"constraints",
"retrieval_cues",
"dataset_doc_uuid",
"path",
"uri",
"source_path",
"storage_uri",
"title",
"content_type",
"created_at",
"updated_at",
}
def canonical_semantic_folder_field_name(value: Any) -> str:
    """Convert a field label to its canonical lower snake_case name.

    CamelCase boundaries become underscores, every run of
    non-alphanumeric characters collapses to a single underscore, and
    leading/trailing underscores are removed. Falsy or blank input yields
    an empty string.
    """
    name = str(value or "").strip()
    if not name:
        return ""
    camel_boundaries = (r"(.)([A-Z][a-z]+)", r"([a-z0-9])([A-Z])")
    for boundary in camel_boundaries:
        name = re.sub(boundary, r"\1_\2", name)
    collapsed = re.sub(r"[^A-Za-z0-9]+", "_", name)
    return collapsed.strip("_").casefold()
def compact_semantic_folder_field_name(value: Any) -> str:
    """Return the canonical field name with every separator removed."""
    canonical = canonical_semantic_folder_field_name(value)
    return re.sub(r"[^a-z0-9]+", "", canonical)
def semantic_folder_field_identity_keys(value: Any) -> frozenset[str]:
    """Return the non-empty canonical and compact spellings of a field name."""
    spellings = (
        canonical_semantic_folder_field_name(value),
        compact_semantic_folder_field_name(value),
    )
    return frozenset(filter(None, spellings))
def semantic_folder_field_identity_set(fields: Iterable[Any]) -> frozenset[str]:
    """Return the union of identity keys across all ``fields``."""
    return frozenset(
        key
        for field in fields
        for key in semantic_folder_field_identity_keys(field)
    )
# Identity keys (canonical and compact spellings) of every forbidden field,
# computed once at import time for is_semantic_folder_forbidden_field.
SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES = semantic_folder_field_identity_set(
SEMANTIC_FOLDER_FORBIDDEN_FIELDS
)
def is_semantic_folder_forbidden_field(value: Any) -> bool:
    """Report whether any spelling of ``value`` matches a forbidden field."""
    identity_keys = semantic_folder_field_identity_keys(value)
    return not identity_keys.isdisjoint(SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES)
def semantic_folder_allowed_extension_fields(fields: Iterable[Any]) -> set[str]:
    """Return the canonical names of the extension fields that are permitted.

    Fields with an empty canonical name and forbidden fields are dropped.
    """
    named_fields = (
        (canonical_semantic_folder_field_name(field), field) for field in fields
    )
    return {
        name
        for name, field in named_fields
        if name and not is_semantic_folder_forbidden_field(field)
    }

View file

@ -308,7 +308,7 @@ def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp
assert filtered["data"][0]["summary"] == "summary for doc_10"
def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
def test_browse_scopes_channel_candidates_before_candidate_limit(tmp_path):
import json
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
@ -738,20 +738,6 @@ def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_
assert "semantic-grep" not in rendered
def test_semantic_search_scope_filters_explicit_source_type_facets():
    """Scope folder paths map to source_type filters only for explicit facets."""
    from pageindex.filesystem import PageIndexFileSystem

    expectations = [
        ("/source_type=google-drive", {"source_type": "google_drive"}),
        ("/semantic/source_type=google-drive", {"source_type": "google_drive"}),
        ("/documents", {}),
    ]
    for folder_path, expected in expectations:
        actual = PageIndexFileSystem._semantic_filters_for_scope(
            {"folder_path": folder_path}
        )
        assert actual == expected
def test_grep_source_file_requires_terms_on_same_line(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
@ -904,7 +890,7 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
filesystem.configure_existing_projection_retrieval()
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
def test_browse_semantic_files_uses_summary_projection_when_only_summary_available(tmp_path):
from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
@ -961,9 +947,14 @@ def test_default_semantic_search_uses_summary_projection_when_only_summary_avail
},
)
assert filesystem.search("purchase order exposure", semantic=False) == []
assert filesystem.search("purchase order exposure") == []
results = filesystem.search("purchase order exposure", semantic=True)
results = filesystem.browse_semantic_files(
"/documents",
"purchase order exposure",
recursive=True,
page_size=5,
)
assert [result.external_id for result in results] == ["doc_summary_only"]
assert results[0].snippet == "summary_vector rank=1"
assert [item["external_id"] for item in results["data"]] == ["doc_summary_only"]
assert results["data"][0]["snippet"] == "summary_vector rank=1"

View file

@ -158,13 +158,13 @@ def test_add_configures_semantic_retrieval_in_same_filesystem_instance(tmp_path)
filesystem.add_file(source, "/documents")
assert filesystem.semantic_retrieval_channels() == ("summary",)
results = filesystem.search_semantic_channel(
"summary",
results = filesystem.browse_semantic_files(
"/documents",
"semantic recall",
scope={"folder_path": "/documents", "recursive": True},
limit=5,
recursive=True,
page_size=5,
)
assert [result.source_path for result in results] == ["documents/semantic.txt"]
assert [item["source_path"] for item in results["data"]] == ["documents/semantic.txt"]
def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monkeypatch):

View file

@ -263,7 +263,7 @@ class PIFSAgentStreamTest(unittest.TestCase):
):
self.assertNotIn(old_command, prompt_surface)
def test_demo_prompt_uses_browse_strategy_and_not_legacy_semantic_search(self):
def test_demo_prompt_uses_browse_strategy_and_not_old_vector_commands(self):
demo_prompt = load_demo_agent_prompt()
self.assertIn("Start with ls or tree", demo_prompt)

View file

@ -46,7 +46,6 @@ def test_descendant_folder_filter_treats_underscore_literally(tmp_path):
folder_id = filesystem.folder_info("/proj_1")["folder_id"]
scoped_results = filesystem.search(
scope={"folder_id": folder_id, "recursive": True},
semantic=False,
limit=10,
)
ranked_folders = {
@ -102,12 +101,10 @@ def test_metadata_contains_treats_percent_and_underscore_literally(tmp_path):
percent_results = filesystem.search(
metadata_filter={"status": {"$contains": "100% done"}},
semantic=False,
limit=10,
)
underscore_results = filesystem.search(
metadata_filter={"status": {"$contains": "build_alpha"}},
semantic=False,
limit=10,
)