mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
refactor(pifs): remove dead semantic retrieval code (#33)
This commit is contained in:
parent
d3034fa1b9
commit
0f71da3bc1
8 changed files with 24 additions and 691 deletions
|
|
@ -18,7 +18,7 @@ class PIFSCommandError(ValueError):
|
|||
class PIFSCommandExecutor:
|
||||
FORBIDDEN_SUBSTRINGS = (";", "`", "$(", "||", "\n", "\r")
|
||||
FORBIDDEN_TOKENS = {"|", ">", "<", ">>", "<<", "&"}
|
||||
BASE_ALLOWED_COMMANDS = {
|
||||
COMMAND_NAMES = {
|
||||
"ls",
|
||||
"tree",
|
||||
"find",
|
||||
|
|
@ -30,9 +30,7 @@ class PIFSCommandExecutor:
|
|||
"tail",
|
||||
"sed",
|
||||
}
|
||||
ALLOWED_COMMANDS = BASE_ALLOWED_COMMANDS
|
||||
ALLOWED_PIPE_FILTERS = {"head", "tail", "grep", "sed"}
|
||||
COMMAND_METHODS = {}
|
||||
MAX_CHAINED_COMMANDS = 3
|
||||
MAX_PIPE_COMMANDS = 3
|
||||
MAX_LS_LIMIT = 100
|
||||
|
|
@ -65,7 +63,7 @@ class PIFSCommandExecutor:
|
|||
self.query_context = query_context
|
||||
|
||||
def allowed_commands(self) -> set[str]:
|
||||
return set(self.BASE_ALLOWED_COMMANDS)
|
||||
return set(self.COMMAND_NAMES)
|
||||
|
||||
def command_capabilities(self) -> dict[str, Any]:
|
||||
return {
|
||||
|
|
@ -149,8 +147,7 @@ class PIFSCommandExecutor:
|
|||
name = tokens[0]
|
||||
if name not in self.allowed_commands():
|
||||
raise PIFSCommandError(f"Unsupported command: {name}")
|
||||
method_name = self.COMMAND_METHODS.get(name, f"_cmd_{name}")
|
||||
data = getattr(self, method_name)(tokens[1:])
|
||||
data = getattr(self, f"_cmd_{name}")(tokens[1:])
|
||||
return self._render(data, json_output=json_output, command_name=name)
|
||||
|
||||
def _execute_pipe_filter(self, input_text: str, command: str) -> str:
|
||||
|
|
@ -375,7 +372,6 @@ class PIFSCommandExecutor:
|
|||
scope=scope,
|
||||
metadata_filter=where,
|
||||
limit=limit,
|
||||
semantic=False,
|
||||
)
|
||||
|
||||
def _cmd_grep(self, args: list[str]) -> Any:
|
||||
|
|
@ -423,7 +419,6 @@ class PIFSCommandExecutor:
|
|||
scope={"folder_path": normalized, "recursive": False},
|
||||
metadata_filter=where,
|
||||
limit=limit,
|
||||
semantic=False,
|
||||
)
|
||||
if direct_results:
|
||||
return {
|
||||
|
|
@ -471,7 +466,6 @@ class PIFSCommandExecutor:
|
|||
scope={"folder_path": normalized, "recursive": recursive},
|
||||
metadata_filter=where,
|
||||
limit=limit,
|
||||
semantic=False,
|
||||
)
|
||||
if not results and where is None:
|
||||
source_hits = self._grep_source_file_hits(normalized, query, limit=limit)
|
||||
|
|
@ -1240,23 +1234,6 @@ class PIFSCommandExecutor:
|
|||
return f"{folder}/{title}" if folder else f"/{title}"
|
||||
return str(item.get("source_path") or item.get("external_id") or file_ref or "-")
|
||||
|
||||
def _stable_file_target_path(self, item: dict[str, Any]) -> str:
|
||||
file_ref = str(item.get("file_ref") or "").strip()
|
||||
source_path = str(item.get("source_path") or "").strip()
|
||||
if source_path:
|
||||
target = "/" + source_path.strip("/")
|
||||
try:
|
||||
if not file_ref or self.filesystem.store.resolve_file_ref(target) == file_ref:
|
||||
return target
|
||||
except KeyError:
|
||||
pass
|
||||
external_id = str(item.get("external_id") or "").strip()
|
||||
if external_id:
|
||||
return external_id
|
||||
if file_ref:
|
||||
return file_ref
|
||||
return str(item.get("external_id") or item.get("file_ref") or "-")
|
||||
|
||||
def _semantic_retrieval_query(self, query: str) -> str:
|
||||
query = str(query or "").strip()
|
||||
context = str(self.query_context or "").strip()
|
||||
|
|
@ -1326,7 +1303,6 @@ class PIFSCommandExecutor:
|
|||
scope={"folder_path": child["path"], "recursive": True},
|
||||
metadata_filter=metadata_filter,
|
||||
limit=max(limit, 50),
|
||||
semantic=False,
|
||||
)
|
||||
if not results:
|
||||
continue
|
||||
|
|
|
|||
|
|
@ -17,14 +17,6 @@ from .metadata_generation import (
|
|||
MetadataGenerator,
|
||||
)
|
||||
from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS
|
||||
from .semantic_folder_policy import (
|
||||
SEMANTIC_FOLDER_BASE_FIELDS,
|
||||
SEMANTIC_FOLDER_ROOT,
|
||||
SEMANTIC_FOLDER_SYSTEM_FIELDS,
|
||||
canonical_semantic_folder_field_name,
|
||||
is_semantic_folder_forbidden_field,
|
||||
semantic_folder_allowed_extension_fields,
|
||||
)
|
||||
from .store import (
|
||||
SQLiteFileSystemStore,
|
||||
fingerprint,
|
||||
|
|
@ -571,8 +563,7 @@ class PageIndexFileSystem:
|
|||
)
|
||||
offset = (page - 1) * page_size
|
||||
needed = offset + page_size + 1
|
||||
semantic_filters = self._semantic_filters_for_scope(scope)
|
||||
semantic_filters["file_ref"] = scope_file_refs
|
||||
semantic_filters = {"file_ref": scope_file_refs}
|
||||
candidates = (
|
||||
search_channel(
|
||||
space,
|
||||
|
|
@ -695,95 +686,14 @@ class PageIndexFileSystem:
|
|||
def attach_files_to_folders(self, items: list[dict[str, Any]]) -> None:
|
||||
self.store.attach_files_to_folders(items)
|
||||
|
||||
def apply_semantic_folder_projection(
|
||||
self,
|
||||
projection_plan: dict[str, Any],
|
||||
*,
|
||||
file_ref_by_document_id: Optional[dict[str, str]] = None,
|
||||
) -> dict[str, Any]:
|
||||
"""Attach registered files to a Semantic Folder Projection.
|
||||
|
||||
Registration remains the explicit folder placement step. This method is
|
||||
the separate product API for adding derived `/semantic/...` memberships.
|
||||
"""
|
||||
folders = list(projection_plan.get("folders") or [])
|
||||
memberships = list(projection_plan.get("memberships") or [])
|
||||
policy_raw = projection_plan.get("policy")
|
||||
policy = policy_raw if isinstance(policy_raw, dict) else {}
|
||||
allowed_extension_fields = semantic_folder_allowed_extension_fields(
|
||||
policy.get("allowed_extension_fields", [])
|
||||
)
|
||||
for folder in folders:
|
||||
self._validate_semantic_folder_projection_item(folder, allowed_extension_fields)
|
||||
for membership in memberships:
|
||||
self._validate_semantic_folder_projection_item(membership, allowed_extension_fields)
|
||||
|
||||
for folder in folders:
|
||||
folder_metadata = folder.get("metadata")
|
||||
self.create_folder(
|
||||
self._validate_semantic_folder_projection_path(str(folder["path"])),
|
||||
kind=str(folder.get("kind") or "semantic_projection"),
|
||||
description=str(folder.get("description") or ""),
|
||||
metadata=folder_metadata if isinstance(folder_metadata, dict) else {},
|
||||
)
|
||||
|
||||
items: list[dict[str, Any]] = []
|
||||
file_ref_by_document_id = file_ref_by_document_id or {}
|
||||
for membership in memberships:
|
||||
document_id = self._semantic_folder_projection_document_id(membership)
|
||||
file_ref = file_ref_by_document_id.get(document_id)
|
||||
if not file_ref:
|
||||
file_ref = self.store.resolve_file_ref(document_id)
|
||||
metadata = (
|
||||
dict(membership.get("folder_metadata"))
|
||||
if isinstance(membership.get("folder_metadata"), dict)
|
||||
else {}
|
||||
)
|
||||
metadata.update(
|
||||
{
|
||||
"projection": "Semantic Folder Projection",
|
||||
"field": membership.get("field", ""),
|
||||
"value": membership.get("value", ""),
|
||||
"mount_kind": membership.get(
|
||||
"mount_kind",
|
||||
"semantic_folder_projection",
|
||||
),
|
||||
}
|
||||
)
|
||||
items.append(
|
||||
{
|
||||
"file_ref": file_ref,
|
||||
"folder": self._validate_semantic_folder_projection_path(
|
||||
str(membership["folder_path"])
|
||||
),
|
||||
"metadata": metadata,
|
||||
}
|
||||
)
|
||||
self.attach_files_to_folders(items)
|
||||
return {
|
||||
"projection": "Semantic Folder Projection",
|
||||
"folders_applied": len(folders),
|
||||
"memberships_attached": len(items),
|
||||
}
|
||||
|
||||
def search(
|
||||
self,
|
||||
query: Union[str, list[str], None] = None,
|
||||
scope: Optional[dict[str, Any]] = None,
|
||||
metadata_filter: Optional[dict[str, Any] | str] = None,
|
||||
limit: int = 10,
|
||||
semantic: bool = True,
|
||||
) -> list[SearchResult]:
|
||||
parsed_filter = self.metadata.parse_filter(metadata_filter)
|
||||
if semantic and self._should_use_semantic_retrieval(query, scope):
|
||||
semantic_results = self._semantic_search(
|
||||
query,
|
||||
scope=scope,
|
||||
metadata_filter=parsed_filter,
|
||||
limit=limit,
|
||||
)
|
||||
if semantic_results:
|
||||
return semantic_results
|
||||
rows = self.store.search_files(
|
||||
query,
|
||||
scope=scope,
|
||||
|
|
@ -821,30 +731,6 @@ class PageIndexFileSystem:
|
|||
)
|
||||
return results
|
||||
|
||||
def search_semantic_channel(
|
||||
self,
|
||||
channel: str,
|
||||
query: Union[str, list[str], None],
|
||||
*,
|
||||
scope: Optional[dict[str, Any]] = None,
|
||||
metadata_filter: Optional[dict[str, Any] | str] = None,
|
||||
limit: int = 10,
|
||||
) -> list[SearchResult]:
|
||||
parsed_filter = self.metadata.parse_filter(metadata_filter)
|
||||
if (
|
||||
self.semantic_retrieval_backend is None
|
||||
or not self.has_semantic_channel(channel)
|
||||
or not self._query_text(query)
|
||||
):
|
||||
return []
|
||||
return self._semantic_search(
|
||||
query,
|
||||
scope=scope,
|
||||
metadata_filter=parsed_filter,
|
||||
limit=limit,
|
||||
channel=channel,
|
||||
)
|
||||
|
||||
def configure_hybrid_projection_retrieval(
|
||||
self,
|
||||
index_dir: Union[str, Path],
|
||||
|
|
@ -853,7 +739,6 @@ class PageIndexFileSystem:
|
|||
embedding_model: str = "text-embedding-3-small",
|
||||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_timeout: float = 60,
|
||||
per_channel_limit: int = 100,
|
||||
fetch_multiplier: int = 100,
|
||||
) -> Any:
|
||||
from .hybrid_projection import HybridProjectionSearchBackend
|
||||
|
|
@ -864,7 +749,6 @@ class PageIndexFileSystem:
|
|||
embedding_model=embedding_model,
|
||||
embedding_dimensions=embedding_dimensions,
|
||||
embedding_timeout=embedding_timeout,
|
||||
per_channel_limit=per_channel_limit,
|
||||
fetch_multiplier=fetch_multiplier,
|
||||
)
|
||||
return self.semantic_retrieval_backend
|
||||
|
|
@ -905,30 +789,6 @@ class PageIndexFileSystem:
|
|||
},
|
||||
}
|
||||
|
||||
def find(
|
||||
self,
|
||||
target: str,
|
||||
patterns: Union[str, list[str]],
|
||||
limit: int = 20,
|
||||
) -> list[OpenResult]:
|
||||
file_ref = self._resolve_target(target)
|
||||
patterns = [patterns] if isinstance(patterns, str) else list(patterns)
|
||||
lowered_patterns = [pattern.lower() for pattern in patterns if pattern]
|
||||
if not lowered_patterns:
|
||||
return []
|
||||
text = self.store.read_text(file_ref)
|
||||
lines = text.splitlines()
|
||||
matches = []
|
||||
for i, line in enumerate(lines, 1):
|
||||
haystack = line.lower()
|
||||
if any(pattern in haystack for pattern in lowered_patterns):
|
||||
start = max(1, i - 1)
|
||||
end = min(len(lines), i + 1)
|
||||
matches.append(self._open_lines(file_ref, start, end))
|
||||
if len(matches) >= limit:
|
||||
break
|
||||
return matches
|
||||
|
||||
def open(self, target: str, location: str = "all") -> OpenResult:
|
||||
file_ref = self._resolve_target(target)
|
||||
entry = self.store.get_file(file_ref)
|
||||
|
|
@ -1387,15 +1247,6 @@ class PageIndexFileSystem:
|
|||
metadata = file.get("metadata") or {}
|
||||
if not isinstance(metadata, dict):
|
||||
raise ValueError("metadata must be a JSON object")
|
||||
legacy_value_key = "derived_" + "metadata"
|
||||
legacy_policy_key = "metadata_" + "generation_policy"
|
||||
legacy_status_key = "metadata_" + "generation_status"
|
||||
if legacy_value_key in file:
|
||||
raise ValueError("legacy generated metadata map has been removed; put values in metadata")
|
||||
if legacy_policy_key in file:
|
||||
raise ValueError("legacy metadata policy key has been renamed to metadata_policy")
|
||||
if legacy_status_key in file:
|
||||
raise ValueError("legacy metadata status key has been renamed to metadata_status")
|
||||
self._validate_register_metadata(metadata)
|
||||
external_id = file.get("external_id")
|
||||
content = file.get("content") or ""
|
||||
|
|
@ -1946,93 +1797,6 @@ class PageIndexFileSystem:
|
|||
def _resolve_target(self, target: str) -> str:
|
||||
return self.store.resolve_file_ref(target)
|
||||
|
||||
def _should_use_semantic_retrieval(
|
||||
self,
|
||||
query: Union[str, list[str], None],
|
||||
scope: Optional[dict[str, Any]],
|
||||
) -> bool:
|
||||
if self.semantic_retrieval_backend is None:
|
||||
return False
|
||||
if not self._query_text(query):
|
||||
return False
|
||||
if not scope:
|
||||
return True
|
||||
return bool(scope.get("recursive", True))
|
||||
|
||||
def _semantic_search(
|
||||
self,
|
||||
query: Union[str, list[str], None],
|
||||
*,
|
||||
scope: Optional[dict[str, Any]],
|
||||
metadata_filter: Optional[dict[str, Any]],
|
||||
limit: int,
|
||||
channel: str | None = None,
|
||||
) -> list[SearchResult]:
|
||||
if self.semantic_retrieval_backend is None:
|
||||
return []
|
||||
filters = self._semantic_filters_for_scope(scope)
|
||||
fetch_limit = max(limit * 10, 50)
|
||||
query_text = self._query_text(query)
|
||||
if channel:
|
||||
search_channel = getattr(self.semantic_retrieval_backend, "search_channel", None)
|
||||
if search_channel is None:
|
||||
return []
|
||||
candidates = search_channel(
|
||||
channel,
|
||||
query_text,
|
||||
limit=fetch_limit,
|
||||
filters=filters,
|
||||
)
|
||||
else:
|
||||
candidates = self.semantic_retrieval_backend.search(
|
||||
query_text,
|
||||
limit=fetch_limit,
|
||||
filters=filters,
|
||||
)
|
||||
results: list[SearchResult] = []
|
||||
seen: set[str] = set()
|
||||
scope_path = self._scope_folder_path(scope)
|
||||
for candidate in candidates:
|
||||
try:
|
||||
file_ref = self.store.resolve_file_ref(candidate.document_id)
|
||||
except KeyError:
|
||||
continue
|
||||
if file_ref in seen:
|
||||
continue
|
||||
if not self.store.file_matches(file_ref, scope=scope, metadata_filter=metadata_filter):
|
||||
continue
|
||||
seen.add(file_ref)
|
||||
entry = self.store.get_file(file_ref)
|
||||
folder_paths = [
|
||||
folder["path"]
|
||||
for folder in self.store.folder_memberships(file_ref)
|
||||
]
|
||||
folder_path = self._preferred_folder_path(folder_paths, scope_path, entry.folder_path)
|
||||
results.append(
|
||||
SearchResult(
|
||||
file_ref=file_ref,
|
||||
external_id=entry.external_id,
|
||||
title=entry.title,
|
||||
snippet=candidate.snippet or entry.descriptor,
|
||||
folder_path=folder_path,
|
||||
folder_paths=folder_paths,
|
||||
metadata=entry.metadata,
|
||||
metadata_status=entry.metadata_status,
|
||||
source_path=entry.source_path,
|
||||
id=entry.external_id or file_ref,
|
||||
document_id=entry.external_id,
|
||||
name=entry.title,
|
||||
description=entry.descriptor,
|
||||
status=entry.pageindex_tree_status,
|
||||
pageNum=None,
|
||||
createdAt=None,
|
||||
folderId=None,
|
||||
)
|
||||
)
|
||||
if len(results) >= limit:
|
||||
break
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def _semantic_candidate_score(candidate: Any) -> float | None:
|
||||
try:
|
||||
|
|
@ -2348,135 +2112,6 @@ class PageIndexFileSystem:
|
|||
path = scope.get("folder_path") or scope.get("path")
|
||||
return normalize_path(path) if path else None
|
||||
|
||||
@classmethod
|
||||
def _semantic_filters_for_scope(cls, scope: Optional[dict[str, Any]]) -> dict[str, Any]:
|
||||
path = cls._scope_folder_path(scope)
|
||||
if not path or path == "/":
|
||||
return {}
|
||||
source_type = cls._source_type_filter_from_path(path)
|
||||
return {"source_type": source_type} if source_type else {}
|
||||
|
||||
@staticmethod
|
||||
def _source_type_filter_from_path(path: str) -> str:
|
||||
segments = [segment for segment in path.strip("/").split("/") if segment]
|
||||
if not segments:
|
||||
return ""
|
||||
if segments[0] == SEMANTIC_FOLDER_ROOT.strip("/"):
|
||||
segments = segments[1:]
|
||||
if not segments:
|
||||
return ""
|
||||
first_segment = segments[0]
|
||||
if first_segment.startswith("source_type="):
|
||||
return first_segment.split("=", 1)[1].replace("-", "_")
|
||||
if path.startswith(f"{SEMANTIC_FOLDER_ROOT}/"):
|
||||
return ""
|
||||
return ""
|
||||
|
||||
@classmethod
|
||||
def _validate_semantic_folder_projection_item(
|
||||
cls,
|
||||
item: dict[str, Any],
|
||||
allowed_extension_fields: set[str],
|
||||
) -> None:
|
||||
path = item.get("folder_path") or item.get("path")
|
||||
if not path:
|
||||
raise ValueError("Semantic Folder Projection items must include a folder path")
|
||||
cls._validate_semantic_folder_projection_path(str(path))
|
||||
allowed_fields = (
|
||||
SEMANTIC_FOLDER_BASE_FIELDS
|
||||
| SEMANTIC_FOLDER_SYSTEM_FIELDS
|
||||
| allowed_extension_fields
|
||||
)
|
||||
if item.get("dataset_doc_uuid"):
|
||||
raise ValueError(
|
||||
"dataset_doc_uuid is not allowed in Semantic Folder Projection memberships; "
|
||||
"use file_key or file_ref"
|
||||
)
|
||||
fields = []
|
||||
explicit_field = cls._canonical_semantic_folder_field_name(item.get("field"))
|
||||
if explicit_field:
|
||||
fields.append(explicit_field)
|
||||
fields.extend(cls._semantic_folder_projection_fields_from_path(str(path)))
|
||||
for payload_key in ("metadata", "folder_metadata"):
|
||||
cls._validate_semantic_folder_projection_metadata_payload(
|
||||
item.get(payload_key),
|
||||
allowed_fields,
|
||||
)
|
||||
for field in fields:
|
||||
if is_semantic_folder_forbidden_field(field) or field not in allowed_fields:
|
||||
raise ValueError(f"Field is not allowed for Semantic Folder Projection: {field}")
|
||||
|
||||
@staticmethod
|
||||
def _validate_semantic_folder_projection_path(path: str) -> str:
|
||||
normalized = normalize_path(path)
|
||||
if normalized != SEMANTIC_FOLDER_ROOT and not normalized.startswith(
|
||||
f"{SEMANTIC_FOLDER_ROOT}/"
|
||||
):
|
||||
raise ValueError("Semantic Folder Projection paths must be under /semantic")
|
||||
return normalized
|
||||
|
||||
@classmethod
|
||||
def _semantic_folder_projection_fields_from_path(cls, path: str) -> list[str]:
|
||||
normalized = cls._validate_semantic_folder_projection_path(path)
|
||||
fields: list[str] = []
|
||||
for segment in normalized.strip("/").split("/")[1:]:
|
||||
if "=" not in segment:
|
||||
continue
|
||||
field = cls._canonical_semantic_folder_field_name(
|
||||
segment.split("=", 1)[0]
|
||||
)
|
||||
if field:
|
||||
fields.append(field)
|
||||
return fields
|
||||
|
||||
@classmethod
|
||||
def _validate_semantic_folder_projection_metadata_payload(
|
||||
cls,
|
||||
payload: Any,
|
||||
allowed_fields: set[str],
|
||||
) -> None:
|
||||
if isinstance(payload, dict):
|
||||
for key, value in payload.items():
|
||||
key_text = str(key)
|
||||
key_field = cls._canonical_semantic_folder_field_name(key)
|
||||
if is_semantic_folder_forbidden_field(key_field):
|
||||
raise ValueError(
|
||||
"Forbidden metadata field in Semantic Folder Projection payload: "
|
||||
f"{key_text}"
|
||||
)
|
||||
if key_field in {"field", "source_field", "metadata_field"}:
|
||||
field = cls._canonical_semantic_folder_field_name(value)
|
||||
if field and (
|
||||
is_semantic_folder_forbidden_field(field)
|
||||
or field not in allowed_fields
|
||||
):
|
||||
raise ValueError(
|
||||
f"Field is not allowed for Semantic Folder Projection: {field}"
|
||||
)
|
||||
cls._validate_semantic_folder_projection_metadata_payload(value, allowed_fields)
|
||||
elif isinstance(payload, list):
|
||||
for item in payload:
|
||||
cls._validate_semantic_folder_projection_metadata_payload(item, allowed_fields)
|
||||
elif isinstance(payload, str):
|
||||
field = cls._canonical_semantic_folder_field_name(payload)
|
||||
if is_semantic_folder_forbidden_field(field):
|
||||
raise ValueError(
|
||||
"Forbidden metadata field label in Semantic Folder Projection payload: "
|
||||
f"{payload}"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _canonical_semantic_folder_field_name(value: Any) -> str:
|
||||
return canonical_semantic_folder_field_name(value)
|
||||
|
||||
@staticmethod
|
||||
def _semantic_folder_projection_document_id(membership: dict[str, Any]) -> str:
|
||||
for key in ("file_key", "file_ref", "document_ref"):
|
||||
value = str(membership.get(key) or "").strip()
|
||||
if value:
|
||||
return value
|
||||
raise ValueError("Semantic Folder Projection membership is missing file_key or file_ref")
|
||||
|
||||
@staticmethod
|
||||
def _query_text(query: Union[str, list[str], None]) -> str:
|
||||
if query is None:
|
||||
|
|
|
|||
|
|
@ -15,28 +15,17 @@ from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, Semantic
|
|||
|
||||
|
||||
INDEX_BY_CHANNEL = {
|
||||
"metadata": "metadata_composite_vector",
|
||||
"summary": "summary_only_vector",
|
||||
"entity": "entity_vectors",
|
||||
"constraint": "constraint_vectors",
|
||||
"relation": "relation_vectors",
|
||||
}
|
||||
HYBRID_ENTITY_RELATION_CHANNELS = ("metadata", "entity", "constraint", "relation")
|
||||
SEMANTIC_TOOL_CHANNELS = ("summary", "entity", "relation")
|
||||
HYBRID_ENTITY_RELATION_WEIGHTS = {
|
||||
"metadata": 0.25,
|
||||
"entity": 0.25,
|
||||
"relation": 0.30,
|
||||
"constraint": 0.20,
|
||||
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class QueryProjection:
|
||||
entities: list[str]
|
||||
relations: list[str]
|
||||
constraints: list[str]
|
||||
expected_answer_type: str = ""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
|
@ -52,7 +41,7 @@ class HybridProjectionCandidate:
|
|||
|
||||
|
||||
class HybridProjectionSearchBackend:
|
||||
"""Hybrid entity/relation/vector retrieval over rebuildable projection indexes.
|
||||
"""Semantic channel retrieval over rebuildable projection indexes.
|
||||
|
||||
The SQLite catalog remains the source of truth. This backend only reads
|
||||
external sqlite-vec projection indexes and returns candidate document ids
|
||||
|
|
@ -68,7 +57,6 @@ class HybridProjectionSearchBackend:
|
|||
embedding_model: str,
|
||||
embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS,
|
||||
embedding_cache_path: str | Path | None = None,
|
||||
per_channel_limit: int = 100,
|
||||
fetch_multiplier: int = 100,
|
||||
) -> None:
|
||||
self.index_dir = Path(index_dir).expanduser()
|
||||
|
|
@ -82,7 +70,6 @@ class HybridProjectionSearchBackend:
|
|||
if embedding_cache_path is not None
|
||||
else self.index_dir / "embedding_cache.sqlite"
|
||||
)
|
||||
self.per_channel_limit = per_channel_limit
|
||||
self.fetch_multiplier = fetch_multiplier
|
||||
self.indexes = {
|
||||
channel: SQLiteVecSemanticIndex(self.index_dir / f"{index_name}.sqlite")
|
||||
|
|
@ -114,35 +101,6 @@ class HybridProjectionSearchBackend:
|
|||
**kwargs,
|
||||
)
|
||||
|
||||
def search(
|
||||
self,
|
||||
query: str,
|
||||
*,
|
||||
limit: int = 10,
|
||||
filters: dict[str, Any] | None = None,
|
||||
) -> list[HybridProjectionCandidate]:
|
||||
query = normalize_text(query)
|
||||
if not query:
|
||||
return []
|
||||
projection = heuristic_query_projection(query)
|
||||
channels = tuple(
|
||||
channel
|
||||
for channel in HYBRID_ENTITY_RELATION_CHANNELS
|
||||
if self._channel_document_count(channel) > 0
|
||||
)
|
||||
if not channels:
|
||||
if self._channel_document_count("summary") > 0:
|
||||
return self.search_channel("summary", query, limit=limit, filters=filters)
|
||||
return []
|
||||
channel_hits = self._search_channels(
|
||||
query=query,
|
||||
projection=projection,
|
||||
limit=max(limit, self.per_channel_limit),
|
||||
filters=filters,
|
||||
channels=channels,
|
||||
)
|
||||
return aggregate_hybrid_entity_relation(channel_hits, projection)[:limit]
|
||||
|
||||
def search_channel(
|
||||
self,
|
||||
channel: str,
|
||||
|
|
@ -187,7 +145,7 @@ class HybridProjectionSearchBackend:
|
|||
"embedding_provider": self.embedding_provider,
|
||||
"embedding_model": self.embedding_model,
|
||||
"embedding_dimensions": self.embedding_dimensions,
|
||||
"strategy": "hybrid_entity_relation_vector",
|
||||
"strategy": "semantic_channel_vector",
|
||||
"available_channels": list(self.available_channels()),
|
||||
"channels": {
|
||||
channel: self._safe_channel_info(channel)
|
||||
|
|
@ -221,36 +179,6 @@ class HybridProjectionSearchBackend:
|
|||
}
|
||||
return {**info, "available": int(info.get("document_count") or 0) > 0}
|
||||
|
||||
def _search_channels(
|
||||
self,
|
||||
*,
|
||||
query: str,
|
||||
projection: QueryProjection,
|
||||
limit: int,
|
||||
filters: dict[str, Any] | None,
|
||||
channels: tuple[str, ...],
|
||||
) -> dict[str, list[SemanticSearchResult]]:
|
||||
query_texts = {
|
||||
channel: query_text_for_channel(channel, query, projection)
|
||||
for channel in channels
|
||||
}
|
||||
vectors = self.embedding_cache.embed_texts(
|
||||
[query_texts[channel] for channel in channels],
|
||||
provider=self.embedding_provider,
|
||||
model=self.cache_model,
|
||||
embedder=self.embedder,
|
||||
batch_size=1,
|
||||
)
|
||||
return {
|
||||
channel: self.indexes[channel].search(
|
||||
vector,
|
||||
limit=limit,
|
||||
filters=filters,
|
||||
fetch_multiplier=self.fetch_multiplier,
|
||||
)
|
||||
for channel, vector in zip(channels, vectors)
|
||||
}
|
||||
|
||||
|
||||
class EmbeddingCache:
|
||||
def __init__(self, db_path: Path):
|
||||
|
|
@ -368,12 +296,10 @@ def make_embedder(provider: str, model: str, *, dimensions: int, timeout: float)
|
|||
|
||||
|
||||
def query_text_for_channel(channel: str, query: str, projection: QueryProjection) -> str:
|
||||
if channel in {"metadata", "summary"}:
|
||||
if channel == "summary":
|
||||
return query
|
||||
if channel == "entity":
|
||||
return compact_join(projection.entities, limit=24) or query
|
||||
if channel == "constraint":
|
||||
return compact_join(projection.constraints, limit=24) or query
|
||||
if channel == "relation":
|
||||
return "\n".join(projection.relations) or query
|
||||
raise ValueError(f"unknown semantic channel: {channel}")
|
||||
|
|
@ -405,87 +331,6 @@ def rank_single_semantic_channel(
|
|||
return rows
|
||||
|
||||
|
||||
def aggregate_hybrid_entity_relation(
|
||||
channel_hits: dict[str, list[SemanticSearchResult]],
|
||||
projection: QueryProjection,
|
||||
) -> list[HybridProjectionCandidate]:
|
||||
by_doc: dict[str, dict[str, Any]] = {}
|
||||
for channel, results in channel_hits.items():
|
||||
weight = HYBRID_ENTITY_RELATION_WEIGHTS[channel]
|
||||
seen_in_channel = set()
|
||||
for rank, result in enumerate(results, 1):
|
||||
doc_id = str(result.external_id or result.file_ref)
|
||||
if doc_id in seen_in_channel:
|
||||
continue
|
||||
seen_in_channel.add(doc_id)
|
||||
item = by_doc.setdefault(
|
||||
doc_id,
|
||||
{
|
||||
"document_id": doc_id,
|
||||
"score": 0.0,
|
||||
"sources": [],
|
||||
"source_type": result.source_type,
|
||||
"source_path": result.source_path,
|
||||
"title": result.title,
|
||||
"metadata": result.metadata,
|
||||
},
|
||||
)
|
||||
item["score"] += weight * (1 / (60 + rank))
|
||||
item["sources"].append({"channel": channel, "rank": rank, "distance": result.distance})
|
||||
candidates = []
|
||||
for item in by_doc.values():
|
||||
item["score"] += exact_match_bonus(item, projection)
|
||||
candidates.append(
|
||||
HybridProjectionCandidate(
|
||||
document_id=item["document_id"],
|
||||
score=float(item["score"]),
|
||||
sources=item["sources"],
|
||||
source_type=item["source_type"],
|
||||
source_path=item["source_path"],
|
||||
title=item["title"],
|
||||
metadata=item["metadata"],
|
||||
snippet=hybrid_snippet(item),
|
||||
)
|
||||
)
|
||||
return sorted(
|
||||
candidates,
|
||||
key=lambda item: (
|
||||
-item.score,
|
||||
min(source["rank"] for source in item.sources),
|
||||
item.document_id,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def exact_match_bonus(item: dict[str, Any], projection: QueryProjection) -> float:
|
||||
haystack = json.dumps(
|
||||
{
|
||||
"title": item.get("title", ""),
|
||||
"source_path": item.get("source_path", ""),
|
||||
"metadata": item.get("metadata", {}),
|
||||
},
|
||||
ensure_ascii=False,
|
||||
).lower()
|
||||
terms = [*projection.entities[:8], *projection.constraints[:6]]
|
||||
matched = 0
|
||||
for term in terms:
|
||||
normalized = str(term).lower().strip()
|
||||
if len(normalized) >= 3 and normalized in haystack:
|
||||
matched += 1
|
||||
return min(0.02, matched * 0.004)
|
||||
|
||||
|
||||
def hybrid_snippet(item: dict[str, Any]) -> str:
|
||||
channels = ", ".join(
|
||||
f"{source['channel']}@{source['rank']}" for source in item.get("sources", [])[:4]
|
||||
)
|
||||
topic = str((item.get("metadata") or {}).get("topic") or "").strip()
|
||||
parts = [f"hybrid_entity_relation_vector {channels}"]
|
||||
if topic:
|
||||
parts.append(f"topic: {topic}")
|
||||
return "; ".join(parts)
|
||||
|
||||
|
||||
def heuristic_query_projection(question: str) -> QueryProjection:
|
||||
entities = dedupe(
|
||||
[
|
||||
|
|
@ -493,19 +338,11 @@ def heuristic_query_projection(question: str) -> QueryProjection:
|
|||
*keyword_terms(question)[:16],
|
||||
]
|
||||
)[:16]
|
||||
constraints = dedupe(
|
||||
[
|
||||
*extract_constraint_terms(question),
|
||||
*numeric_terms(question),
|
||||
]
|
||||
)[:12]
|
||||
predicate = infer_query_predicate(question)
|
||||
subject = entities[0] if entities else "question"
|
||||
return QueryProjection(
|
||||
entities=entities,
|
||||
relations=[f"{subject} | {predicate} | {question}"],
|
||||
constraints=constraints,
|
||||
expected_answer_type=infer_answer_type(question),
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -554,24 +391,6 @@ def keyword_terms(text: str) -> list[str]:
|
|||
return dedupe(terms)
|
||||
|
||||
|
||||
def extract_constraint_terms(text: str) -> list[str]:
|
||||
constraints = []
|
||||
for pattern in [
|
||||
r"\b(?:must|should|required|requires?|default(?:s)?|limit(?:s)?|maximum|minimum)\b[^.!?\n]{0,120}",
|
||||
r"\b[A-Za-z_][A-Za-z0-9_]{2,}\s*(?:=|:)\s*[A-Za-z0-9_.:/-]+",
|
||||
]:
|
||||
constraints.extend(match.strip() for match in re.findall(pattern, text, flags=re.IGNORECASE))
|
||||
return dedupe(constraints)
|
||||
|
||||
|
||||
def numeric_terms(text: str) -> list[str]:
|
||||
return re.findall(
|
||||
r"\b\d+(?:\.\d+)?\s*(?:MiB|GiB|MB|GB|ms|sec|seconds|minutes|hours|days|%|tokens?|req/s|rps)\b",
|
||||
text,
|
||||
flags=re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def infer_query_predicate(question: str) -> str:
|
||||
lowered = question.lower()
|
||||
rules = [
|
||||
|
|
@ -589,19 +408,6 @@ def infer_query_predicate(question: str) -> str:
|
|||
return "asks_about"
|
||||
|
||||
|
||||
def infer_answer_type(question: str) -> str:
|
||||
lowered = question.lower()
|
||||
if "how many" in lowered or "limit" in lowered or "size" in lowered:
|
||||
return "number_or_limit"
|
||||
if lowered.startswith("who"):
|
||||
return "person_or_team"
|
||||
if lowered.startswith("when"):
|
||||
return "date_or_time"
|
||||
if "why" in lowered or "caused" in lowered:
|
||||
return "cause"
|
||||
return "fact"
|
||||
|
||||
|
||||
def dedupe(values: Any) -> list[str]:
|
||||
seen = set()
|
||||
result = []
|
||||
|
|
|
|||
|
|
@ -1,72 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any, Iterable
|
||||
|
||||
|
||||
SEMANTIC_FOLDER_ROOT = "/semantic"
|
||||
SEMANTIC_FOLDER_BASE_FIELDS = {"doc_type", "domain", "topic"}
|
||||
SEMANTIC_FOLDER_SYSTEM_FIELDS = {"source_type"}
|
||||
SEMANTIC_FOLDER_FORBIDDEN_FIELDS = {
|
||||
"summary",
|
||||
"entities",
|
||||
"relations",
|
||||
"constraints",
|
||||
"retrieval_cues",
|
||||
"dataset_doc_uuid",
|
||||
"path",
|
||||
"uri",
|
||||
"source_path",
|
||||
"storage_uri",
|
||||
"title",
|
||||
"content_type",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
}
|
||||
|
||||
|
||||
def canonical_semantic_folder_field_name(value: Any) -> str:
|
||||
text = str(value or "").strip()
|
||||
if not text:
|
||||
return ""
|
||||
text = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", text)
|
||||
text = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", text)
|
||||
return re.sub(r"[^A-Za-z0-9]+", "_", text).strip("_").casefold()
|
||||
|
||||
|
||||
def compact_semantic_folder_field_name(value: Any) -> str:
    """Return the canonical field name with all separators removed.

    The canonical form is computed first, then everything outside ``a-z`` and
    ``0-9`` is dropped.
    """
    canonical = canonical_semantic_folder_field_name(value)
    return re.sub(r"[^a-z0-9]+", "", canonical)
|
||||
|
||||
|
||||
def semantic_folder_field_identity_keys(value: Any) -> frozenset[str]:
    """Return the non-empty spellings that identify a field name.

    The keys are the canonical snake_case form and the compact form; empty
    strings are left out, so a blank *value* yields an empty frozenset.
    """
    spellings = (
        canonical_semantic_folder_field_name(value),
        compact_semantic_folder_field_name(value),
    )
    return frozenset(filter(None, spellings))
|
||||
|
||||
|
||||
def semantic_folder_field_identity_set(fields: Iterable[Any]) -> frozenset[str]:
    """Return the union of identity keys for every entry in *fields*."""
    return frozenset(
        identity
        for entry in fields
        for identity in semantic_folder_field_identity_keys(entry)
    )
|
||||
|
||||
|
||||
# Identity keys of every forbidden field, computed once at import time so the
# membership check below does not re-normalise the forbidden list per call.
SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES = semantic_folder_field_identity_set(
    SEMANTIC_FOLDER_FORBIDDEN_FIELDS
)


def is_semantic_folder_forbidden_field(value: Any) -> bool:
    """Return True when any identity key of *value* is a forbidden identity."""
    candidate_keys = semantic_folder_field_identity_keys(value)
    return not candidate_keys.isdisjoint(SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES)
|
||||
|
||||
|
||||
def semantic_folder_allowed_extension_fields(fields: Iterable[Any]) -> set[str]:
    """Return the canonical names of the entries in *fields* that are allowed.

    Entries whose canonical name is empty and entries that match a forbidden
    field are dropped; duplicates collapse because the result is a set.
    """
    permitted: set[str] = set()
    for candidate in fields:
        canonical = canonical_semantic_folder_field_name(candidate)
        if not canonical:
            continue
        if is_semantic_folder_forbidden_field(candidate):
            continue
        permitted.add(canonical)
    return permitted
|
||||
|
|
@ -308,7 +308,7 @@ def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp
|
|||
assert filtered["data"][0]["summary"] == "summary for doc_10"
|
||||
|
||||
|
||||
def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path):
|
||||
def test_browse_scopes_channel_candidates_before_candidate_limit(tmp_path):
|
||||
import json
|
||||
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
|
|
@ -738,20 +738,6 @@ def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_
|
|||
assert "semantic-grep" not in rendered
|
||||
|
||||
|
||||
def test_semantic_search_scope_filters_explicit_source_type_facets():
    """Check the filters derived from a scope's ``folder_path``.

    Both ``source_type=`` paths are expected to yield a ``source_type`` filter
    with the value ``google_drive``; the plain ``/documents`` path is expected
    to yield no filters.
    """
    from pageindex.filesystem import PageIndexFileSystem

    expected_filters_by_folder = {
        "/source_type=google-drive": {"source_type": "google_drive"},
        "/semantic/source_type=google-drive": {"source_type": "google_drive"},
        "/documents": {},
    }
    for folder_path, expected_filters in expected_filters_by_folder.items():
        scope = {"folder_path": folder_path}
        assert PageIndexFileSystem._semantic_filters_for_scope(scope) == expected_filters
|
||||
|
||||
|
||||
def test_grep_source_file_requires_terms_on_same_line(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
|
||||
|
|
@ -904,7 +890,7 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
|
|||
filesystem.configure_existing_projection_retrieval()
|
||||
|
||||
|
||||
def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path):
|
||||
def test_browse_semantic_files_uses_summary_projection_when_only_summary_available(tmp_path):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
|
@ -961,9 +947,14 @@ def test_default_semantic_search_uses_summary_projection_when_only_summary_avail
|
|||
},
|
||||
)
|
||||
|
||||
assert filesystem.search("purchase order exposure", semantic=False) == []
|
||||
assert filesystem.search("purchase order exposure") == []
|
||||
|
||||
results = filesystem.search("purchase order exposure", semantic=True)
|
||||
results = filesystem.browse_semantic_files(
|
||||
"/documents",
|
||||
"purchase order exposure",
|
||||
recursive=True,
|
||||
page_size=5,
|
||||
)
|
||||
|
||||
assert [result.external_id for result in results] == ["doc_summary_only"]
|
||||
assert results[0].snippet == "summary_vector rank=1"
|
||||
assert [item["external_id"] for item in results["data"]] == ["doc_summary_only"]
|
||||
assert results["data"][0]["snippet"] == "summary_vector rank=1"
|
||||
|
|
|
|||
|
|
@ -158,13 +158,13 @@ def test_add_configures_semantic_retrieval_in_same_filesystem_instance(tmp_path)
|
|||
filesystem.add_file(source, "/documents")
|
||||
|
||||
assert filesystem.semantic_retrieval_channels() == ("summary",)
|
||||
results = filesystem.search_semantic_channel(
|
||||
"summary",
|
||||
results = filesystem.browse_semantic_files(
|
||||
"/documents",
|
||||
"semantic recall",
|
||||
scope={"folder_path": "/documents", "recursive": True},
|
||||
limit=5,
|
||||
recursive=True,
|
||||
page_size=5,
|
||||
)
|
||||
assert [result.source_path for result in results] == ["documents/semantic.txt"]
|
||||
assert [item["source_path"] for item in results["data"]] == ["documents/semantic.txt"]
|
||||
|
||||
|
||||
def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monkeypatch):
|
||||
|
|
|
|||
|
|
@ -263,7 +263,7 @@ class PIFSAgentStreamTest(unittest.TestCase):
|
|||
):
|
||||
self.assertNotIn(old_command, prompt_surface)
|
||||
|
||||
def test_demo_prompt_uses_browse_strategy_and_not_legacy_semantic_search(self):
|
||||
def test_demo_prompt_uses_browse_strategy_and_not_old_vector_commands(self):
|
||||
demo_prompt = load_demo_agent_prompt()
|
||||
|
||||
self.assertIn("Start with ls or tree", demo_prompt)
|
||||
|
|
|
|||
|
|
@ -46,7 +46,6 @@ def test_descendant_folder_filter_treats_underscore_literally(tmp_path):
|
|||
folder_id = filesystem.folder_info("/proj_1")["folder_id"]
|
||||
scoped_results = filesystem.search(
|
||||
scope={"folder_id": folder_id, "recursive": True},
|
||||
semantic=False,
|
||||
limit=10,
|
||||
)
|
||||
ranked_folders = {
|
||||
|
|
@ -102,12 +101,10 @@ def test_metadata_contains_treats_percent_and_underscore_literally(tmp_path):
|
|||
|
||||
percent_results = filesystem.search(
|
||||
metadata_filter={"status": {"$contains": "100% done"}},
|
||||
semantic=False,
|
||||
limit=10,
|
||||
)
|
||||
underscore_results = filesystem.search(
|
||||
metadata_filter={"status": {"$contains": "build_alpha"}},
|
||||
semantic=False,
|
||||
limit=10,
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue