diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py index ee6e565..3d1fbae 100644 --- a/pageindex/filesystem/commands.py +++ b/pageindex/filesystem/commands.py @@ -18,7 +18,7 @@ class PIFSCommandError(ValueError): class PIFSCommandExecutor: FORBIDDEN_SUBSTRINGS = (";", "`", "$(", "||", "\n", "\r") FORBIDDEN_TOKENS = {"|", ">", "<", ">>", "<<", "&"} - BASE_ALLOWED_COMMANDS = { + COMMAND_NAMES = { "ls", "tree", "find", @@ -30,9 +30,7 @@ class PIFSCommandExecutor: "tail", "sed", } - ALLOWED_COMMANDS = BASE_ALLOWED_COMMANDS ALLOWED_PIPE_FILTERS = {"head", "tail", "grep", "sed"} - COMMAND_METHODS = {} MAX_CHAINED_COMMANDS = 3 MAX_PIPE_COMMANDS = 3 MAX_LS_LIMIT = 100 @@ -65,7 +63,7 @@ class PIFSCommandExecutor: self.query_context = query_context def allowed_commands(self) -> set[str]: - return set(self.BASE_ALLOWED_COMMANDS) + return set(self.COMMAND_NAMES) def command_capabilities(self) -> dict[str, Any]: return { @@ -149,8 +147,7 @@ class PIFSCommandExecutor: name = tokens[0] if name not in self.allowed_commands(): raise PIFSCommandError(f"Unsupported command: {name}") - method_name = self.COMMAND_METHODS.get(name, f"_cmd_{name}") - data = getattr(self, method_name)(tokens[1:]) + data = getattr(self, f"_cmd_{name}")(tokens[1:]) return self._render(data, json_output=json_output, command_name=name) def _execute_pipe_filter(self, input_text: str, command: str) -> str: @@ -375,7 +372,6 @@ class PIFSCommandExecutor: scope=scope, metadata_filter=where, limit=limit, - semantic=False, ) def _cmd_grep(self, args: list[str]) -> Any: @@ -423,7 +419,6 @@ class PIFSCommandExecutor: scope={"folder_path": normalized, "recursive": False}, metadata_filter=where, limit=limit, - semantic=False, ) if direct_results: return { @@ -471,7 +466,6 @@ class PIFSCommandExecutor: scope={"folder_path": normalized, "recursive": recursive}, metadata_filter=where, limit=limit, - semantic=False, ) if not results and where is None: source_hits = self._grep_source_file_hits(normalized, query, limit=limit) @@ -1240,23 +1234,6 @@ class PIFSCommandExecutor: return f"{folder}/{title}" if folder else f"/{title}" return str(item.get("source_path") or item.get("external_id") or file_ref or "-") - def _stable_file_target_path(self, item: dict[str, Any]) -> str: - file_ref = str(item.get("file_ref") or "").strip() - source_path = str(item.get("source_path") or "").strip() - if source_path: - target = "/" + source_path.strip("/") - try: - if not file_ref or self.filesystem.store.resolve_file_ref(target) == file_ref: - return target - except KeyError: - pass - external_id = str(item.get("external_id") or "").strip() - if external_id: - return external_id - if file_ref: - return file_ref - return str(item.get("external_id") or item.get("file_ref") or "-") - def _semantic_retrieval_query(self, query: str) -> str: query = str(query or "").strip() context = str(self.query_context or "").strip() @@ -1326,7 +1303,6 @@ class PIFSCommandExecutor: scope={"folder_path": child["path"], "recursive": True}, metadata_filter=metadata_filter, limit=max(limit, 50), - semantic=False, ) if not results: continue diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index d8f6310..0a8a40a 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -17,14 +17,6 @@ from .metadata_generation import ( MetadataGenerator, ) from .embedding_defaults import DEFAULT_EMBEDDING_DIMENSIONS -from .semantic_folder_policy import ( - SEMANTIC_FOLDER_BASE_FIELDS, - SEMANTIC_FOLDER_ROOT, - SEMANTIC_FOLDER_SYSTEM_FIELDS, - canonical_semantic_folder_field_name, - is_semantic_folder_forbidden_field, - semantic_folder_allowed_extension_fields, -) from .store import ( SQLiteFileSystemStore, fingerprint, @@ -571,8 +563,7 @@ class PageIndexFileSystem: ) offset = (page - 1) * page_size needed = offset + page_size + 1 - semantic_filters = self._semantic_filters_for_scope(scope) - semantic_filters["file_ref"] = scope_file_refs + semantic_filters = {"file_ref": scope_file_refs} candidates = ( search_channel( space, @@ -695,95 +686,14 @@ class PageIndexFileSystem: def attach_files_to_folders(self, items: list[dict[str, Any]]) -> None: self.store.attach_files_to_folders(items) - def apply_semantic_folder_projection( - self, - projection_plan: dict[str, Any], - *, - file_ref_by_document_id: Optional[dict[str, str]] = None, - ) -> dict[str, Any]: - """Attach registered files to a Semantic Folder Projection. - - Registration remains the explicit folder placement step. This method is - the separate product API for adding derived `/semantic/...` memberships. - """ - folders = list(projection_plan.get("folders") or []) - memberships = list(projection_plan.get("memberships") or []) - policy_raw = projection_plan.get("policy") - policy = policy_raw if isinstance(policy_raw, dict) else {} - allowed_extension_fields = semantic_folder_allowed_extension_fields( - policy.get("allowed_extension_fields", []) - ) - for folder in folders: - self._validate_semantic_folder_projection_item(folder, allowed_extension_fields) - for membership in memberships: - self._validate_semantic_folder_projection_item(membership, allowed_extension_fields) - - for folder in folders: - folder_metadata = folder.get("metadata") - self.create_folder( - self._validate_semantic_folder_projection_path(str(folder["path"])), - kind=str(folder.get("kind") or "semantic_projection"), - description=str(folder.get("description") or ""), - metadata=folder_metadata if isinstance(folder_metadata, dict) else {}, - ) - - items: list[dict[str, Any]] = [] - file_ref_by_document_id = file_ref_by_document_id or {} - for membership in memberships: - document_id = self._semantic_folder_projection_document_id(membership) - file_ref = file_ref_by_document_id.get(document_id) - if not file_ref: - file_ref = self.store.resolve_file_ref(document_id) - metadata = ( - dict(membership.get("folder_metadata")) - if isinstance(membership.get("folder_metadata"), dict) - else {} - ) - metadata.update( - { - "projection": "Semantic Folder Projection", - "field": membership.get("field", ""), - "value": membership.get("value", ""), - "mount_kind": membership.get( - "mount_kind", - "semantic_folder_projection", - ), - } - ) - items.append( - { - "file_ref": file_ref, - "folder": self._validate_semantic_folder_projection_path( - str(membership["folder_path"]) - ), - "metadata": metadata, - } - ) - self.attach_files_to_folders(items) - return { - "projection": "Semantic Folder Projection", - "folders_applied": len(folders), - "memberships_attached": len(items), - } - def search( self, query: Union[str, list[str], None] = None, scope: Optional[dict[str, Any]] = None, metadata_filter: Optional[dict[str, Any] | str] = None, limit: int = 10, - semantic: bool = True, ) -> list[SearchResult]: parsed_filter = self.metadata.parse_filter(metadata_filter) - if semantic and self._should_use_semantic_retrieval(query, scope): - semantic_results = self._semantic_search( - query, - scope=scope, - metadata_filter=parsed_filter, - limit=limit, - ) - if semantic_results: - return semantic_results rows = self.store.search_files( query, scope=scope, @@ -821,30 +731,6 @@ class PageIndexFileSystem: ) return results - def search_semantic_channel( - self, - channel: str, - query: Union[str, list[str], None], - *, - scope: Optional[dict[str, Any]] = None, - metadata_filter: Optional[dict[str, Any] | str] = None, - limit: int = 10, - ) -> list[SearchResult]: - parsed_filter = self.metadata.parse_filter(metadata_filter) - if ( - self.semantic_retrieval_backend is None - or not self.has_semantic_channel(channel) - or not self._query_text(query) - ): - return [] - return self._semantic_search( - query, - scope=scope, - metadata_filter=parsed_filter, - limit=limit, - channel=channel, - ) - def configure_hybrid_projection_retrieval( self, index_dir: Union[str, Path], @@ -853,7 +739,6 @@ class PageIndexFileSystem: embedding_model: str = "text-embedding-3-small", embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_timeout: float = 60, - per_channel_limit: int = 100, fetch_multiplier: int = 100, ) -> Any: from .hybrid_projection import HybridProjectionSearchBackend @@ -864,7 +749,6 @@ class PageIndexFileSystem: embedding_model=embedding_model, embedding_dimensions=embedding_dimensions, embedding_timeout=embedding_timeout, - per_channel_limit=per_channel_limit, fetch_multiplier=fetch_multiplier, ) return self.semantic_retrieval_backend @@ -905,30 +789,6 @@ class PageIndexFileSystem: }, } - def find( - self, - target: str, - patterns: Union[str, list[str]], - limit: int = 20, - ) -> list[OpenResult]: - file_ref = self._resolve_target(target) - patterns = [patterns] if isinstance(patterns, str) else list(patterns) - lowered_patterns = [pattern.lower() for pattern in patterns if pattern] - if not lowered_patterns: - return [] - text = self.store.read_text(file_ref) - lines = text.splitlines() - matches = [] - for i, line in enumerate(lines, 1): - haystack = line.lower() - if any(pattern in haystack for pattern in lowered_patterns): - start = max(1, i - 1) - end = min(len(lines), i + 1) - matches.append(self._open_lines(file_ref, start, end)) - if len(matches) >= limit: - break - return matches - def open(self, target: str, location: str = "all") -> OpenResult: file_ref = self._resolve_target(target) entry = self.store.get_file(file_ref) @@ -1387,15 +1247,6 @@ class PageIndexFileSystem: metadata = file.get("metadata") or {} if not isinstance(metadata, dict): raise ValueError("metadata must be a JSON object") - legacy_value_key = "derived_" + "metadata" - legacy_policy_key = "metadata_" + "generation_policy" - legacy_status_key = "metadata_" + "generation_status" - if legacy_value_key in file: - raise ValueError("legacy generated metadata map has been removed; put values in metadata") - if legacy_policy_key in file: - raise ValueError("legacy metadata policy key has been renamed to metadata_policy") - if legacy_status_key in file: - raise ValueError("legacy metadata status key has been renamed to metadata_status") self._validate_register_metadata(metadata) external_id = file.get("external_id") content = file.get("content") or "" @@ -1946,93 +1797,6 @@ class PageIndexFileSystem: def _resolve_target(self, target: str) -> str: return self.store.resolve_file_ref(target) - def _should_use_semantic_retrieval( - self, - query: Union[str, list[str], None], - scope: Optional[dict[str, Any]], - ) -> bool: - if self.semantic_retrieval_backend is None: - return False - if not self._query_text(query): - return False - if not scope: - return True - return bool(scope.get("recursive", True)) - - def _semantic_search( - self, - query: Union[str, list[str], None], - *, - scope: Optional[dict[str, Any]], - metadata_filter: Optional[dict[str, Any]], - limit: int, - channel: str | None = None, - ) -> list[SearchResult]: - if self.semantic_retrieval_backend is None: - return [] - filters = self._semantic_filters_for_scope(scope) - fetch_limit = max(limit * 10, 50) - query_text = self._query_text(query) - if channel: - search_channel = getattr(self.semantic_retrieval_backend, "search_channel", None) - if search_channel is None: - return [] - candidates = search_channel( - channel, - query_text, - limit=fetch_limit, - filters=filters, - ) - else: - candidates = self.semantic_retrieval_backend.search( - query_text, - limit=fetch_limit, - filters=filters, - ) - results: list[SearchResult] = [] - seen: set[str] = set() - scope_path = self._scope_folder_path(scope) - for candidate in candidates: - try: - file_ref = self.store.resolve_file_ref(candidate.document_id) - except KeyError: - continue - if file_ref in seen: - continue - if not self.store.file_matches(file_ref, scope=scope, metadata_filter=metadata_filter): - continue - seen.add(file_ref) - entry = self.store.get_file(file_ref) - folder_paths = [ - folder["path"] - for folder in self.store.folder_memberships(file_ref) - ] - folder_path = self._preferred_folder_path(folder_paths, scope_path, entry.folder_path) - results.append( - SearchResult( - file_ref=file_ref, - external_id=entry.external_id, - title=entry.title, - snippet=candidate.snippet or entry.descriptor, - folder_path=folder_path, - folder_paths=folder_paths, - metadata=entry.metadata, - metadata_status=entry.metadata_status, - source_path=entry.source_path, - id=entry.external_id or file_ref, - document_id=entry.external_id, - name=entry.title, - description=entry.descriptor, - status=entry.pageindex_tree_status, - pageNum=None, - createdAt=None, - folderId=None, - ) - ) - if len(results) >= limit: - break - return results - @staticmethod def _semantic_candidate_score(candidate: Any) -> float | None: try: @@ -2348,135 +2112,6 @@ class PageIndexFileSystem: path = scope.get("folder_path") or scope.get("path") return normalize_path(path) if path else None - @classmethod - def _semantic_filters_for_scope(cls, scope: Optional[dict[str, Any]]) -> dict[str, Any]: - path = cls._scope_folder_path(scope) - if not path or path == "/": - return {} - source_type = cls._source_type_filter_from_path(path) - return {"source_type": source_type} if source_type else {} - - @staticmethod - def _source_type_filter_from_path(path: str) -> str: - segments = [segment for segment in path.strip("/").split("/") if segment] - if not segments: - return "" - if segments[0] == SEMANTIC_FOLDER_ROOT.strip("/"): - segments = segments[1:] - if not segments: - return "" - first_segment = segments[0] - if first_segment.startswith("source_type="): - return first_segment.split("=", 1)[1].replace("-", "_") - if path.startswith(f"{SEMANTIC_FOLDER_ROOT}/"): - return "" - return "" - - @classmethod - def _validate_semantic_folder_projection_item( - cls, - item: dict[str, Any], - allowed_extension_fields: set[str], - ) -> None: - path = item.get("folder_path") or item.get("path") - if not path: - raise ValueError("Semantic Folder Projection items must include a folder path") - cls._validate_semantic_folder_projection_path(str(path)) - allowed_fields = ( - SEMANTIC_FOLDER_BASE_FIELDS - | SEMANTIC_FOLDER_SYSTEM_FIELDS - | allowed_extension_fields - ) - if item.get("dataset_doc_uuid"): - raise ValueError( - "dataset_doc_uuid is not allowed in Semantic Folder Projection memberships; " - "use file_key or file_ref" - ) - fields = [] - explicit_field = cls._canonical_semantic_folder_field_name(item.get("field")) - if explicit_field: - fields.append(explicit_field) - fields.extend(cls._semantic_folder_projection_fields_from_path(str(path))) - for payload_key in ("metadata", "folder_metadata"): - cls._validate_semantic_folder_projection_metadata_payload( - item.get(payload_key), - allowed_fields, - ) - for field in fields: - if is_semantic_folder_forbidden_field(field) or field not in allowed_fields: - raise ValueError(f"Field is not allowed for Semantic Folder Projection: {field}") - - @staticmethod - def _validate_semantic_folder_projection_path(path: str) -> str: - normalized = normalize_path(path) - if normalized != SEMANTIC_FOLDER_ROOT and not normalized.startswith( - f"{SEMANTIC_FOLDER_ROOT}/" - ): - raise ValueError("Semantic Folder Projection paths must be under /semantic") - return normalized - - @classmethod - def _semantic_folder_projection_fields_from_path(cls, path: str) -> list[str]: - normalized = cls._validate_semantic_folder_projection_path(path) - fields: list[str] = [] - for segment in normalized.strip("/").split("/")[1:]: - if "=" not in segment: - continue - field = cls._canonical_semantic_folder_field_name( - segment.split("=", 1)[0] - ) - if field: - fields.append(field) - return fields - - @classmethod - def _validate_semantic_folder_projection_metadata_payload( - cls, - payload: Any, - allowed_fields: set[str], - ) -> None: - if isinstance(payload, dict): - for key, value in payload.items(): - key_text = str(key) - key_field = cls._canonical_semantic_folder_field_name(key) - if is_semantic_folder_forbidden_field(key_field): - raise ValueError( - "Forbidden metadata field in Semantic Folder Projection payload: " - f"{key_text}" - ) - if key_field in {"field", "source_field", "metadata_field"}: - field = cls._canonical_semantic_folder_field_name(value) - if field and ( - is_semantic_folder_forbidden_field(field) - or field not in allowed_fields - ): - raise ValueError( - f"Field is not allowed for Semantic Folder Projection: {field}" - ) - cls._validate_semantic_folder_projection_metadata_payload(value, allowed_fields) - elif isinstance(payload, list): - for item in payload: - cls._validate_semantic_folder_projection_metadata_payload(item, allowed_fields) - elif isinstance(payload, str): - field = cls._canonical_semantic_folder_field_name(payload) - if is_semantic_folder_forbidden_field(field): - raise ValueError( - "Forbidden metadata field label in Semantic Folder Projection payload: " - f"{payload}" - ) - - @staticmethod - def _canonical_semantic_folder_field_name(value: Any) -> str: - return canonical_semantic_folder_field_name(value) - - @staticmethod - def _semantic_folder_projection_document_id(membership: dict[str, Any]) -> str: - for key in ("file_key", "file_ref", "document_ref"): - value = str(membership.get(key) or "").strip() - if value: - return value - raise ValueError("Semantic Folder Projection membership is missing file_key or file_ref") - @staticmethod def _query_text(query: Union[str, list[str], None]) -> str: if query is None: diff --git a/pageindex/filesystem/hybrid_projection.py b/pageindex/filesystem/hybrid_projection.py index cdb97e6..2fa1830 100644 --- a/pageindex/filesystem/hybrid_projection.py +++ b/pageindex/filesystem/hybrid_projection.py @@ -15,28 +15,17 @@ from .semantic_index import SQLiteVecSemanticIndex, SemanticIndexError, Semantic INDEX_BY_CHANNEL = { - "metadata": "metadata_composite_vector", "summary": "summary_only_vector", "entity": "entity_vectors", - "constraint": "constraint_vectors", "relation": "relation_vectors", } -HYBRID_ENTITY_RELATION_CHANNELS = ("metadata", "entity", "constraint", "relation") SEMANTIC_TOOL_CHANNELS = ("summary", "entity", "relation") -HYBRID_ENTITY_RELATION_WEIGHTS = { - "metadata": 0.25, - "entity": 0.25, - "relation": 0.30, - "constraint": 0.20, -} @dataclass(frozen=True) class QueryProjection: entities: list[str] relations: list[str] - constraints: list[str] - expected_answer_type: str = "" @dataclass(frozen=True) @@ -52,7 +41,7 @@ class HybridProjectionCandidate: class HybridProjectionSearchBackend: - """Hybrid entity/relation/vector retrieval over rebuildable projection indexes. + """Semantic channel retrieval over rebuildable projection indexes. The SQLite catalog remains the source of truth. This backend only reads external sqlite-vec projection indexes and returns candidate document ids @@ -68,7 +57,6 @@ class HybridProjectionSearchBackend: embedding_model: str, embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS, embedding_cache_path: str | Path | None = None, - per_channel_limit: int = 100, fetch_multiplier: int = 100, ) -> None: self.index_dir = Path(index_dir).expanduser() @@ -82,7 +70,6 @@ class HybridProjectionSearchBackend: if embedding_cache_path is not None else self.index_dir / "embedding_cache.sqlite" ) - self.per_channel_limit = per_channel_limit self.fetch_multiplier = fetch_multiplier self.indexes = { channel: SQLiteVecSemanticIndex(self.index_dir / f"{index_name}.sqlite") @@ -114,35 +101,6 @@ class HybridProjectionSearchBackend: **kwargs, ) - def search( - self, - query: str, - *, - limit: int = 10, - filters: dict[str, Any] | None = None, - ) -> list[HybridProjectionCandidate]: - query = normalize_text(query) - if not query: - return [] - projection = heuristic_query_projection(query) - channels = tuple( - channel - for channel in HYBRID_ENTITY_RELATION_CHANNELS - if self._channel_document_count(channel) > 0 - ) - if not channels: - if self._channel_document_count("summary") > 0: - return self.search_channel("summary", query, limit=limit, filters=filters) - return [] - channel_hits = self._search_channels( - query=query, - projection=projection, - limit=max(limit, self.per_channel_limit), - filters=filters, - channels=channels, - ) - return aggregate_hybrid_entity_relation(channel_hits, projection)[:limit] - def search_channel( self, channel: str, @@ -187,7 +145,7 @@ class HybridProjectionSearchBackend: "embedding_provider": self.embedding_provider, "embedding_model": self.embedding_model, "embedding_dimensions": self.embedding_dimensions, - "strategy": "hybrid_entity_relation_vector", + "strategy": "semantic_channel_vector", "available_channels": list(self.available_channels()), "channels": { channel: self._safe_channel_info(channel) @@ -221,36 +179,6 @@ class HybridProjectionSearchBackend: } return {**info, "available": int(info.get("document_count") or 0) > 0} - def _search_channels( - self, - *, - query: str, - projection: QueryProjection, - limit: int, - filters: dict[str, Any] | None, - channels: tuple[str, ...], - ) -> dict[str, list[SemanticSearchResult]]: - query_texts = { - channel: query_text_for_channel(channel, query, projection) - for channel in channels - } - vectors = self.embedding_cache.embed_texts( - [query_texts[channel] for channel in channels], - provider=self.embedding_provider, - model=self.cache_model, - embedder=self.embedder, - batch_size=1, - ) - return { - channel: self.indexes[channel].search( - vector, - limit=limit, - filters=filters, - fetch_multiplier=self.fetch_multiplier, - ) - for channel, vector in zip(channels, vectors) - } - class EmbeddingCache: def __init__(self, db_path: Path): @@ -368,12 +296,10 @@ def make_embedder(provider: str, model: str, *, dimensions: int, timeout: float) def query_text_for_channel(channel: str, query: str, projection: QueryProjection) -> str: - if channel in {"metadata", "summary"}: + if channel == "summary": return query if channel == "entity": return compact_join(projection.entities, limit=24) or query - if channel == "constraint": - return compact_join(projection.constraints, limit=24) or query if channel == "relation": return "\n".join(projection.relations) or query raise ValueError(f"unknown semantic channel: {channel}") @@ -405,87 +331,6 @@ def rank_single_semantic_channel( return rows -def aggregate_hybrid_entity_relation( - channel_hits: dict[str, list[SemanticSearchResult]], - projection: QueryProjection, -) -> list[HybridProjectionCandidate]: - by_doc: dict[str, dict[str, Any]] = {} - for channel, results in channel_hits.items(): - weight = HYBRID_ENTITY_RELATION_WEIGHTS[channel] - seen_in_channel = set() - for rank, result in enumerate(results, 1): - doc_id = str(result.external_id or result.file_ref) - if doc_id in seen_in_channel: - continue - seen_in_channel.add(doc_id) - item = by_doc.setdefault( - doc_id, - { - "document_id": doc_id, - "score": 0.0, - "sources": [], - "source_type": result.source_type, - "source_path": result.source_path, - "title": result.title, - "metadata": result.metadata, - }, - ) - item["score"] += weight * (1 / (60 + rank)) - item["sources"].append({"channel": channel, "rank": rank, "distance": result.distance}) - candidates = [] - for item in by_doc.values(): - item["score"] += exact_match_bonus(item, projection) - candidates.append( - HybridProjectionCandidate( - document_id=item["document_id"], - score=float(item["score"]), - sources=item["sources"], - source_type=item["source_type"], - source_path=item["source_path"], - title=item["title"], - metadata=item["metadata"], - snippet=hybrid_snippet(item), - ) - ) - return sorted( - candidates, - key=lambda item: ( - -item.score, - min(source["rank"] for source in item.sources), - item.document_id, - ), - ) - - -def exact_match_bonus(item: dict[str, Any], projection: QueryProjection) -> float: - haystack = json.dumps( - { - "title": item.get("title", ""), - "source_path": item.get("source_path", ""), - "metadata": item.get("metadata", {}), - }, - ensure_ascii=False, - ).lower() - terms = [*projection.entities[:8], *projection.constraints[:6]] - matched = 0 - for term in terms: - normalized = str(term).lower().strip() - if len(normalized) >= 3 and normalized in haystack: - matched += 1 - return min(0.02, matched * 0.004) - - -def hybrid_snippet(item: dict[str, Any]) -> str: - channels = ", ".join( - f"{source['channel']}@{source['rank']}" for source in item.get("sources", [])[:4] - ) - topic = str((item.get("metadata") or {}).get("topic") or "").strip() - parts = [f"hybrid_entity_relation_vector {channels}"] - if topic: - parts.append(f"topic: {topic}") - return "; ".join(parts) - - def heuristic_query_projection(question: str) -> QueryProjection: entities = dedupe( [ @@ -493,19 +338,11 @@ def heuristic_query_projection(question: str) -> QueryProjection: *keyword_terms(question)[:16], ] )[:16] - constraints = dedupe( - [ - *extract_constraint_terms(question), - *numeric_terms(question), - ] - )[:12] predicate = infer_query_predicate(question) subject = entities[0] if entities else "question" return QueryProjection( entities=entities, relations=[f"{subject} | {predicate} | {question}"], - constraints=constraints, - expected_answer_type=infer_answer_type(question), ) @@ -554,24 +391,6 @@ def keyword_terms(text: str) -> list[str]: return dedupe(terms) -def extract_constraint_terms(text: str) -> list[str]: - constraints = [] - for pattern in [ - r"\b(?:must|should|required|requires?|default(?:s)?|limit(?:s)?|maximum|minimum)\b[^.!?\n]{0,120}", - r"\b[A-Za-z_][A-Za-z0-9_]{2,}\s*(?:=|:)\s*[A-Za-z0-9_.:/-]+", - ]: - constraints.extend(match.strip() for match in re.findall(pattern, text, flags=re.IGNORECASE)) - return dedupe(constraints) - - -def numeric_terms(text: str) -> list[str]: - return re.findall( - r"\b\d+(?:\.\d+)?\s*(?:MiB|GiB|MB|GB|ms|sec|seconds|minutes|hours|days|%|tokens?|req/s|rps)\b", - text, - flags=re.IGNORECASE, - ) - - def infer_query_predicate(question: str) -> str: lowered = question.lower() rules = [ @@ -589,19 +408,6 @@ def infer_query_predicate(question: str) -> str: return "asks_about" -def infer_answer_type(question: str) -> str: - lowered = question.lower() - if "how many" in lowered or "limit" in lowered or "size" in lowered: - return "number_or_limit" - if lowered.startswith("who"): - return "person_or_team" - if lowered.startswith("when"): - return "date_or_time" - if "why" in lowered or "caused" in lowered: - return "cause" - return "fact" - - def dedupe(values: Any) -> list[str]: seen = set() result = [] diff --git a/pageindex/filesystem/semantic_folder_policy.py b/pageindex/filesystem/semantic_folder_policy.py deleted file mode 100644 index 8e81d5f..0000000 --- a/pageindex/filesystem/semantic_folder_policy.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import annotations - -import re -from typing import Any, Iterable - - -SEMANTIC_FOLDER_ROOT = "/semantic" -SEMANTIC_FOLDER_BASE_FIELDS = {"doc_type", "domain", "topic"} -SEMANTIC_FOLDER_SYSTEM_FIELDS = {"source_type"} -SEMANTIC_FOLDER_FORBIDDEN_FIELDS = { - "summary", - "entities", - "relations", - "constraints", - "retrieval_cues", - "dataset_doc_uuid", - "path", - "uri", - "source_path", - "storage_uri", - "title", - "content_type", - "created_at", - "updated_at", -} - - -def canonical_semantic_folder_field_name(value: Any) -> str: - text = str(value or "").strip() - if not text: - return "" - text = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", text) - text = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", text) - return re.sub(r"[^A-Za-z0-9]+", "_", text).strip("_").casefold() - - -def compact_semantic_folder_field_name(value: Any) -> str: - return re.sub(r"[^a-z0-9]+", "", canonical_semantic_folder_field_name(value)) - - -def semantic_folder_field_identity_keys(value: Any) -> frozenset[str]: - canonical = canonical_semantic_folder_field_name(value) - compact = compact_semantic_folder_field_name(value) - return frozenset(key for key in (canonical, compact) if key) - - -def semantic_folder_field_identity_set(fields: Iterable[Any]) -> frozenset[str]: - keys: set[str] = set() - for field in fields: - keys.update(semantic_folder_field_identity_keys(field)) - return frozenset(keys) - - -SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES = semantic_folder_field_identity_set( - SEMANTIC_FOLDER_FORBIDDEN_FIELDS -) - - -def is_semantic_folder_forbidden_field(value: Any) -> bool: - return bool( - semantic_folder_field_identity_keys(value) - & SEMANTIC_FOLDER_FORBIDDEN_FIELD_IDENTITIES - ) - - -def semantic_folder_allowed_extension_fields(fields: Iterable[Any]) -> set[str]: - allowed = set() - for field in fields: - name = canonical_semantic_folder_field_name(field) - if name and not is_semantic_folder_forbidden_field(field): - allowed.add(name) - return allowed diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py index 46b1161..b5b9491 100644 --- a/tests/test_pageindex_filesystem_scope.py +++ b/tests/test_pageindex_filesystem_scope.py @@ -308,7 +308,7 @@ def test_browse_supports_fixed_size_one_based_pagination_and_metadata_filter(tmp assert filtered["data"][0]["summary"] == "summary for doc_10" -def test_browse_scopes_semantic_search_before_candidate_limit(tmp_path): +def test_browse_scopes_channel_candidates_before_candidate_limit(tmp_path): import json from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem @@ -738,20 +738,6 @@ def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_ assert "semantic-grep" not in rendered -def test_semantic_search_scope_filters_explicit_source_type_facets(): - from pageindex.filesystem import PageIndexFileSystem - - assert PageIndexFileSystem._semantic_filters_for_scope( - {"folder_path": "/source_type=google-drive"} - ) == {"source_type": "google_drive"} - assert PageIndexFileSystem._semantic_filters_for_scope( - {"folder_path": "/semantic/source_type=google-drive"} - ) == {"source_type": "google_drive"} - assert PageIndexFileSystem._semantic_filters_for_scope( - {"folder_path": "/documents"} - ) == {} - - def test_grep_source_file_requires_terms_on_same_line(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem @@ -904,7 +890,7 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval( filesystem.configure_existing_projection_retrieval() -def test_default_semantic_search_uses_summary_projection_when_only_summary_available(tmp_path): +def test_browse_semantic_files_uses_summary_projection_when_only_summary_available(tmp_path): from pageindex.filesystem import PageIndexFileSystem from pageindex.filesystem.hybrid_projection import HybridProjectionSearchBackend from pageindex.filesystem.metadata_generation import MetadataGenerationResult @@ -961,9 +947,14 @@ def test_default_semantic_search_uses_summary_projection_when_only_summary_avail }, ) - assert filesystem.search("purchase order exposure", semantic=False) == [] + assert filesystem.search("purchase order exposure") == [] - results = filesystem.search("purchase order exposure", semantic=True) + results = filesystem.browse_semantic_files( + "/documents", + "purchase order exposure", + recursive=True, + page_size=5, + ) - assert [result.external_id for result in results] == ["doc_summary_only"] - assert results[0].snippet == "summary_vector rank=1" + assert [item["external_id"] for item in results["data"]] == ["doc_summary_only"] + assert results["data"][0]["snippet"] == "summary_vector rank=1" diff --git a/tests/test_pifs_add_command.py b/tests/test_pifs_add_command.py index d2b8f9c..1679431 100644 --- a/tests/test_pifs_add_command.py +++ b/tests/test_pifs_add_command.py @@ -158,13 +158,13 @@ def test_add_configures_semantic_retrieval_in_same_filesystem_instance(tmp_path) filesystem.add_file(source, "/documents") assert filesystem.semantic_retrieval_channels() == ("summary",) - results = filesystem.search_semantic_channel( - "summary", + results = filesystem.browse_semantic_files( + "/documents", "semantic recall", - scope={"folder_path": "/documents", "recursive": True}, - limit=5, + recursive=True, + page_size=5, ) - assert [result.source_path for result in results] == ["documents/semantic.txt"] + assert [item["source_path"] for item in results["data"]] == ["documents/semantic.txt"] def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monkeypatch): diff --git a/tests/test_pifs_agent_stream.py b/tests/test_pifs_agent_stream.py index de93856..3c833ee 100644 --- a/tests/test_pifs_agent_stream.py +++ b/tests/test_pifs_agent_stream.py @@ -263,7 +263,7 @@ class PIFSAgentStreamTest(unittest.TestCase): ): self.assertNotIn(old_command, prompt_surface) - def test_demo_prompt_uses_browse_strategy_and_not_legacy_semantic_search(self): + def test_demo_prompt_uses_browse_strategy_and_not_old_vector_commands(self): demo_prompt = load_demo_agent_prompt() self.assertIn("Start with ls or tree", demo_prompt) diff --git a/tests/test_pifs_like_escape.py b/tests/test_pifs_like_escape.py index 82e7ef9..5c0751e 100644 --- a/tests/test_pifs_like_escape.py +++ b/tests/test_pifs_like_escape.py @@ -46,7 +46,6 @@ def test_descendant_folder_filter_treats_underscore_literally(tmp_path): folder_id = filesystem.folder_info("/proj_1")["folder_id"] scoped_results = filesystem.search( scope={"folder_id": folder_id, "recursive": True}, - semantic=False, limit=10, ) ranked_folders = { @@ -102,12 +101,10 @@ def test_metadata_contains_treats_percent_and_underscore_literally(tmp_path): percent_results = filesystem.search( metadata_filter={"status": {"$contains": "100% done"}}, - semantic=False, limit=10, ) underscore_results = filesystem.search( metadata_filter={"status": {"$contains": "build_alpha"}}, - semantic=False, limit=10, )