From dc4de3116f98c368aec0c14b038ca1ce95b79703 Mon Sep 17 00:00:00 2001 From: BukeLy Date: Mon, 1 Jun 2026 01:40:44 +0800 Subject: [PATCH] refactor: remove source paths from PIFS --- README.md | 2 +- examples/pifs_demo.py | 3 +- pageindex/config.yaml | 4 +- pageindex/filesystem/cli.py | 5 +- pageindex/filesystem/commands.py | 254 +------------------- pageindex/filesystem/core.py | 184 +++++++------- pageindex/filesystem/metadata_generation.py | 1 - pageindex/filesystem/semantic_index.py | 19 +- pageindex/filesystem/semantic_projection.py | 3 - pageindex/filesystem/store.py | 142 ++++++----- pageindex/filesystem/types.py | 3 - tests/test_filesystem_store.py | 1 - tests/test_metadata_generation.py | 1 - tests/test_pageindex_filesystem_scope.py | 122 ++++++---- tests/test_pageindex_structural_read.py | 36 +-- tests/test_pifs_add_command.py | 17 +- tests/test_pifs_cli.py | 23 +- tests/test_pifs_find_maxdepth.py | 6 - tests/test_pifs_like_escape.py | 1 - tests/test_pifs_path_resolution.py | 16 +- tests/test_pifs_register_side_effects.py | 1 - tests/test_semantic_index.py | 8 - 22 files changed, 324 insertions(+), 528 deletions(-) diff --git a/README.md b/README.md index f8cca05..49c1785 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf You can customize the processing with additional optional arguments: ``` ---model LLM model to use (default: gpt-4o-2024-11-20) +--model LLM model to use (default: gpt-5.4) --toc-check-pages Pages to check for table of contents (default: 20) --max-pages-per-node Max pages per node (default: 10) --max-tokens-per-node Max tokens per node (default: 20000) diff --git a/examples/pifs_demo.py b/examples/pifs_demo.py index 2434371..f5dffe7 100644 --- a/examples/pifs_demo.py +++ b/examples/pifs_demo.py @@ -48,7 +48,7 @@ from pageindex.filesystem.core import DEFAULT_EMBEDDING_DIMENSIONS EXAMPLES_DIR = Path(__file__).parent DOCUMENTS_DIR = EXAMPLES_DIR / "documents" WORKSPACE = EXAMPLES_DIR / "pifs_workspace" -DEFAULT_MODEL = os.environ.get("PIFS_DEMO_MODEL", "gpt-5.4-mini") +DEFAULT_MODEL = os.environ.get("PIFS_DEMO_MODEL", "gpt-5.4") DEFAULT_METADATA_PROVIDER = os.environ.get("PIFS_DEMO_METADATA_PROVIDER") or os.environ.get( "PIFS_METADATA_PROVIDER", "openai" ) @@ -416,7 +416,6 @@ def register_documents( register_started = time.perf_counter() file_ref = filesystem.register( storage_uri=document_path.as_uri(), - source_path=str(document_path), folder_path="/documents", external_id=external_id, title=document_path.name, diff --git a/pageindex/config.yaml b/pageindex/config.yaml index 591fe93..5da9ee4 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -1,4 +1,4 @@ -model: "gpt-4o-2024-11-20" +model: "gpt-5.4" # model: "anthropic/claude-sonnet-4-6" retrieve_model: "gpt-5.4" # defaults to `model` if not set toc_check_page_num: 20 @@ -7,4 +7,4 @@ max_token_num_each_node: 20000 if_add_node_id: "yes" if_add_node_summary: "yes" if_add_doc_description: "no" -if_add_node_text: "no" \ No newline at end of file +if_add_node_text: "no" diff --git a/pageindex/filesystem/cli.py b/pageindex/filesystem/cli.py index bb01f80..8e13d1d 100644 --- a/pageindex/filesystem/cli.py +++ b/pageindex/filesystem/cli.py @@ -21,7 +21,7 @@ from .core import PageIndexFileSystem AGENT_STREAM_MODE_CHOICES = ("off", "tools", "model", "all") -DEFAULT_AGENT_MODEL = "gpt-5.4-mini" +DEFAULT_AGENT_MODEL = "gpt-5.4" EXIT_COMMANDS = {"exit", "quit", ":q"} ANSI_ESCAPE_RE = re.compile(r"\x1b(?:\[[0-?]*[ -/]*[@-~]|.)") PIFS_CONFIG_FILE_ENV = "PIFS_CONFIG_FILE" @@ -290,9 +290,8 @@ def _run_add(argv: list[str], *, workspace: str) -> int: filesystem = _filesystem_from_workspace(workspace) info = filesystem.add_file(args.physical_path, args.virtual_target) - print(f"added: {info.get('path') or '/' + str(info.get('source_path') or '').strip('/')}") + print(f"added: {info.get('path')}") print(f"file_ref: {info['file_ref']}") - print(f"storage_uri: {info['storage_uri']}") return 0 diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py index 16a7b22..3b1819d 100644 --- a/pageindex/filesystem/commands.py +++ b/pageindex/filesystem/commands.py @@ -3,9 +3,7 @@ from __future__ import annotations import json import re import shlex -import subprocess from dataclasses import asdict, is_dataclass -from pathlib import Path from typing import Any from .core import SEMANTIC_RETRIEVAL_CHANNELS, PageIndexFileSystem @@ -411,35 +409,18 @@ class PIFSCommandExecutor: "mode": "files", "query": query, "scope": normalized, - "data": self._grep_file_hits_from_results(direct_results, query), + "data": self._grep_file_hits_from_results( + direct_results, + query, + require_match=True, + ), } - if where is None: - direct_source_hits = self._grep_source_file_hits( - normalized, - query, - limit=limit, - direct_only=True, - ) - if direct_source_hits: - return { - "mode": "files", - "query": query, - "scope": normalized, - "data": direct_source_hits, - } ranked = self._rank_child_folders( query=query, children=children, metadata_filter=where, limit=limit, ) - if not ranked and where is None: - ranked = self._rank_child_folders_from_source( - query=query, - parent_path=normalized, - children=children, - limit=limit, - ) return { "mode": "folders", "query": query, @@ -453,19 +434,15 @@ class PIFSCommandExecutor: metadata_filter=where, limit=limit, ) - if not results and where is None: - source_hits = self._grep_source_file_hits(normalized, query, limit=limit) - return { - "mode": "files", - "query": query, - "scope": normalized, - "data": source_hits, - } return { "mode": "files", "query": query, "scope": normalized, - "data": self._grep_file_hits_from_results(results, query), + "data": self._grep_file_hits_from_results( + results, + query, + require_match=True, + ), } return { "mode": "matches", @@ -976,11 +953,9 @@ class PIFSCommandExecutor: if data.get("mode") == "files": return "\n\n".join(self._render_stat(item) for item in data.get("data", [])) lines = [ - f"target: {data.get('target') or data.get('file_ref')}", + f"target: {data.get('path') or data.get('target') or data.get('file_ref')}", f"file_ref: {data.get('file_ref')}", f"document_id: {data.get('external_id') or data.get('document_id') or '-'}", - f"source_path: {data.get('source_path') or '-'}", - f"storage_uri: {data.get('storage_uri') or '-'}", ] folders = data.get("folders") or [] if folders: @@ -1019,11 +994,10 @@ class PIFSCommandExecutor: file_ref = item.get("file_ref") doc_id = item.get("external_id") or item.get("document_id") or "-" title = self._compact_text(item.get("title") or item.get("name") or "", max_chars=80) - source_path = item.get("source_path") or "-" folder_paths = item.get("folder_paths") or self._folder_paths_for_file(file_ref) folders = f" folders={','.join(folder_paths)}" if folder_paths else "" target = self._file_target_path(item) - return f"{target} id={doc_id} file_ref={file_ref or '-'} title={title} source={source_path}{folders}".strip() + return f"{target} id={doc_id} file_ref={file_ref or '-'} title={title}{folders}".strip() def _grep_file_hit_text(self, item: dict[str, Any]) -> str: doc_id = item.get("external_id") or "-" @@ -1046,7 +1020,7 @@ class PIFSCommandExecutor: if folder_paths and title: folder = str(folder_paths[0] or "/").rstrip("/") return f"{folder}/{title}" if folder else f"/{title}" - return str(item.get("source_path") or item.get("external_id") or file_ref or "-") + return str(item.get("external_id") or file_ref or "-") def _semantic_retrieval_query(self, query: str) -> str: query = str(query or "").strip() @@ -1150,7 +1124,6 @@ class PIFSCommandExecutor: "file_ref": result.file_ref, "external_id": result.external_id, "title": result.title, - "source_path": result.source_path, "folder_paths": result.folder_paths, "line": line, "text": text or result.snippet, @@ -1160,76 +1133,6 @@ class PIFSCommandExecutor: break return hits - def _rank_child_folders_from_source( - self, - *, - query: str, - parent_path: str, - children: list[dict[str, Any]], - limit: int, - ) -> list[dict[str, Any]]: - source_dir = self._source_dir_for_folder(parent_path) - source_root = self._source_root() - if source_dir is None or source_root is None: - return [] - child_paths = {child["path"]: child for child in children} - counts: dict[str, int] = {} - for path in self._rg_candidate_files(query, source_dir, max_files=5000): - source_path = self._source_path_from_storage(path, source_root) - folder_path = "/" + str(Path(source_path).parent).strip("/") - child_path = self._matching_child_path(parent_path, folder_path, child_paths) - if child_path: - counts[child_path] = counts.get(child_path, 0) + 1 - ranked = [ - { - "path": path, - "name": child_paths[path]["name"], - "matched_files": matched, - "files": self.filesystem.store.count_files_in_folder(path, recursive=True), - "children_count": child_paths[path].get("children_count", 0), - } - for path, matched in counts.items() - ] - ranked.sort(key=lambda item: (-item["matched_files"], item["path"])) - return ranked[:limit] - - def _grep_source_file_hits( - self, - folder_path: str, - query: str, - *, - limit: int, - direct_only: bool = False, - ) -> list[dict[str, Any]]: - source_dir = self._source_dir_for_folder(folder_path) - source_root = self._source_root() - if source_dir is None or source_root is None: - return [] - hits = [] - for path in self._rg_candidate_files(query, source_dir, max_files=max(limit * 10, 50)): - file_row = self._file_row_for_storage(path) - if not file_row: - continue - if direct_only and self._folder_path_for_source_path(file_row["source_path"]) != folder_path: - continue - line_number, text = self._first_matching_source_line(path, query) - if line_number is None: - continue - hits.append( - { - "file_ref": file_row["file_ref"], - "external_id": file_row["external_id"], - "title": file_row["title"], - "source_path": file_row["source_path"], - "folder_paths": self._folder_paths_for_file(file_row["file_ref"]), - "line": line_number, - "text": text or file_row["title"], - } - ) - if len(hits) >= limit: - break - return hits - def _grep_file_matches(self, target: str, query: str, *, limit: int) -> list[dict[str, Any]]: file_ref = self.filesystem._resolve_target(target) entry = self.filesystem.store.get_file(file_ref) @@ -1241,7 +1144,6 @@ class PIFSCommandExecutor: "file_ref": file_ref, "external_id": entry.external_id, "title": entry.title, - "source_path": entry.source_path, "folder_paths": self._folder_paths_for_file(file_ref), "line": line_number, "text": self._compact_text(line, max_chars=220), @@ -1269,136 +1171,6 @@ class PIFSCommandExecutor: def _is_combined_grep_flag(arg: str) -> bool: return bool(re.fullmatch(r"-[Rrni]+", arg)) and len(arg) > 2 - def _rg_candidate_files(self, query: str, directory: Path, *, max_files: int) -> list[Path]: - if not directory.exists(): - return [] - terms = [term.lower() for term in re.findall(r"[A-Za-z0-9_]{3,}", query)] - if not terms: - return [] - primary = max(terms, key=len) - try: - completed = subprocess.run( - [ - "rg", - "-l", - "-i", - "-F", - primary, - str(directory), - "--glob", - "*.json", - "--no-messages", - ], - check=False, - capture_output=True, - text=True, - timeout=20, - ) - except (OSError, subprocess.TimeoutExpired): - return [] - candidates = [Path(line) for line in completed.stdout.splitlines() if line.strip()] - filtered = [] - for path in candidates[: max(max_files * 20, max_files)]: - try: - text = path.read_text(encoding="utf-8", errors="ignore").lower() - except OSError: - continue - if all(term in text for term in terms): - filtered.append(path) - if len(filtered) >= max_files: - break - return filtered - - def _first_matching_source_line(self, path: Path, query: str) -> tuple[int | None, str]: - try: - lines = path.read_text(encoding="utf-8", errors="ignore").splitlines() - except OSError: - return None, "" - for line_number, line in enumerate(lines, 1): - if self._line_matches(line, query): - return line_number, self._compact_text(line, max_chars=220) - return None, "" - - def _source_root(self) -> Path | None: - with self.filesystem.store.connect() as conn: - row = conn.execute( - """ - SELECT storage_uri, source_path - FROM files - WHERE deleted_at IS NULL - LIMIT 1 - """ - ).fetchone() - if row is None: - return None - storage_path = Path(row["storage_uri"]) - source_path = Path(row["source_path"]) - root = storage_path - for _ in range(len(source_path.parts)): - root = root.parent - return root - - def _source_dir_for_folder(self, folder_path: str) -> Path | None: - source_root = self._source_root() - if source_root is None: - return None - stripped = folder_path.strip("/") - return source_root / stripped if stripped else source_root - - @staticmethod - def _source_path_from_storage(path: Path, source_root: Path) -> str: - try: - return path.relative_to(source_root).as_posix() - except ValueError: - return path.name - - @staticmethod - def _matching_child_path( - parent_path: str, - folder_path: str, - child_paths: dict[str, dict[str, Any]], - ) -> str | None: - normalized_parent = parent_path.rstrip("/") - if normalized_parent == "": - normalized_parent = "/" - if normalized_parent == "/": - parts = [part for part in folder_path.strip("/").split("/") if part] - candidate = "/" + parts[0] if parts else "/" - return candidate if candidate in child_paths else None - prefix = normalized_parent + "/" - if not folder_path.startswith(prefix): - return None - remainder = folder_path[len(prefix):] - first = remainder.split("/", 1)[0] - candidate = prefix + first - return candidate if candidate in child_paths else None - - def _file_row_for_storage(self, path: Path) -> dict[str, Any] | None: - storage_uri = str(path) - with self.filesystem.store.connect() as conn: - row = conn.execute( - """ - SELECT file_ref, external_id, title, source_path - FROM files - WHERE storage_uri = ? AND deleted_at IS NULL - LIMIT 1 - """, - (storage_uri,), - ).fetchone() - if row is None: - return None - return { - "file_ref": row["file_ref"], - "external_id": row["external_id"], - "title": row["title"], - "source_path": row["source_path"], - } - - @staticmethod - def _folder_path_for_source_path(source_path: str) -> str: - parent = str(Path(source_path).parent).strip(".") - return "/" + parent.strip("/") if parent and parent != "." else "/" - def _folder_paths_for_file(self, file_ref: str | None) -> list[str]: if not file_ref: return [] diff --git a/pageindex/filesystem/core.py b/pageindex/filesystem/core.py index 67adb8f..91a4971 100644 --- a/pageindex/filesystem/core.py +++ b/pageindex/filesystem/core.py @@ -144,13 +144,12 @@ class PageIndexFileSystem: self, *, storage_uri: str, - source_path: str, folder_path: Optional[str] = None, metadata: Optional[dict[str, Any]] = None, external_id: Optional[str] = None, title: Optional[str] = None, content: str = "", - content_type: str = "text/plain", + content_type: str | None = None, source_type: Optional[str] = None, metadata_policy: Optional[dict[str, Any]] = None, metadata_status: Optional[str] = None, @@ -159,7 +158,6 @@ class PageIndexFileSystem: [ { "storage_uri": storage_uri, - "source_path": source_path, "folder_path": folder_path, "metadata": metadata, "external_id": external_id, @@ -231,7 +229,6 @@ class PageIndexFileSystem: record = self._prepare_file_record( { "storage_uri": final_path.as_uri(), - "source_path": virtual_path.strip("/"), "folder_path": folder_path, "metadata": {}, "external_id": None, @@ -604,23 +601,27 @@ class PageIndexFileSystem: folder["path"] for folder in self.store.folder_memberships(file_ref) ] + folder_path = self._preferred_folder_path( + folder_paths, + path, + entry.folder_path, + ) rank = len(rows) + 1 rows.append( { "rank": rank, "similarity": self._semantic_candidate_similarity(candidate), "score": self._semantic_candidate_score(candidate), - "path": self._stable_file_locator(file_ref, entry), + "path": self._stable_file_locator( + file_ref, + entry, + folder_path=folder_path, + ), "file_ref": file_ref, "document_id": entry.external_id, "external_id": entry.external_id, "title": entry.title, - "source_path": entry.source_path, - "folder_path": self._preferred_folder_path( - folder_paths, - path, - entry.folder_path, - ), + "folder_path": folder_path, "folder_paths": folder_paths, "summary": str((entry.metadata or {}).get("summary") or ""), "snippet": str(getattr(candidate, "snippet", "") or entry.descriptor), @@ -724,7 +725,6 @@ class PageIndexFileSystem: folder_paths=folder_paths, metadata=row["metadata"], metadata_status=row["metadata_status"], - source_path=row["source_path"], id=row["id"], document_id=row["document_id"], name=row["name"], @@ -845,7 +845,6 @@ class PageIndexFileSystem: "mode": "structure", "file_ref": file_ref, "external_id": entry.external_id, - "source_path": entry.source_path, "status": entry.pageindex_tree_status, "available": True, "pageindex_doc_id": doc_id, @@ -887,7 +886,6 @@ class PageIndexFileSystem: "mode": "page", "file_ref": file_ref, "external_id": entry.external_id, - "source_path": entry.source_path, "status": entry.pageindex_tree_status, "available": True, "pageindex_doc_id": doc_id, @@ -905,7 +903,7 @@ class PageIndexFileSystem: return raise ValueError( f"{command} is only supported for txt/text files; " - f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. " + f"got title={entry.title!r}, content_type={entry.content_type!r}. " "Use cat --structure, " "or cat --page for PDF/Markdown PageIndex files." ) @@ -915,29 +913,22 @@ class PageIndexFileSystem: return raise ValueError( f"{command} is only supported for PDF/Markdown PageIndex files; " - f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. " + f"got title={entry.title!r}, content_type={entry.content_type!r}. " "Use cat --all for txt/text files." ) @classmethod def _file_format(cls, entry: Any) -> str: - suffix = Path(str(entry.source_path or "")).suffix.lower() - content_type = cls._normalized_content_type(entry.content_type) - if suffix == ".pdf" or content_type == "application/pdf": - return "pdf" - if suffix in PAGEINDEX_DOCUMENT_SUFFIXES or content_type in PAGEINDEX_DOCUMENT_CONTENT_TYPES: - return "markdown" - if suffix in TEXT_ARTIFACT_SUFFIXES: - return "text" - if entry.pageindex_doc_id or entry.pageindex_tree_status != "not_built": + if getattr(entry, "pageindex_doc_id", None) or entry.pageindex_tree_status != "not_built": return "pageindex" - if content_type in TEXT_ARTIFACT_CONTENT_TYPES: - return "text" + file_format = cls._content_format(getattr(entry, "title", ""), entry.content_type) + if file_format != "unsupported": + return file_format return "unsupported" @classmethod - def _source_format(cls, source_path: Any, content_type: str | None) -> str: - suffix = Path(str(source_path or "")).suffix.lower() + def _content_format(cls, filename: Any, content_type: str | None) -> str: + suffix = Path(str(filename or "")).suffix.lower() normalized_content_type = cls._normalized_content_type(content_type) if suffix == ".pdf" or normalized_content_type == "application/pdf": return "pdf" @@ -977,27 +968,27 @@ class PageIndexFileSystem: self, *, storage_uri: str, - source_path: str, + title: str, content_type: str, ) -> tuple[str | None, str, dict[str, Any] | None]: - if self._source_format(source_path, content_type) not in {"pdf", "markdown"}: + if self._content_format(title, content_type) not in {"pdf", "markdown"}: return None, "not_built", None client = self._pageindex_client() - source = self._canonical_source_path(storage_uri=storage_uri, source_path=source_path) - cached_doc_id = self._find_cached_pageindex_doc_id(client, source) + local_path = self._canonical_storage_uri_path(storage_uri) + cached_doc_id = self._find_cached_pageindex_doc_id(client, local_path) if cached_doc_id: return cached_doc_id, "built", None - if source is None: + if local_path is None: return None, "failed", self._pageindex_tree_failure_record( source="PageIndexFileSystem.registration", - error_type="UnresolvableSourcePath", + error_type="UnresolvableStorageUri", message=( - "PageIndex source path must resolve to a local file path for " + "storage_uri must resolve to a local file path for " "PDF/Markdown registration." ), ) try: - doc_id = client.index(source) + doc_id = client.index(local_path) return doc_id, "built", None except Exception as exc: return None, "failed", self._pageindex_tree_failure_record( @@ -1024,25 +1015,41 @@ class PageIndexFileSystem: def _find_cached_pageindex_doc_id( self, client: PageIndexClient, - source_path: str | None, + local_path: str | None, ) -> str | None: - if source_path is None: + if local_path is None: return None for doc_id, doc in client.documents.items(): - if self._canonical_path(doc.get("path")) == source_path: + if self._canonical_path(doc.get("path")) == local_path: return doc_id return None - def _canonical_source_path(self, *, storage_uri: str, source_path: str) -> str | None: + def _canonical_storage_uri_path(self, storage_uri: str) -> str | None: parsed = urlparse(storage_uri) if parsed.scheme == "file": return self._canonical_path(unquote(parsed.path)) if storage_uri and not parsed.scheme: return self._canonical_path(storage_uri) - if Path(source_path).expanduser().is_absolute(): - return self._canonical_path(source_path) return None + @staticmethod + def _title_from_storage_uri(storage_uri: str) -> str: + parsed = urlparse(str(storage_uri or "")) + path = unquote(parsed.path) if parsed.scheme else str(storage_uri or "") + return Path(path).name + + @classmethod + def _infer_content_type(cls, *, title: str, storage_uri: str) -> str: + for filename in (title, cls._title_from_storage_uri(storage_uri)): + suffix = Path(str(filename or "")).suffix.lower() + if suffix == ".pdf": + return "application/pdf" + if suffix in PAGEINDEX_DOCUMENT_SUFFIXES: + return "text/markdown" + if suffix in TEXT_ARTIFACT_SUFFIXES: + return "text/plain" + return "text/plain" + @staticmethod def _canonical_path(path: Any) -> str | None: if not path: @@ -1124,12 +1131,12 @@ class PageIndexFileSystem: } def _add_file_content(self, path: Path, content_type: str) -> str: - if self._source_format(str(path), content_type) in {"markdown", "text"}: + if self._content_format(path.name, content_type) in {"markdown", "text"}: return path.read_text(encoding="utf-8") return "" def _require_add_pageindex_ready(self, record: dict[str, Any]) -> None: - if self._source_format(record["source_path"], record["content_type"]) not in { + if self._content_format(record["title"], record["content_type"]) not in { "pdf", "markdown", }: @@ -1178,33 +1185,47 @@ class PageIndexFileSystem: def _prepare_file_record(self, file: dict[str, Any]) -> dict[str, Any]: storage_uri = file["storage_uri"] - raw_source_path = str(file["source_path"]) - source_path = raw_source_path.strip("/") metadata = file.get("metadata") or {} if not isinstance(metadata, dict): raise ValueError("metadata must be a JSON object") self._validate_register_metadata(metadata) external_id = file.get("external_id") content = file.get("content") or "" - content_type = file.get("content_type") or "text/plain" + folder_path = normalize_path(file.get("folder_path") or "/") + title = str( + file.get("title") + or metadata.get("title") + or self._title_from_storage_uri(storage_uri) + or external_id + or "" + ).strip() + if not title: + raise ValueError("file title is required") + content_type = file.get("content_type") or self._infer_content_type( + title=title, + storage_uri=storage_uri, + ) + file_ref = make_file_ref( + str(external_id or self._join_virtual_file_path(folder_path, title).strip("/")) + ) ( pageindex_doc_id, pageindex_tree_status, pageindex_tree_failure, ) = self._registration_pageindex_pointer( storage_uri=storage_uri, - source_path=raw_source_path, + title=title, content_type=content_type, ) artifact_content = self._registration_text_artifact_content( - source_path=raw_source_path, + title=title, content_type=content_type, pageindex_doc_id=pageindex_doc_id, pageindex_tree_status=pageindex_tree_status, fallback_content=content, ) fts_content = file.get("fts_content", artifact_content) - source_type = file.get("source_type") or self._infer_source_type(source_path) + source_type = file.get("source_type") metadata_policy = self._normalize_metadata_policy( file.get("metadata_policy"), metadata=metadata, @@ -1217,9 +1238,6 @@ class PageIndexFileSystem: self._attach_pageindex_tree_failure(metadata_status, pageindex_tree_failure) indexed_metadata = SQLiteFileSystemStore.indexed_metadata_values(metadata) searchable_metadata = dict(metadata) - folder_path = normalize_path(file.get("folder_path") or "/") - title = file.get("title") or metadata.get("title") or Path(source_path).stem - file_ref = make_file_ref(external_id or source_path) text_artifact_path = file.get("text_artifact_path") owns_text_artifact = text_artifact_path is None if text_artifact_path is None: @@ -1234,7 +1252,6 @@ class PageIndexFileSystem: "file_ref": file_ref, "external_id": external_id, "storage_uri": storage_uri, - "source_path": source_path, "title": title, "descriptor": descriptor, "content_type": content_type, @@ -1260,13 +1277,13 @@ class PageIndexFileSystem: def _registration_text_artifact_content( self, *, - source_path: str, + title: str, content_type: str, pageindex_doc_id: str | None, pageindex_tree_status: str, fallback_content: str, ) -> str: - if self._source_format(source_path, content_type) not in {"pdf", "markdown"}: + if self._content_format(title, content_type) not in {"pdf", "markdown"}: return fallback_content if pageindex_tree_status != "built" or not pageindex_doc_id: return fallback_content @@ -1296,15 +1313,11 @@ class PageIndexFileSystem: @staticmethod def _raw_artifact_payload( *, - storage_uri: str, - source_path: str, folder_path: str, metadata: dict[str, Any], metadata_status: dict[str, Any], ) -> dict[str, Any]: return { - "storage_uri": storage_uri, - "source_path": source_path, "folder_path": folder_path, "metadata": metadata, "metadata_status": metadata_status, @@ -1323,8 +1336,6 @@ class PageIndexFileSystem: self.store.write_raw_artifact( record["file_ref"], self._raw_artifact_payload( - storage_uri=record["storage_uri"], - source_path=record["source_path"], folder_path=record["folder_path"], metadata=record["metadata"], metadata_status=record["metadata_status"], @@ -1351,7 +1362,6 @@ class PageIndexFileSystem: "file_ref": entry.file_ref, "external_id": entry.external_id, "storage_uri": entry.storage_uri, - "source_path": entry.source_path, "title": entry.title, "descriptor": entry.descriptor, "content_type": entry.content_type, @@ -1394,7 +1404,6 @@ class PageIndexFileSystem: file_ref=record["file_ref"], external_id=record.get("external_id"), title=record["title"], - source_path=record["source_path"], content_type=record["content_type"], source_type=record.get("source_type"), text=Path(record["text_artifact_path"]).read_text(encoding="utf-8"), @@ -1638,7 +1647,6 @@ class PageIndexFileSystem: text=text, external_id=entry.external_id, folder_path=entry.folder_path, - source_path=entry.source_path, ) def _open_all(self, file_ref: str) -> OpenResult: @@ -1652,7 +1660,6 @@ class PageIndexFileSystem: text=text, external_id=entry.external_id, folder_path=entry.folder_path, - source_path=entry.source_path, ) @classmethod @@ -1671,7 +1678,6 @@ class PageIndexFileSystem: "mode": mode, "file_ref": entry.file_ref, "external_id": entry.external_id, - "source_path": entry.source_path, "status": entry.pageindex_tree_status, "available": False, "message": message, @@ -1744,19 +1750,30 @@ class PageIndexFileSystem: separators=(",", ":"), ) - def _stable_file_locator(self, file_ref: str, entry: Any) -> str: - source_path = str(getattr(entry, "source_path", "") or "").strip() - if source_path: - target = "/" + source_path.strip("/") - try: - if self.store.resolve_file_ref(target) == file_ref: - return target - except KeyError: - pass - external_id = str(getattr(entry, "external_id", "") or "").strip() - if external_id: - return external_id - return file_ref + def _stable_file_locator( + self, + file_ref: str, + entry: Any, + *, + folder_path: str | None = None, + ) -> str: + folder_path = normalize_path(folder_path or getattr(entry, "folder_path", None) or "/") + title = str(getattr(entry, "title", "") or "").strip() + if not title: + raise RuntimeError(f"browse cannot build a virtual path for {file_ref}: missing title") + target = self._join_virtual_file_path(folder_path, title.strip("/")) + try: + resolved_file_ref = self.store.resolve_file_ref(target) + except KeyError as exc: + raise RuntimeError( + f"browse produced an unresolved virtual path for {file_ref}: {target}" + ) from exc + if resolved_file_ref != file_ref: + raise RuntimeError( + "browse produced a non-idempotent virtual path: " + f"{target} resolved to {resolved_file_ref}, expected {file_ref}" + ) + return target @staticmethod def _build_descriptor(title: str, metadata: dict[str, Any]) -> str: @@ -2011,11 +2028,6 @@ class PageIndexFileSystem: return "number" return "string" - @staticmethod - def _infer_source_type(source_path: str) -> Optional[str]: - parts = [part for part in Path(source_path).parts if part not in ("", ".")] - return parts[0] if parts else None - @staticmethod def _scope_folder_path(scope: Optional[dict[str, Any]]) -> Optional[str]: if not scope: diff --git a/pageindex/filesystem/metadata_generation.py b/pageindex/filesystem/metadata_generation.py index 86b2ac6..1057c37 100644 --- a/pageindex/filesystem/metadata_generation.py +++ b/pageindex/filesystem/metadata_generation.py @@ -18,7 +18,6 @@ class MetadataGenerationInput: file_ref: str external_id: str | None title: str - source_path: str content_type: str source_type: str | None text: str diff --git a/pageindex/filesystem/semantic_index.py b/pageindex/filesystem/semantic_index.py index 5b3e393..cc01d82 100644 --- a/pageindex/filesystem/semantic_index.py +++ b/pageindex/filesystem/semantic_index.py @@ -21,7 +21,6 @@ class SemanticIndexRecord: text: str external_id: str | None = None source_type: str = "" - source_path: str = "" title: str = "" metadata: dict[str, Any] | None = None @@ -32,7 +31,6 @@ class SemanticSearchResult: distance: float external_id: str | None source_type: str - source_path: str title: str text_hash: str metadata: dict[str, Any] @@ -88,7 +86,6 @@ class SQLiteVecSemanticIndex: file_ref TEXT NOT NULL UNIQUE, external_id TEXT, source_type TEXT NOT NULL DEFAULT '', - source_path TEXT NOT NULL DEFAULT '', title TEXT NOT NULL DEFAULT '', text_hash TEXT NOT NULL, text_chars INTEGER NOT NULL DEFAULT 0, @@ -215,7 +212,6 @@ class SQLiteVecSemanticIndex: d.file_ref, d.external_id, d.source_type, - d.source_path, d.title, d.text_hash, d.metadata_json, @@ -245,7 +241,6 @@ class SQLiteVecSemanticIndex: d.file_ref, d.external_id, d.source_type, - d.source_path, d.title, d.text_hash, d.metadata_json, @@ -269,7 +264,6 @@ class SQLiteVecSemanticIndex: distance=float(row["distance"]), external_id=row["external_id"], source_type=row["source_type"], - source_path=row["source_path"], title=row["title"], text_hash=row["text_hash"], metadata=metadata, @@ -361,15 +355,14 @@ class SQLiteVecSemanticIndex: cursor = conn.execute( """ INSERT INTO semantic_index_docs( - file_ref, external_id, source_type, source_path, title, + file_ref, external_id, source_type, title, text_hash, text_chars, metadata_json - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?) + ) VALUES (?, ?, ?, ?, ?, ?, ?) """, ( record.file_ref, record.external_id, record.source_type, - record.source_path, record.title, text_hash, len(record.text), @@ -381,10 +374,9 @@ class SQLiteVecSemanticIndex: conn.execute( """ UPDATE semantic_index_docs - SET external_id = ?, - source_type = ?, - source_path = ?, - title = ?, + SET external_id = ?, + source_type = ?, + title = ?, text_hash = ?, text_chars = ?, metadata_json = ?, @@ -394,7 +386,6 @@ class SQLiteVecSemanticIndex: ( record.external_id, record.source_type, - record.source_path, record.title, text_hash, len(record.text), diff --git a/pageindex/filesystem/semantic_projection.py b/pageindex/filesystem/semantic_projection.py index 6059a9b..d3c8872 100644 --- a/pageindex/filesystem/semantic_projection.py +++ b/pageindex/filesystem/semantic_projection.py @@ -39,7 +39,6 @@ class SemanticProjectionCandidate: score: float sources: list[dict[str, Any]] source_type: str - source_path: str title: str metadata: dict[str, Any] snippet: str @@ -261,7 +260,6 @@ class SummaryProjectionIndexer: text=summary, external_id=record.get("external_id"), source_type=str(record.get("source_type") or ""), - source_path=str(record.get("source_path") or ""), title=str(record.get("title") or ""), metadata=metadata, ) @@ -493,7 +491,6 @@ def rank_single_semantic_channel( score=1 / (60 + rank), sources=[{"channel": channel, "rank": rank, "distance": result.distance}], source_type=result.source_type, - source_path=result.source_path, title=result.title, metadata=result.metadata, snippet=f"{channel}_vector rank={rank}", diff --git a/pageindex/filesystem/store.py b/pageindex/filesystem/store.py index 10e1e7a..b1754da 100644 --- a/pageindex/filesystem/store.py +++ b/pageindex/filesystem/store.py @@ -43,7 +43,6 @@ class SQLiteFileSystemStore: file_ref TEXT PRIMARY KEY, external_id TEXT, storage_uri TEXT NOT NULL, - source_path TEXT NOT NULL, title TEXT NOT NULL, descriptor TEXT NOT NULL, content_type TEXT NOT NULL, @@ -124,7 +123,6 @@ class SQLiteFileSystemStore: USING fts5(file_ref UNINDEXED, title, body, metadata_text); CREATE INDEX IF NOT EXISTS idx_files_external_id ON files(external_id); - CREATE INDEX IF NOT EXISTS idx_files_source_path ON files(source_path); CREATE INDEX IF NOT EXISTS idx_files_source_type ON files(source_type); CREATE INDEX IF NOT EXISTS idx_folders_path ON folders(path); CREATE INDEX IF NOT EXISTS idx_folders_parent_id ON folders(parent_id); @@ -168,6 +166,7 @@ class SQLiteFileSystemStore: fts_file_ref_rows = [] fts_rows = [] metadata_rows = [] + pending_folder_titles: dict[tuple[str, str], str] = {} metadata_field_ids = { row["name"]: row["field_id"] for row in conn.execute( @@ -184,6 +183,18 @@ class SQLiteFileSystemStore: kind=record.get("folder_kind", "manual"), ) folder_cache[folder_cache_key] = folder_id + self._ensure_title_available_in_folder( + conn, + folder_id=folder_id, + file_ref=record["file_ref"], + title=record["title"], + ) + title_key = (folder_id, str(record["title"])) + existing_file_ref = pending_folder_titles.get(title_key) + if existing_file_ref and existing_file_ref != record["file_ref"]: + target = self._virtual_file_target(conn, folder_id, str(record["title"])) + raise FileExistsError(f"File already exists at {target}") + pending_folder_titles[title_key] = record["file_ref"] file_rows.append(self._file_insert_values(record)) membership_rows.append( ( @@ -244,7 +255,6 @@ class SQLiteFileSystemStore: "file_ref", "external_id", "storage_uri", - "source_path", "title", "descriptor", "content_type", @@ -270,7 +280,6 @@ class SQLiteFileSystemStore: record["file_ref"], record["external_id"], record["storage_uri"], - record["source_path"], record["title"], record["descriptor"], record["content_type"], @@ -338,6 +347,12 @@ class SQLiteFileSystemStore: with self.connect() as conn: resolved_file_ref = self._resolve_file_ref(conn, file_ref) folder_id = self._resolve_or_create_folder(conn, folder_path_or_id) + self._ensure_title_available_in_folder( + conn, + folder_id=folder_id, + file_ref=resolved_file_ref, + title=self._file_title(conn, resolved_file_ref), + ) conn.execute( """ INSERT INTO file_folders(file_ref, folder_id, metadata_json) @@ -357,6 +372,12 @@ class SQLiteFileSystemStore: for item in items: resolved_file_ref = self._resolve_file_ref(conn, item["file_ref"]) folder_id = self._resolve_or_create_folder(conn, item["folder"]) + self._ensure_title_available_in_folder( + conn, + folder_id=folder_id, + file_ref=resolved_file_ref, + title=self._file_title(conn, resolved_file_ref), + ) conn.execute( """ INSERT INTO file_folders(file_ref, folder_id, metadata_json) @@ -371,6 +392,56 @@ class SQLiteFileSystemStore: ), ) + def _ensure_title_available_in_folder( + self, + conn: sqlite3.Connection, + *, + folder_id: str, + file_ref: str, + title: str, + ) -> None: + row = conn.execute( + """ + SELECT f.file_ref, fo.path + FROM files f + JOIN file_folders ff ON ff.file_ref = f.file_ref + JOIN folders fo ON fo.folder_id = ff.folder_id + WHERE f.deleted_at IS NULL + AND ff.folder_id = ? + AND f.title = ? + AND f.file_ref != ? + LIMIT 1 + """, + (folder_id, title, file_ref), + ).fetchone() + if row: + raise FileExistsError( + f"File already exists at {self._virtual_file_target(conn, folder_id, title)}" + ) + + @staticmethod + def _virtual_file_target( + conn: sqlite3.Connection, + folder_id: str, + title: str, + ) -> str: + row = conn.execute( + "SELECT path FROM folders WHERE folder_id = ?", + (folder_id,), + ).fetchone() + folder_path = normalize_path(row["path"] if row else "/") + return f"/{title}" if folder_path == "/" else f"{folder_path}/{title}" + + @staticmethod + def _file_title(conn: sqlite3.Connection, file_ref: str) -> str: + row = conn.execute( + "SELECT title FROM files WHERE file_ref = ? AND deleted_at IS NULL", + (file_ref,), + ).fetchone() + if row is None: + raise KeyError(f"Unknown file target: {file_ref}") + return str(row["title"]) + def replace_metadata_values( self, conn: sqlite3.Connection, @@ -791,7 +862,6 @@ class SQLiteFileSystemStore: selects = [ "f.file_ref", "f.external_id", - "f.source_path", "f.title", "f.descriptor", "f.pageindex_tree_status", @@ -984,7 +1054,6 @@ class SQLiteFileSystemStore: f.file_ref, f.external_id, f.storage_uri, - f.source_path, f.title, f.descriptor, f.content_type, @@ -1125,30 +1194,6 @@ class SQLiteFileSystemStore: ).fetchone() if row: return row["file_ref"] - stripped = target.strip("/") - rows = conn.execute( - """ - SELECT - f.file_ref, - f.external_id, - f.title, - f.source_path, - COALESCE(MIN(fo.path), '/') AS folder_path - FROM files f - LEFT JOIN file_folders ff ON ff.file_ref = f.file_ref - LEFT JOIN folders fo ON fo.folder_id = ff.folder_id - WHERE f.source_path = ? AND f.deleted_at IS NULL - GROUP BY f.file_ref, f.external_id, f.title, f.source_path - ORDER BY f.file_ref - LIMIT 2 - """, - (stripped,), - ).fetchall() - if len(rows) > 1: - matches = "; ".join(self._virtual_match_summary(row) for row in rows) - raise KeyError(f"Ambiguous file target: {target}. Matches: {matches}") - if rows: - return rows[0]["file_ref"] virtual_file_ref = self._resolve_virtual_file_ref(conn, target) if virtual_file_ref: return virtual_file_ref @@ -1163,12 +1208,9 @@ class SQLiteFileSystemStore: f.file_ref, f.external_id, f.title, - f.source_path, pf.path AS folder_path, (CASE WHEN pf.path = '/' THEN '/' ELSE pf.path || '/' END) - || ltrim(f.title, '/') AS title_virtual_path, - (CASE WHEN pf.path = '/' THEN '/' ELSE pf.path || '/' END) - || ltrim(f.source_path, '/') AS source_virtual_path + || ltrim(f.title, '/') AS title_virtual_path FROM files f JOIN file_folders ff ON ff.file_ref = f.file_ref JOIN folders pf ON pf.folder_id = ff.folder_id @@ -1178,16 +1220,14 @@ class SQLiteFileSystemStore: file_ref, external_id, title, - source_path, MIN(folder_path) AS folder_path FROM virtual_matches WHERE title_virtual_path = ? - OR source_virtual_path = ? - GROUP BY file_ref, external_id, title, source_path + GROUP BY file_ref, external_id, title ORDER BY file_ref LIMIT 2 """, - (virtual_target, virtual_target), + (virtual_target,), ).fetchall() if not rows: return None @@ -1201,8 +1241,7 @@ class SQLiteFileSystemStore: external_id = row["external_id"] or "-" return ( f"file_ref={row['file_ref']} external_id={external_id} " - f"folder={row['folder_path']} title={row['title']!r} " - f"source_path={row['source_path']!r}" + f"folder={row['folder_path']} title={row['title']!r}" ) def ensure_folder( @@ -1475,18 +1514,12 @@ class SQLiteFileSystemStore: JOIN folders fo ON fo.folder_id = ff.folder_id WHERE f.deleted_at IS NULL AND fo.path = ? - AND ( - f.title = ? - OR f.source_path = ? - OR f.source_path LIKE ? ESCAPE '\\' - ) + AND f.title = ? LIMIT 1 """, ( path, basename, - basename, - "%/" + self._like_escape(basename), ), ).fetchone() return row is not None @@ -1548,7 +1581,6 @@ class SQLiteFileSystemStore: f.file_ref, f.external_id, f.storage_uri, - f.source_path, f.title, f.descriptor, f.content_type, @@ -1592,7 +1624,6 @@ class SQLiteFileSystemStore: f.external_id, f.title, f.descriptor, - f.source_path, f.pageindex_tree_status, f.metadata_json, f.metadata_status_json, @@ -1804,7 +1835,6 @@ class SQLiteFileSystemStore: "pageNum": None, "createdAt": cls._row_value(row, "created_at"), "folderId": cls._row_value(row, "folder_id"), - "source_path": row["source_path"], "folder_path": row["folder_path"], "metadata": json.loads(row["metadata_json"] or "{}"), "metadata_status": json.loads( @@ -1827,7 +1857,6 @@ class SQLiteFileSystemStore: "pageNum": None, "createdAt": cls._row_value(row, "created_at"), "folderId": cls._row_value(row, "folder_id"), - "source_path": row["source_path"], "snippet": row["snippet"] or row["title"], "folder_path": row["folder_path"], "metadata": json.loads(row["metadata_json"] or "{}"), @@ -1846,7 +1875,6 @@ class SQLiteFileSystemStore: file_ref=row["file_ref"], external_id=row["external_id"], storage_uri=row["storage_uri"], - source_path=row["source_path"], title=row["title"], descriptor=row["descriptor"], content_type=row["content_type"], @@ -1871,8 +1899,7 @@ class SQLiteFileSystemStore: "document_id": entry.external_id, "external_id": entry.external_id, "name": entry.title, - "storage_uri": entry.storage_uri, - "source_path": entry.source_path, + "path": cls._virtual_file_path(entry.folder_path, entry.title), "title": entry.title, "description": entry.descriptor, "status": entry.pageindex_tree_status, @@ -1881,8 +1908,6 @@ class SQLiteFileSystemStore: "content_type": entry.content_type, "source_type": entry.source_type, "fingerprint": entry.fingerprint, - "text_artifact_path": entry.text_artifact_path, - "raw_artifact_path": entry.raw_artifact_path, "pageindex_doc_id": entry.pageindex_doc_id, "pageindex_tree_status": entry.pageindex_tree_status, "metadata": entry.metadata, @@ -1890,6 +1915,11 @@ class SQLiteFileSystemStore: "folder_path": entry.folder_path, } + @staticmethod + def _virtual_file_path(folder_path: str, title: str) -> str: + folder_path = normalize_path(folder_path) + return f"/{title}" if folder_path == "/" else f"{folder_path}/{title}" + @staticmethod def _query_text(query: str | list[str] | None) -> str: if query is None: diff --git a/pageindex/filesystem/types.py b/pageindex/filesystem/types.py index 103d28d..b65c3b0 100644 --- a/pageindex/filesystem/types.py +++ b/pageindex/filesystem/types.py @@ -13,7 +13,6 @@ class SearchResult: folder_path: str folder_paths: list[str] metadata: dict[str, Any] - source_path: str = "" id: Optional[str] = None document_id: Optional[str] = None name: str = "" @@ -33,7 +32,6 @@ class OpenResult: text: str external_id: Optional[str] = None folder_path: str = "" - source_path: str = "" @dataclass(frozen=True) @@ -50,7 +48,6 @@ class FileEntry: file_ref: str external_id: Optional[str] storage_uri: str - source_path: str title: str descriptor: str content_type: str diff --git a/tests/test_filesystem_store.py b/tests/test_filesystem_store.py index 7f42503..ed9ef38 100644 --- a/tests/test_filesystem_store.py +++ b/tests/test_filesystem_store.py @@ -21,7 +21,6 @@ def test_insert_files_does_not_disable_sqlite_synchronous(tmp_path): "file_ref": "ref_report", "external_id": "doc_report", "storage_uri": "file:///tmp/report.pdf", - "source_path": "documents/report.pdf", "folder_path": "/documents", "title": "Report", "descriptor": "documents/report.pdf", diff --git a/tests/test_metadata_generation.py b/tests/test_metadata_generation.py index 3e64a4b..1f1aec6 100644 --- a/tests/test_metadata_generation.py +++ b/tests/test_metadata_generation.py @@ -20,7 +20,6 @@ def test_metadata_generator_uses_provider_parameter(): file_ref="file_a", external_id="doc_a", title="A", - source_path="docs/a.txt", content_type="text/plain", source_type=None, text="hello", diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py index b74cc79..63d5b1b 100644 --- a/tests/test_pageindex_filesystem_scope.py +++ b/tests/test_pageindex_filesystem_scope.py @@ -135,7 +135,6 @@ def _register_browse_file( filesystem.metadata_generator = SummaryGenerator() return filesystem.register_file( storage_uri=f"file:///tmp/{external_id}.txt", - source_path=f"documents/{external_id}.txt", folder_path=folder_path, external_id=external_id, title=f"{external_id}.txt", @@ -427,7 +426,7 @@ def test_browse_shell_output_uses_fixed_blocks_with_pagination_command(tmp_path) assert "score:" not in rendered -def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp_path): +def test_browse_shell_path_uses_virtual_locator_when_source_collides(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem.metadata_generation import MetadataGenerationResult @@ -443,7 +442,6 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp ) first_ref = filesystem.register_file( storage_uri="file:///tmp/first.json", - source_path="shared/source.json", folder_path="/documents", external_id="dsid_first", title="First", @@ -459,7 +457,6 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp ) filesystem.register_file( storage_uri="file:///tmp/second.json", - source_path="shared/source.json", folder_path="/documents", external_id="dsid_second", title="Second", @@ -478,13 +475,52 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp rendered = executor.execute('browse /documents "first"') - assert "path: dsid_first" in rendered + assert "path: /documents/First" in rendered assert "path: /shared/source.json" not in rendered - assert filesystem.store.resolve_file_ref("dsid_first") == first_ref - with pytest.raises(KeyError, match="Ambiguous file target"): + assert filesystem.store.resolve_file_ref("/documents/First") == first_ref + with pytest.raises(KeyError, match="Unknown file target"): filesystem.store.resolve_file_ref("/shared/source.json") +def test_browse_shell_path_never_returns_storage_uri_path(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + + class SummaryGenerator: + def generate(self, document, *, fields): + return MetadataGenerationResult( + values={"summary": "summary for physical source report"} + ) + + filesystem = PageIndexFileSystem( + workspace=tmp_path / "workspace", + metadata_generator=SummaryGenerator(), + ) + file_ref = filesystem.register_file( + storage_uri="file:///Users/chengjie/Downloads/source/report.pdf", + folder_path="/documents/reports", + external_id="dsid_report", + title="report.pdf", + content="physical source report content", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + filesystem.semantic_retrieval_backend = BrowseBackend(["dsid_report"]) + executor = PIFSCommandExecutor(filesystem) + + rendered = executor.execute('browse /documents/reports "physical source"') + + assert "path: /documents/reports/report.pdf" in rendered + assert "/Users/chengjie/Downloads" not in rendered + assert filesystem.store.resolve_file_ref("/documents/reports/report.pdf") == file_ref + + def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem.metadata_generation import MetadataGenerationResult @@ -501,7 +537,6 @@ def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path ) file_ref = filesystem.register_file( storage_uri="file:///tmp/report.pdf", - source_path="examples/documents/report.pdf", folder_path="/documents", external_id="dsid_report", title="report.pdf", @@ -525,14 +560,13 @@ def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path ) assert "source_type" not in backend.calls[0][2] - assert "source_path" not in backend.calls[0][2] - assert result["data"]["data"][0]["path"] == "/examples/documents/report.pdf" + assert result["data"]["data"][0]["path"] == "/documents/report.pdf" assert result["data"]["data"][0]["summary"] == "Federal Reserve annual report summary" assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref -def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path): - from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem +def test_register_file_rejects_duplicate_title_in_folder(tmp_path): + from pageindex.filesystem import PageIndexFileSystem from pageindex.filesystem.metadata_generation import MetadataGenerationResult class SummaryGenerator: @@ -545,9 +579,8 @@ def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path): workspace=tmp_path / "workspace", metadata_generator=SummaryGenerator(), ) - first_ref = filesystem.register_file( + filesystem.register_file( storage_uri="file:///tmp/first.json", - source_path="slack/dsid_first.json", folder_path="/documents", external_id="dsid_first", title="announcements", @@ -561,34 +594,25 @@ def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path): } }, ) - filesystem.register_file( - storage_uri="file:///tmp/second.json", - source_path="slack/dsid_second.json", - folder_path="/documents", - external_id="dsid_second", - title="announcements", - content="second announcement mentions unrelated maintenance.", - metadata_policy={ - "fields": { - "summary": True, - "doc_type": False, - "domain": False, - "topic": False, - } - }, - ) - filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first") - executor = PIFSCommandExecutor(filesystem, json_output=True) - - result = json.loads(executor.execute('browse /documents "H200 reservations"')) - - assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json" - assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref - with pytest.raises(KeyError, match="Ambiguous file target"): - filesystem.store.resolve_file_ref("/documents/announcements") + with pytest.raises(FileExistsError, match="File already exists at /documents/announcements"): + filesystem.register_file( + storage_uri="file:///tmp/second.json", + folder_path="/documents", + external_id="dsid_second", + title="announcements", + content="second announcement mentions unrelated maintenance.", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) -def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path): +def test_browse_path_uses_virtual_title_when_storage_paths_are_unrelated(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem.metadata_generation import MetadataGenerationResult @@ -604,7 +628,6 @@ def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path): ) first_ref = filesystem.register_file( storage_uri="file:///tmp/first.json", - source_path="shared/source.json", folder_path="/documents", external_id="dsid_first", title="First", @@ -620,7 +643,6 @@ def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path): ) filesystem.register_file( storage_uri="file:///tmp/second.json", - source_path="shared/source.json", folder_path="/documents", external_id="dsid_second", title="Second", @@ -639,7 +661,7 @@ def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path): result = json.loads(executor.execute('browse /documents "first"')) - assert result["data"]["data"][0]["path"] == "dsid_first" + assert result["data"]["data"][0]["path"] == "/documents/First" assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref @@ -663,7 +685,6 @@ def test_old_semantic_commands_are_unsupported_even_when_indexes_exist(tmp_path) ) filesystem.register_file( storage_uri="file:///tmp/market-note.pdf", - source_path="examples/documents/market-note.pdf", folder_path="/documents", external_id="dsid_market_note", title="market-note.pdf", @@ -695,13 +716,13 @@ def test_old_semantic_commands_are_unsupported_even_when_indexes_exist(tmp_path) executor.execute('browse /documents "Federal Reserve" --space entity') ) assert entity["data"]["data"][0]["summary"] == "Risk and compliance summary" - assert entity["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf" + assert entity["data"]["data"][0]["path"] == "/documents/market-note.pdf" relation = json.loads( executor.execute('browse /documents "Disney valuation" --space relation') ) assert relation["data"]["data"][0]["summary"] == "Risk and compliance summary" - assert relation["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf" + assert relation["data"]["data"][0]["path"] == "/documents/market-note.pdf" def test_find_name_is_lexical_and_find_relation_is_not_semantic_alias(tmp_path): @@ -711,7 +732,6 @@ def test_find_name_is_lexical_and_find_relation_is_not_semantic_alias(tmp_path): filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") filesystem.register_file( storage_uri="file:///tmp/report.pdf", - source_path="examples/documents/report.pdf", folder_path="/documents", external_id="dsid_report", title="Annual report", @@ -755,7 +775,7 @@ def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_ assert "semantic-grep" not in rendered -def test_grep_source_file_requires_terms_on_same_line(tmp_path): +def test_grep_file_requires_terms_on_same_line(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem source_dir = tmp_path / "source" / "documents" @@ -769,11 +789,10 @@ def test_grep_source_file_requires_terms_on_same_line(tmp_path): filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") filesystem.register_file( storage_uri=str(source), - source_path="documents/split.json", folder_path="/documents", external_id="doc_split_terms", title="Split source terms", - content="registered artifact without the searched tokens", + content=source.read_text(encoding="utf-8"), ) executor = PIFSCommandExecutor(filesystem, json_output=True) @@ -813,7 +832,6 @@ def test_existing_summary_projection_index_uses_current_config_when_dimensions_m file_ref="file_a", external_id="doc_a", source_type="documents", - source_path="documents/a.pdf", title="A", text="summary", vector=[1.0, 0.0, 0.0], @@ -879,7 +897,6 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval( file_ref="file_a", external_id="doc_a", source_type="documents", - source_path="documents/a.pdf", title="A", text="summary", vector=[1.0, 0.0, 0.0], @@ -948,7 +965,6 @@ def test_browse_semantic_files_uses_summary_projection_when_only_summary_availab ) filesystem.register_file( storage_uri=source.as_uri(), - source_path="docs/source.txt", folder_path="/documents", external_id="doc_summary_only", title="Operations note", diff --git a/tests/test_pageindex_structural_read.py b/tests/test_pageindex_structural_read.py index 2f27077..29e7c9a 100644 --- a/tests/test_pageindex_structural_read.py +++ b/tests/test_pageindex_structural_read.py @@ -60,7 +60,6 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch): monkeypatch.setattr(PageIndexClient, "index", fail_index) filesystem.register_file( storage_uri=source.as_uri(), - source_path="docs/report.md", external_id="dsid_structural_missing", title="Structural report", content=source.read_text(encoding="utf-8"), @@ -152,14 +151,12 @@ def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_ft filesystem.register_file( storage_uri=source_pdf.as_uri(), - source_path="docs/report.pdf", external_id="dsid_pdf_extracted", title="PDF extracted", content="CALLER PDF CONTENT MUST NOT REACH GENERATOR", ) filesystem.register_file( storage_uri=source_md.as_uri(), - source_path="docs/notes.md", external_id="dsid_md_extracted", title="Markdown extracted", content="CALLER MD CONTENT MUST NOT REACH GENERATOR", @@ -167,8 +164,12 @@ def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_ft pdf_request = generator.calls[0][0] md_request = generator.calls[1][0] - pdf_stat = filesystem.store.file_info("dsid_pdf_extracted") - md_stat = filesystem.store.file_info("dsid_md_extracted") + pdf_entry = filesystem.store.get_file( + filesystem.store.resolve_file_ref("dsid_pdf_extracted") + ) + md_entry = filesystem.store.get_file( + filesystem.store.resolve_file_ref("dsid_md_extracted") + ) assert "PageIndex PDF extracted alpha text" in pdf_request.text assert "Second PageIndex PDF extracted beta text" in pdf_request.text @@ -176,10 +177,10 @@ def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_ft assert "PageIndex Markdown extracted gamma text" in md_request.text assert "CALLER MD CONTENT" not in md_request.text assert "PageIndex PDF extracted alpha text" in Path( - pdf_stat["text_artifact_path"] + pdf_entry.text_artifact_path ).read_text(encoding="utf-8") assert "PageIndex Markdown extracted gamma text" in Path( - md_stat["text_artifact_path"] + md_entry.text_artifact_path ).read_text(encoding="utf-8") assert [r.external_id for r in filesystem.search("alpha beta", limit=5)] == [ "dsid_pdf_extracted" @@ -207,7 +208,6 @@ def test_register_text_metadata_generation_keeps_caller_content_without_pageinde filesystem.register_file( storage_uri="file:///tmp/readme.txt", - source_path="docs/readme.txt", external_id="dsid_text_generation", title="Text generation", content="Plain text caller content stays authoritative.", @@ -215,11 +215,14 @@ def test_register_text_metadata_generation_keeps_caller_content_without_pageinde ) stat = filesystem.store.file_info("dsid_text_generation") + entry = filesystem.store.get_file( + filesystem.store.resolve_file_ref("dsid_text_generation") + ) assert generator.calls[0][0].text == "Plain text caller content stays authoritative." assert stat["pageindex_doc_id"] is None assert stat["pageindex_tree_status"] == "not_built" - assert Path(stat["text_artifact_path"]).read_text( + assert Path(entry.text_artifact_path).read_text( encoding="utf-8" ) == "Plain text caller content stays authoritative." @@ -261,14 +264,12 @@ def test_register_pdf_markdown_cache_miss_invokes_pageindex_client_index(monkeyp filesystem.register_file( storage_uri=str(source_pdf), - source_path="docs/report.pdf", external_id="dsid_pdf_build", title="PDF build", content="pdf text", ) filesystem.register_file( storage_uri=source_md.as_uri(), - source_path="docs/notes.md", external_id="dsid_md_build", title="Markdown build", content=source_md.read_text(encoding="utf-8"), @@ -332,7 +333,6 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke monkeypatch.setattr(PageIndexClient, "index", fail_index) filesystem.register_file( storage_uri=source.as_uri(), - source_path="docs/report.pdf", external_id="dsid_structural_cached", title="Cached structural report", content="text artifact remains available for grep, not cat --all", @@ -370,7 +370,6 @@ def test_cat_node_is_not_supported(): filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") filesystem.register_file( storage_uri="file:///tmp/notes.md", - source_path="docs/notes.md", external_id="dsid_md_cached", title="Cached markdown notes", content="# Notes\n\nBody", @@ -419,7 +418,6 @@ def test_cat_structure_page_and_text_outputs_are_hard_limited(): ) filesystem.register_file( storage_uri=source.as_uri(), - source_path="docs/report.pdf", external_id="dsid_limited_pdf", title="Limited structural report", content="text artifact remains available for grep", @@ -427,7 +425,6 @@ def test_cat_structure_page_and_text_outputs_are_hard_limited(): text_content = "\n".join(f"line {index}" for index in range(1, 106)) filesystem.register_file( storage_uri="file:///tmp/long.txt", - source_path="docs/long.txt", external_id="dsid_long_text", title="Long text", content=text_content, @@ -474,7 +471,6 @@ def test_tree_folder_behavior_is_preserved(): filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") filesystem.register_file( storage_uri="file:///tmp/report.txt", - source_path="docs/report.txt", folder_path="/docs/reports", external_id="dsid_folder_tree", title="Folder report", @@ -514,7 +510,6 @@ def test_tree_does_not_read_file_internal_pageindex_structure(): ) filesystem.register_file( storage_uri=source.as_uri(), - source_path="docs/report.pdf", external_id="dsid_tree_is_folder_only", title="Cached structural report", content="text artifact remains available", @@ -536,28 +531,24 @@ def test_cat_all_is_limited_to_text_files(): filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") filesystem.register_file( storage_uri="file:///tmp/readme.txt", - source_path="docs/readme.txt", external_id="dsid_text_file", title="Text readme", content="plain text body", ) filesystem.register_file( storage_uri="file:///tmp/report.pdf", - source_path="docs/report.pdf", external_id="dsid_pdf_file", title="PDF report", content="extracted text should not be served through cat --all", ) filesystem.register_file( storage_uri="file:///tmp/notes.md", - source_path="docs/notes.md", external_id="dsid_md_file", title="Markdown notes", content="markdown text should use PageIndex structure reads", ) filesystem.register_file( storage_uri="file:///tmp/data.json", - source_path="docs/data.json", external_id="dsid_json_file", title="JSON record", content='{"body":"json"}', @@ -589,7 +580,6 @@ def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown(): filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") filesystem.register_file( storage_uri="file:///tmp/readme.txt", - source_path="docs/readme.txt", external_id="dsid_text_only", title="Text readme", content="plain text body", @@ -617,7 +607,6 @@ def test_existing_pageindex_status_allows_legacy_record_without_format_suffix(): filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace") file_ref = filesystem.register_file( storage_uri=source.as_uri(), - source_path="uploads/uploaded", external_id="dsid_legacy_pageindex", title="Legacy PageIndex record", content="text/plain is only a weak default here", @@ -665,7 +654,6 @@ def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monke monkeypatch.setattr(PageIndexClient, "index", fail_index) filesystem.register_file( storage_uri=source.as_uri(), - source_path="docs/late.md", external_id="dsid_late_cache", title="Late cache", content=source.read_text(encoding="utf-8"), diff --git a/tests/test_pifs_add_command.py b/tests/test_pifs_add_command.py index 4161b80..47d5913 100644 --- a/tests/test_pifs_add_command.py +++ b/tests/test_pifs_add_command.py @@ -80,12 +80,13 @@ def test_add_text_folder_target_copies_artifact_indexes_summary_and_is_readable( info = filesystem.add_file(str(source), "/documents/reports") - assert info["source_path"] == "documents/reports/filing.txt" + assert info["path"] == "/documents/reports/filing.txt" assert info["folder_path"] == "/documents/reports" assert filesystem.folder_info("/documents/reports")["path"] == "/documents/reports" - assert info["storage_uri"] != source.as_uri() - assert "/artifacts/uploads/" in info["storage_uri"] - copied_path = Path(info["storage_uri"].removeprefix("file://")) + entry = filesystem.store.get_file(info["file_ref"]) + assert entry.storage_uri != source.as_uri() + assert "/artifacts/uploads/" in entry.storage_uri + copied_path = Path(entry.storage_uri.removeprefix("file://")) assert copied_path.read_text(encoding="utf-8") == "alpha filing text for pifs add" assert copied_path.resolve() != source.resolve() @@ -164,7 +165,7 @@ def test_add_configures_semantic_retrieval_in_same_filesystem_instance(tmp_path) recursive=True, page_size=5, ) - assert [item["source_path"] for item in results["data"]] == ["documents/semantic.txt"] + assert [item["path"] for item in results["data"]] == ["/documents/semantic.txt"] def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monkeypatch): @@ -205,10 +206,11 @@ def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monke info = filesystem.add_file(source, "/documents") executor = PIFSCommandExecutor(filesystem, json_output=True) structure = json.loads(executor.execute("cat /documents/notes.md --structure")) + entry = filesystem.store.get_file(info["file_ref"]) assert structure["data"]["available"] is True assert structure["data"]["structure"][0]["title"] == "Notes" - assert indexed_paths == [Path(info["storage_uri"].removeprefix("file://"))] + assert indexed_paths == [Path(entry.storage_uri.removeprefix("file://"))] assert indexed_paths[0].resolve() != source.resolve() @@ -469,8 +471,6 @@ def test_cli_add_uses_workspace_and_prints_added_file(monkeypatch, capsys, tmp_p return { "file_ref": "file_cli", "path": "/documents/cli.txt", - "source_path": "documents/cli.txt", - "storage_uri": "file:///workspace/artifacts/uploads/file_cli/cli.txt", } monkeypatch.setattr(cli, "PageIndexFileSystem", FakeAddFileSystem) @@ -482,5 +482,4 @@ def test_cli_add_uses_workspace_and_prints_added_file(monkeypatch, capsys, tmp_p assert capsys.readouterr().out == ( "added: /documents/cli.txt\n" "file_ref: file_cli\n" - "storage_uri: file:///workspace/artifacts/uploads/file_cli/cli.txt\n" ) diff --git a/tests/test_pifs_cli.py b/tests/test_pifs_cli.py index 405f5b9..67c3a3a 100644 --- a/tests/test_pifs_cli.py +++ b/tests/test_pifs_cli.py @@ -76,7 +76,6 @@ def test_cli_workspace_surfaces_projection_dimension_mismatch(tmp_path): file_ref="file_a", external_id="doc_a", source_type="documents", - source_path="documents/a.pdf", title="A", text="summary", vector=[1.0, 0.0, 0.0], @@ -226,6 +225,28 @@ def test_cli_ask_invokes_agent_with_question(monkeypatch, capsys, tmp_path): } +def test_cli_ask_defaults_to_global_agent_model(monkeypatch, capsys, tmp_path): + from pageindex.filesystem import cli + + workspace = tmp_path / "workspace" + agent_calls = [] + monkeypatch.delenv("PIFS_AGENT_MODEL", raising=False) + monkeypatch.delenv("PIFS_MODEL", raising=False) + + def fake_run_pifs_agent(filesystem, question, **kwargs): + agent_calls.append(kwargs) + return "agent answer" + + monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem) + monkeypatch.setattr(cli, "run_pifs_agent", fake_run_pifs_agent) + + status = cli.main(["ask", "--workspace", str(workspace), "What?"]) + + assert status == 0 + assert capsys.readouterr().out == "agent answer\n" + assert agent_calls[0]["model"] == "gpt-5.4" + + def test_cli_ask_loads_env_file_before_running_agent(monkeypatch, capsys, tmp_path): from pageindex.filesystem import cli diff --git a/tests/test_pifs_find_maxdepth.py b/tests/test_pifs_find_maxdepth.py index 8b93f70..7fbc445 100644 --- a/tests/test_pifs_find_maxdepth.py +++ b/tests/test_pifs_find_maxdepth.py @@ -24,7 +24,6 @@ def _register_find_fixture(tmp_path: Path): source.write_text(f"{title} fixture text", encoding="utf-8") filesystem.register_file( storage_uri=source.as_uri(), - source_path=f"docs/{filename}", folder_path=folder_path, external_id=external_id, title=title, @@ -145,7 +144,6 @@ def test_stat_shell_output_includes_unified_metadata_status(tmp_path): ) filesystem.register_file( storage_uri=source.as_uri(), - source_path="docs/source.txt", folder_path="/documents", external_id="doc_generated", title="Generated metadata document", @@ -196,7 +194,6 @@ def test_stat_field_reads_one_metadata_field_across_multiple_targets(tmp_path): source.write_text(f"fixture text {index}", encoding="utf-8") filesystem.register_file( storage_uri=source.as_uri(), - source_path=f"docs/source{index}.txt", folder_path="/documents", external_id=f"doc_summary_{index}", title=f"Summary document {index}", @@ -249,7 +246,6 @@ def test_stat_field_rejects_more_than_twenty_targets(tmp_path): source.write_text(f"fixture text {index}", encoding="utf-8") filesystem.register_file( storage_uri=source.as_uri(), - source_path=f"docs/source{index}.txt", folder_path="/documents", external_id=f"doc_{index}", title=f"Document {index}", @@ -273,7 +269,6 @@ def test_register_rejects_pifs_owned_metadata_fields(tmp_path): with pytest.raises(ValueError, match="PIFS-owned generated field"): filesystem.register_file( storage_uri=source.as_uri(), - source_path="docs/source.txt", folder_path="/documents", external_id="doc_conflict", title="Conflict document", @@ -299,7 +294,6 @@ def test_batch_metadata_status_generates_into_unified_metadata(tmp_path): ) file_ref = filesystem.register_file( storage_uri=source.as_uri(), - source_path="docs/source.txt", folder_path="/documents", external_id="doc_batch", title="Batch document", diff --git a/tests/test_pifs_like_escape.py b/tests/test_pifs_like_escape.py index 5c0751e..5b624be 100644 --- a/tests/test_pifs_like_escape.py +++ b/tests/test_pifs_like_escape.py @@ -14,7 +14,6 @@ def _register_file( source.write_text(f"{external_id} fixture text", encoding="utf-8") filesystem.register_file( storage_uri=source.as_uri(), - source_path=f"docs/{filename}", folder_path=folder_path, external_id=external_id, title=external_id, diff --git a/tests/test_pifs_path_resolution.py b/tests/test_pifs_path_resolution.py index 184fc53..77552b6 100644 --- a/tests/test_pifs_path_resolution.py +++ b/tests/test_pifs_path_resolution.py @@ -7,7 +7,6 @@ def test_root_virtual_file_path_resolves_without_double_slash(tmp_path): filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") file_ref = filesystem.register_file( storage_uri="file:///tmp/root-source.txt", - source_path="sources/root-source.txt", folder_path="/", external_id="doc_root_title", title="Root Title", @@ -17,13 +16,12 @@ def test_root_virtual_file_path_resolves_without_double_slash(tmp_path): assert filesystem.store.resolve_file_ref("/Root Title") == file_ref -def test_ambiguous_virtual_file_path_raises_clear_error(tmp_path): +def test_nested_virtual_file_path_resolves_by_folder_and_title(tmp_path): from pageindex.filesystem import PageIndexFileSystem filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") first_ref = filesystem.register_file( storage_uri="file:///tmp/first.txt", - source_path="b/file.txt", folder_path="/a", external_id="doc_first", title="First", @@ -31,26 +29,23 @@ def test_ambiguous_virtual_file_path_raises_clear_error(tmp_path): ) second_ref = filesystem.register_file( storage_uri="file:///tmp/second.txt", - source_path="second-source.txt", folder_path="/a/b", external_id="doc_second", title="file.txt", content="second content", ) - with pytest.raises(KeyError, match="Ambiguous file target"): - filesystem.store.resolve_file_ref("/a/b/file.txt") + assert filesystem.store.resolve_file_ref("/a/b/file.txt") == second_ref assert first_ref != second_ref -def test_duplicate_source_path_target_raises_clear_error(tmp_path): +def test_unknown_virtual_file_target_raises_clear_error(tmp_path): from pageindex.filesystem import PageIndexFileSystem filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") first_ref = filesystem.register_file( storage_uri="file:///tmp/first.txt", - source_path="shared/source.txt", folder_path="/first", external_id="doc_first", title="First", @@ -58,14 +53,13 @@ def test_duplicate_source_path_target_raises_clear_error(tmp_path): ) second_ref = filesystem.register_file( storage_uri="file:///tmp/second.txt", - source_path="shared/source.txt", folder_path="/second", external_id="doc_second", title="Second", content="second content", ) - with pytest.raises(KeyError, match="Ambiguous file target"): - filesystem.store.resolve_file_ref("/shared/source.txt") + with pytest.raises(KeyError, match="Unknown file target"): + filesystem.store.resolve_file_ref("/shared/missing.txt") assert first_ref != second_ref diff --git a/tests/test_pifs_register_side_effects.py b/tests/test_pifs_register_side_effects.py index 867dd6b..435ca7a 100644 --- a/tests/test_pifs_register_side_effects.py +++ b/tests/test_pifs_register_side_effects.py @@ -40,7 +40,6 @@ def test_register_insert_failure_cleans_owned_artifacts_and_skips_projection( with pytest.raises(RuntimeError, match="catalog insert failed"): filesystem.register_file( storage_uri=source.as_uri(), - source_path="docs/source.txt", folder_path="/documents", external_id="doc_insert_failure", title="Insert failure", diff --git a/tests/test_semantic_index.py b/tests/test_semantic_index.py index c1da0dc..4684d8f 100644 --- a/tests/test_semantic_index.py +++ b/tests/test_semantic_index.py @@ -31,7 +31,6 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path): file_ref="file_a", external_id="doc_a", source_type="github", - source_path="github/a.json", title="Multipart upload limits", text="multipart upload limits", vector=[1.0, 0.0, 0.0], @@ -41,7 +40,6 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path): file_ref="file_b", external_id="doc_b", source_type="slack", - source_path="slack/b.json", title="GPU cache issue", text="gpu cache issue", vector=[0.0, 1.0, 0.0], @@ -72,7 +70,6 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm file_ref=f"file_off_{item:02d}", external_id=f"doc_off_{item:02d}", source_type="documents", - source_path=f"other/{item:02d}.pdf", title=f"Off scope {item:02d}", text="off scope", vector=[1.0, 0.0], @@ -84,7 +81,6 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm file_ref="file_in_scope", external_id="doc_in_scope", source_type="documents", - source_path="documents/in-scope.pdf", title="In scope", text="in scope", vector=[0.0, 1.0], @@ -117,7 +113,6 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path): "file_ref": "file_a", "external_id": "doc_a", "source_type": "documents", - "source_path": "docs/a.pdf", "title": "A", "metadata": { "summary": "Unified metadata summary.", @@ -153,7 +148,6 @@ def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path): "file_ref": "file_a", "external_id": "doc_a", "source_type": "documents", - "source_path": "docs/a.pdf", "title": "A", "metadata": {"summary": "Default dimension summary."}, } @@ -180,7 +174,6 @@ def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path): "file_ref": "file_a", "external_id": "doc_a", "source_type": "documents", - "source_path": "docs/a.pdf", "title": "A", "metadata": {"summary": "Explicit 256 dimension summary."}, } @@ -304,7 +297,6 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path file_ref="file_a", external_id="doc_a", source_type="documents", - source_path="docs/a.pdf", title="A", text="summary", vector=[1.0, 0.0, 0.0],