refactor: remove source paths from PIFS

This commit is contained in:
BukeLy 2026-06-01 01:40:44 +08:00
parent b9e30952ad
commit dc4de3116f
22 changed files with 324 additions and 528 deletions

View file

@ -173,7 +173,7 @@ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
You can customize the processing with additional optional arguments:
```
--model LLM model to use (default: gpt-4o-2024-11-20)
--model LLM model to use (default: gpt-5.4)
--toc-check-pages Pages to check for table of contents (default: 20)
--max-pages-per-node Max pages per node (default: 10)
--max-tokens-per-node Max tokens per node (default: 20000)

View file

@ -48,7 +48,7 @@ from pageindex.filesystem.core import DEFAULT_EMBEDDING_DIMENSIONS
EXAMPLES_DIR = Path(__file__).parent
DOCUMENTS_DIR = EXAMPLES_DIR / "documents"
WORKSPACE = EXAMPLES_DIR / "pifs_workspace"
DEFAULT_MODEL = os.environ.get("PIFS_DEMO_MODEL", "gpt-5.4-mini")
DEFAULT_MODEL = os.environ.get("PIFS_DEMO_MODEL", "gpt-5.4")
DEFAULT_METADATA_PROVIDER = os.environ.get("PIFS_DEMO_METADATA_PROVIDER") or os.environ.get(
"PIFS_METADATA_PROVIDER", "openai"
)
@ -416,7 +416,6 @@ def register_documents(
register_started = time.perf_counter()
file_ref = filesystem.register(
storage_uri=document_path.as_uri(),
source_path=str(document_path),
folder_path="/documents",
external_id=external_id,
title=document_path.name,

View file

@ -1,4 +1,4 @@
model: "gpt-4o-2024-11-20"
model: "gpt-5.4"
# model: "anthropic/claude-sonnet-4-6"
retrieve_model: "gpt-5.4" # defaults to `model` if not set
toc_check_page_num: 20
@ -7,4 +7,4 @@ max_token_num_each_node: 20000
if_add_node_id: "yes"
if_add_node_summary: "yes"
if_add_doc_description: "no"
if_add_node_text: "no"
if_add_node_text: "no"

View file

@ -21,7 +21,7 @@ from .core import PageIndexFileSystem
AGENT_STREAM_MODE_CHOICES = ("off", "tools", "model", "all")
DEFAULT_AGENT_MODEL = "gpt-5.4-mini"
DEFAULT_AGENT_MODEL = "gpt-5.4"
EXIT_COMMANDS = {"exit", "quit", ":q"}
ANSI_ESCAPE_RE = re.compile(r"\x1b(?:\[[0-?]*[ -/]*[@-~]|.)")
PIFS_CONFIG_FILE_ENV = "PIFS_CONFIG_FILE"
@ -290,9 +290,8 @@ def _run_add(argv: list[str], *, workspace: str) -> int:
filesystem = _filesystem_from_workspace(workspace)
info = filesystem.add_file(args.physical_path, args.virtual_target)
print(f"added: {info.get('path') or '/' + str(info.get('source_path') or '').strip('/')}")
print(f"added: {info.get('path')}")
print(f"file_ref: {info['file_ref']}")
print(f"storage_uri: {info['storage_uri']}")
return 0

View file

@ -3,9 +3,7 @@ from __future__ import annotations
import json
import re
import shlex
import subprocess
from dataclasses import asdict, is_dataclass
from pathlib import Path
from typing import Any
from .core import SEMANTIC_RETRIEVAL_CHANNELS, PageIndexFileSystem
@ -411,35 +409,18 @@ class PIFSCommandExecutor:
"mode": "files",
"query": query,
"scope": normalized,
"data": self._grep_file_hits_from_results(direct_results, query),
"data": self._grep_file_hits_from_results(
direct_results,
query,
require_match=True,
),
}
if where is None:
direct_source_hits = self._grep_source_file_hits(
normalized,
query,
limit=limit,
direct_only=True,
)
if direct_source_hits:
return {
"mode": "files",
"query": query,
"scope": normalized,
"data": direct_source_hits,
}
ranked = self._rank_child_folders(
query=query,
children=children,
metadata_filter=where,
limit=limit,
)
if not ranked and where is None:
ranked = self._rank_child_folders_from_source(
query=query,
parent_path=normalized,
children=children,
limit=limit,
)
return {
"mode": "folders",
"query": query,
@ -453,19 +434,15 @@ class PIFSCommandExecutor:
metadata_filter=where,
limit=limit,
)
if not results and where is None:
source_hits = self._grep_source_file_hits(normalized, query, limit=limit)
return {
"mode": "files",
"query": query,
"scope": normalized,
"data": source_hits,
}
return {
"mode": "files",
"query": query,
"scope": normalized,
"data": self._grep_file_hits_from_results(results, query),
"data": self._grep_file_hits_from_results(
results,
query,
require_match=True,
),
}
return {
"mode": "matches",
@ -976,11 +953,9 @@ class PIFSCommandExecutor:
if data.get("mode") == "files":
return "\n\n".join(self._render_stat(item) for item in data.get("data", []))
lines = [
f"target: {data.get('target') or data.get('file_ref')}",
f"target: {data.get('path') or data.get('target') or data.get('file_ref')}",
f"file_ref: {data.get('file_ref')}",
f"document_id: {data.get('external_id') or data.get('document_id') or '-'}",
f"source_path: {data.get('source_path') or '-'}",
f"storage_uri: {data.get('storage_uri') or '-'}",
]
folders = data.get("folders") or []
if folders:
@ -1019,11 +994,10 @@ class PIFSCommandExecutor:
file_ref = item.get("file_ref")
doc_id = item.get("external_id") or item.get("document_id") or "-"
title = self._compact_text(item.get("title") or item.get("name") or "", max_chars=80)
source_path = item.get("source_path") or "-"
folder_paths = item.get("folder_paths") or self._folder_paths_for_file(file_ref)
folders = f" folders={','.join(folder_paths)}" if folder_paths else ""
target = self._file_target_path(item)
return f"{target} id={doc_id} file_ref={file_ref or '-'} title={title} source={source_path}{folders}".strip()
return f"{target} id={doc_id} file_ref={file_ref or '-'} title={title}{folders}".strip()
def _grep_file_hit_text(self, item: dict[str, Any]) -> str:
doc_id = item.get("external_id") or "-"
@ -1046,7 +1020,7 @@ class PIFSCommandExecutor:
if folder_paths and title:
folder = str(folder_paths[0] or "/").rstrip("/")
return f"{folder}/{title}" if folder else f"/{title}"
return str(item.get("source_path") or item.get("external_id") or file_ref or "-")
return str(item.get("external_id") or file_ref or "-")
def _semantic_retrieval_query(self, query: str) -> str:
query = str(query or "").strip()
@ -1150,7 +1124,6 @@ class PIFSCommandExecutor:
"file_ref": result.file_ref,
"external_id": result.external_id,
"title": result.title,
"source_path": result.source_path,
"folder_paths": result.folder_paths,
"line": line,
"text": text or result.snippet,
@ -1160,76 +1133,6 @@ class PIFSCommandExecutor:
break
return hits
def _rank_child_folders_from_source(
self,
*,
query: str,
parent_path: str,
children: list[dict[str, Any]],
limit: int,
) -> list[dict[str, Any]]:
source_dir = self._source_dir_for_folder(parent_path)
source_root = self._source_root()
if source_dir is None or source_root is None:
return []
child_paths = {child["path"]: child for child in children}
counts: dict[str, int] = {}
for path in self._rg_candidate_files(query, source_dir, max_files=5000):
source_path = self._source_path_from_storage(path, source_root)
folder_path = "/" + str(Path(source_path).parent).strip("/")
child_path = self._matching_child_path(parent_path, folder_path, child_paths)
if child_path:
counts[child_path] = counts.get(child_path, 0) + 1
ranked = [
{
"path": path,
"name": child_paths[path]["name"],
"matched_files": matched,
"files": self.filesystem.store.count_files_in_folder(path, recursive=True),
"children_count": child_paths[path].get("children_count", 0),
}
for path, matched in counts.items()
]
ranked.sort(key=lambda item: (-item["matched_files"], item["path"]))
return ranked[:limit]
def _grep_source_file_hits(
self,
folder_path: str,
query: str,
*,
limit: int,
direct_only: bool = False,
) -> list[dict[str, Any]]:
source_dir = self._source_dir_for_folder(folder_path)
source_root = self._source_root()
if source_dir is None or source_root is None:
return []
hits = []
for path in self._rg_candidate_files(query, source_dir, max_files=max(limit * 10, 50)):
file_row = self._file_row_for_storage(path)
if not file_row:
continue
if direct_only and self._folder_path_for_source_path(file_row["source_path"]) != folder_path:
continue
line_number, text = self._first_matching_source_line(path, query)
if line_number is None:
continue
hits.append(
{
"file_ref": file_row["file_ref"],
"external_id": file_row["external_id"],
"title": file_row["title"],
"source_path": file_row["source_path"],
"folder_paths": self._folder_paths_for_file(file_row["file_ref"]),
"line": line_number,
"text": text or file_row["title"],
}
)
if len(hits) >= limit:
break
return hits
def _grep_file_matches(self, target: str, query: str, *, limit: int) -> list[dict[str, Any]]:
file_ref = self.filesystem._resolve_target(target)
entry = self.filesystem.store.get_file(file_ref)
@ -1241,7 +1144,6 @@ class PIFSCommandExecutor:
"file_ref": file_ref,
"external_id": entry.external_id,
"title": entry.title,
"source_path": entry.source_path,
"folder_paths": self._folder_paths_for_file(file_ref),
"line": line_number,
"text": self._compact_text(line, max_chars=220),
@ -1269,136 +1171,6 @@ class PIFSCommandExecutor:
def _is_combined_grep_flag(arg: str) -> bool:
return bool(re.fullmatch(r"-[Rrni]+", arg)) and len(arg) > 2
def _rg_candidate_files(self, query: str, directory: Path, *, max_files: int) -> list[Path]:
if not directory.exists():
return []
terms = [term.lower() for term in re.findall(r"[A-Za-z0-9_]{3,}", query)]
if not terms:
return []
primary = max(terms, key=len)
try:
completed = subprocess.run(
[
"rg",
"-l",
"-i",
"-F",
primary,
str(directory),
"--glob",
"*.json",
"--no-messages",
],
check=False,
capture_output=True,
text=True,
timeout=20,
)
except (OSError, subprocess.TimeoutExpired):
return []
candidates = [Path(line) for line in completed.stdout.splitlines() if line.strip()]
filtered = []
for path in candidates[: max(max_files * 20, max_files)]:
try:
text = path.read_text(encoding="utf-8", errors="ignore").lower()
except OSError:
continue
if all(term in text for term in terms):
filtered.append(path)
if len(filtered) >= max_files:
break
return filtered
def _first_matching_source_line(self, path: Path, query: str) -> tuple[int | None, str]:
try:
lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
except OSError:
return None, ""
for line_number, line in enumerate(lines, 1):
if self._line_matches(line, query):
return line_number, self._compact_text(line, max_chars=220)
return None, ""
def _source_root(self) -> Path | None:
with self.filesystem.store.connect() as conn:
row = conn.execute(
"""
SELECT storage_uri, source_path
FROM files
WHERE deleted_at IS NULL
LIMIT 1
"""
).fetchone()
if row is None:
return None
storage_path = Path(row["storage_uri"])
source_path = Path(row["source_path"])
root = storage_path
for _ in range(len(source_path.parts)):
root = root.parent
return root
def _source_dir_for_folder(self, folder_path: str) -> Path | None:
source_root = self._source_root()
if source_root is None:
return None
stripped = folder_path.strip("/")
return source_root / stripped if stripped else source_root
@staticmethod
def _source_path_from_storage(path: Path, source_root: Path) -> str:
try:
return path.relative_to(source_root).as_posix()
except ValueError:
return path.name
@staticmethod
def _matching_child_path(
parent_path: str,
folder_path: str,
child_paths: dict[str, dict[str, Any]],
) -> str | None:
normalized_parent = parent_path.rstrip("/")
if normalized_parent == "":
normalized_parent = "/"
if normalized_parent == "/":
parts = [part for part in folder_path.strip("/").split("/") if part]
candidate = "/" + parts[0] if parts else "/"
return candidate if candidate in child_paths else None
prefix = normalized_parent + "/"
if not folder_path.startswith(prefix):
return None
remainder = folder_path[len(prefix):]
first = remainder.split("/", 1)[0]
candidate = prefix + first
return candidate if candidate in child_paths else None
def _file_row_for_storage(self, path: Path) -> dict[str, Any] | None:
storage_uri = str(path)
with self.filesystem.store.connect() as conn:
row = conn.execute(
"""
SELECT file_ref, external_id, title, source_path
FROM files
WHERE storage_uri = ? AND deleted_at IS NULL
LIMIT 1
""",
(storage_uri,),
).fetchone()
if row is None:
return None
return {
"file_ref": row["file_ref"],
"external_id": row["external_id"],
"title": row["title"],
"source_path": row["source_path"],
}
@staticmethod
def _folder_path_for_source_path(source_path: str) -> str:
parent = str(Path(source_path).parent).strip(".")
return "/" + parent.strip("/") if parent and parent != "." else "/"
def _folder_paths_for_file(self, file_ref: str | None) -> list[str]:
if not file_ref:
return []

View file

@ -144,13 +144,12 @@ class PageIndexFileSystem:
self,
*,
storage_uri: str,
source_path: str,
folder_path: Optional[str] = None,
metadata: Optional[dict[str, Any]] = None,
external_id: Optional[str] = None,
title: Optional[str] = None,
content: str = "",
content_type: str = "text/plain",
content_type: str | None = None,
source_type: Optional[str] = None,
metadata_policy: Optional[dict[str, Any]] = None,
metadata_status: Optional[str] = None,
@ -159,7 +158,6 @@ class PageIndexFileSystem:
[
{
"storage_uri": storage_uri,
"source_path": source_path,
"folder_path": folder_path,
"metadata": metadata,
"external_id": external_id,
@ -231,7 +229,6 @@ class PageIndexFileSystem:
record = self._prepare_file_record(
{
"storage_uri": final_path.as_uri(),
"source_path": virtual_path.strip("/"),
"folder_path": folder_path,
"metadata": {},
"external_id": None,
@ -604,23 +601,27 @@ class PageIndexFileSystem:
folder["path"]
for folder in self.store.folder_memberships(file_ref)
]
folder_path = self._preferred_folder_path(
folder_paths,
path,
entry.folder_path,
)
rank = len(rows) + 1
rows.append(
{
"rank": rank,
"similarity": self._semantic_candidate_similarity(candidate),
"score": self._semantic_candidate_score(candidate),
"path": self._stable_file_locator(file_ref, entry),
"path": self._stable_file_locator(
file_ref,
entry,
folder_path=folder_path,
),
"file_ref": file_ref,
"document_id": entry.external_id,
"external_id": entry.external_id,
"title": entry.title,
"source_path": entry.source_path,
"folder_path": self._preferred_folder_path(
folder_paths,
path,
entry.folder_path,
),
"folder_path": folder_path,
"folder_paths": folder_paths,
"summary": str((entry.metadata or {}).get("summary") or ""),
"snippet": str(getattr(candidate, "snippet", "") or entry.descriptor),
@ -724,7 +725,6 @@ class PageIndexFileSystem:
folder_paths=folder_paths,
metadata=row["metadata"],
metadata_status=row["metadata_status"],
source_path=row["source_path"],
id=row["id"],
document_id=row["document_id"],
name=row["name"],
@ -845,7 +845,6 @@ class PageIndexFileSystem:
"mode": "structure",
"file_ref": file_ref,
"external_id": entry.external_id,
"source_path": entry.source_path,
"status": entry.pageindex_tree_status,
"available": True,
"pageindex_doc_id": doc_id,
@ -887,7 +886,6 @@ class PageIndexFileSystem:
"mode": "page",
"file_ref": file_ref,
"external_id": entry.external_id,
"source_path": entry.source_path,
"status": entry.pageindex_tree_status,
"available": True,
"pageindex_doc_id": doc_id,
@ -905,7 +903,7 @@ class PageIndexFileSystem:
return
raise ValueError(
f"{command} is only supported for txt/text files; "
f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
f"got title={entry.title!r}, content_type={entry.content_type!r}. "
"Use cat <path|file_ref|document_id> --structure, "
"or cat <path|file_ref|document_id> --page for PDF/Markdown PageIndex files."
)
@ -915,29 +913,22 @@ class PageIndexFileSystem:
return
raise ValueError(
f"{command} is only supported for PDF/Markdown PageIndex files; "
f"got source_path={entry.source_path!r}, content_type={entry.content_type!r}. "
f"got title={entry.title!r}, content_type={entry.content_type!r}. "
"Use cat <path|file_ref|document_id> --all for txt/text files."
)
@classmethod
def _file_format(cls, entry: Any) -> str:
suffix = Path(str(entry.source_path or "")).suffix.lower()
content_type = cls._normalized_content_type(entry.content_type)
if suffix == ".pdf" or content_type == "application/pdf":
return "pdf"
if suffix in PAGEINDEX_DOCUMENT_SUFFIXES or content_type in PAGEINDEX_DOCUMENT_CONTENT_TYPES:
return "markdown"
if suffix in TEXT_ARTIFACT_SUFFIXES:
return "text"
if entry.pageindex_doc_id or entry.pageindex_tree_status != "not_built":
if getattr(entry, "pageindex_doc_id", None) or entry.pageindex_tree_status != "not_built":
return "pageindex"
if content_type in TEXT_ARTIFACT_CONTENT_TYPES:
return "text"
file_format = cls._content_format(getattr(entry, "title", ""), entry.content_type)
if file_format != "unsupported":
return file_format
return "unsupported"
@classmethod
def _source_format(cls, source_path: Any, content_type: str | None) -> str:
suffix = Path(str(source_path or "")).suffix.lower()
def _content_format(cls, filename: Any, content_type: str | None) -> str:
suffix = Path(str(filename or "")).suffix.lower()
normalized_content_type = cls._normalized_content_type(content_type)
if suffix == ".pdf" or normalized_content_type == "application/pdf":
return "pdf"
@ -977,27 +968,27 @@ class PageIndexFileSystem:
self,
*,
storage_uri: str,
source_path: str,
title: str,
content_type: str,
) -> tuple[str | None, str, dict[str, Any] | None]:
if self._source_format(source_path, content_type) not in {"pdf", "markdown"}:
if self._content_format(title, content_type) not in {"pdf", "markdown"}:
return None, "not_built", None
client = self._pageindex_client()
source = self._canonical_source_path(storage_uri=storage_uri, source_path=source_path)
cached_doc_id = self._find_cached_pageindex_doc_id(client, source)
local_path = self._canonical_storage_uri_path(storage_uri)
cached_doc_id = self._find_cached_pageindex_doc_id(client, local_path)
if cached_doc_id:
return cached_doc_id, "built", None
if source is None:
if local_path is None:
return None, "failed", self._pageindex_tree_failure_record(
source="PageIndexFileSystem.registration",
error_type="UnresolvableSourcePath",
error_type="UnresolvableStorageUri",
message=(
"PageIndex source path must resolve to a local file path for "
"storage_uri must resolve to a local file path for "
"PDF/Markdown registration."
),
)
try:
doc_id = client.index(source)
doc_id = client.index(local_path)
return doc_id, "built", None
except Exception as exc:
return None, "failed", self._pageindex_tree_failure_record(
@ -1024,25 +1015,41 @@ class PageIndexFileSystem:
def _find_cached_pageindex_doc_id(
self,
client: PageIndexClient,
source_path: str | None,
local_path: str | None,
) -> str | None:
if source_path is None:
if local_path is None:
return None
for doc_id, doc in client.documents.items():
if self._canonical_path(doc.get("path")) == source_path:
if self._canonical_path(doc.get("path")) == local_path:
return doc_id
return None
def _canonical_source_path(self, *, storage_uri: str, source_path: str) -> str | None:
def _canonical_storage_uri_path(self, storage_uri: str) -> str | None:
parsed = urlparse(storage_uri)
if parsed.scheme == "file":
return self._canonical_path(unquote(parsed.path))
if storage_uri and not parsed.scheme:
return self._canonical_path(storage_uri)
if Path(source_path).expanduser().is_absolute():
return self._canonical_path(source_path)
return None
@staticmethod
def _title_from_storage_uri(storage_uri: str) -> str:
parsed = urlparse(str(storage_uri or ""))
path = unquote(parsed.path) if parsed.scheme else str(storage_uri or "")
return Path(path).name
@classmethod
def _infer_content_type(cls, *, title: str, storage_uri: str) -> str:
    """Guess a MIME type from the file suffix of *title* or *storage_uri*.

    The title is examined first, then the file name derived from the storage
    URI. The first candidate whose lower-cased suffix is recognised decides
    the result: ``.pdf`` maps to ``application/pdf``, a suffix in
    ``PAGEINDEX_DOCUMENT_SUFFIXES`` to ``text/markdown``, and a suffix in
    ``TEXT_ARTIFACT_SUFFIXES`` to ``text/plain``. If neither candidate has a
    recognised suffix the result is ``text/plain``.
    """
    # Both candidates are computed up front, title taking precedence.
    candidates = [title, cls._title_from_storage_uri(storage_uri)]
    for candidate in candidates:
        extension = Path(str(candidate or "")).suffix.lower()
        if extension == ".pdf":
            return "application/pdf"
        if extension in PAGEINDEX_DOCUMENT_SUFFIXES:
            return "text/markdown"
        if extension in TEXT_ARTIFACT_SUFFIXES:
            return "text/plain"
    return "text/plain"
@staticmethod
def _canonical_path(path: Any) -> str | None:
if not path:
@ -1124,12 +1131,12 @@ class PageIndexFileSystem:
}
def _add_file_content(self, path: Path, content_type: str) -> str:
if self._source_format(str(path), content_type) in {"markdown", "text"}:
if self._content_format(path.name, content_type) in {"markdown", "text"}:
return path.read_text(encoding="utf-8")
return ""
def _require_add_pageindex_ready(self, record: dict[str, Any]) -> None:
if self._source_format(record["source_path"], record["content_type"]) not in {
if self._content_format(record["title"], record["content_type"]) not in {
"pdf",
"markdown",
}:
@ -1178,33 +1185,47 @@ class PageIndexFileSystem:
def _prepare_file_record(self, file: dict[str, Any]) -> dict[str, Any]:
storage_uri = file["storage_uri"]
raw_source_path = str(file["source_path"])
source_path = raw_source_path.strip("/")
metadata = file.get("metadata") or {}
if not isinstance(metadata, dict):
raise ValueError("metadata must be a JSON object")
self._validate_register_metadata(metadata)
external_id = file.get("external_id")
content = file.get("content") or ""
content_type = file.get("content_type") or "text/plain"
folder_path = normalize_path(file.get("folder_path") or "/")
title = str(
file.get("title")
or metadata.get("title")
or self._title_from_storage_uri(storage_uri)
or external_id
or ""
).strip()
if not title:
raise ValueError("file title is required")
content_type = file.get("content_type") or self._infer_content_type(
title=title,
storage_uri=storage_uri,
)
file_ref = make_file_ref(
str(external_id or self._join_virtual_file_path(folder_path, title).strip("/"))
)
(
pageindex_doc_id,
pageindex_tree_status,
pageindex_tree_failure,
) = self._registration_pageindex_pointer(
storage_uri=storage_uri,
source_path=raw_source_path,
title=title,
content_type=content_type,
)
artifact_content = self._registration_text_artifact_content(
source_path=raw_source_path,
title=title,
content_type=content_type,
pageindex_doc_id=pageindex_doc_id,
pageindex_tree_status=pageindex_tree_status,
fallback_content=content,
)
fts_content = file.get("fts_content", artifact_content)
source_type = file.get("source_type") or self._infer_source_type(source_path)
source_type = file.get("source_type")
metadata_policy = self._normalize_metadata_policy(
file.get("metadata_policy"),
metadata=metadata,
@ -1217,9 +1238,6 @@ class PageIndexFileSystem:
self._attach_pageindex_tree_failure(metadata_status, pageindex_tree_failure)
indexed_metadata = SQLiteFileSystemStore.indexed_metadata_values(metadata)
searchable_metadata = dict(metadata)
folder_path = normalize_path(file.get("folder_path") or "/")
title = file.get("title") or metadata.get("title") or Path(source_path).stem
file_ref = make_file_ref(external_id or source_path)
text_artifact_path = file.get("text_artifact_path")
owns_text_artifact = text_artifact_path is None
if text_artifact_path is None:
@ -1234,7 +1252,6 @@ class PageIndexFileSystem:
"file_ref": file_ref,
"external_id": external_id,
"storage_uri": storage_uri,
"source_path": source_path,
"title": title,
"descriptor": descriptor,
"content_type": content_type,
@ -1260,13 +1277,13 @@ class PageIndexFileSystem:
def _registration_text_artifact_content(
self,
*,
source_path: str,
title: str,
content_type: str,
pageindex_doc_id: str | None,
pageindex_tree_status: str,
fallback_content: str,
) -> str:
if self._source_format(source_path, content_type) not in {"pdf", "markdown"}:
if self._content_format(title, content_type) not in {"pdf", "markdown"}:
return fallback_content
if pageindex_tree_status != "built" or not pageindex_doc_id:
return fallback_content
@ -1296,15 +1313,11 @@ class PageIndexFileSystem:
@staticmethod
def _raw_artifact_payload(
*,
storage_uri: str,
source_path: str,
folder_path: str,
metadata: dict[str, Any],
metadata_status: dict[str, Any],
) -> dict[str, Any]:
return {
"storage_uri": storage_uri,
"source_path": source_path,
"folder_path": folder_path,
"metadata": metadata,
"metadata_status": metadata_status,
@ -1323,8 +1336,6 @@ class PageIndexFileSystem:
self.store.write_raw_artifact(
record["file_ref"],
self._raw_artifact_payload(
storage_uri=record["storage_uri"],
source_path=record["source_path"],
folder_path=record["folder_path"],
metadata=record["metadata"],
metadata_status=record["metadata_status"],
@ -1351,7 +1362,6 @@ class PageIndexFileSystem:
"file_ref": entry.file_ref,
"external_id": entry.external_id,
"storage_uri": entry.storage_uri,
"source_path": entry.source_path,
"title": entry.title,
"descriptor": entry.descriptor,
"content_type": entry.content_type,
@ -1394,7 +1404,6 @@ class PageIndexFileSystem:
file_ref=record["file_ref"],
external_id=record.get("external_id"),
title=record["title"],
source_path=record["source_path"],
content_type=record["content_type"],
source_type=record.get("source_type"),
text=Path(record["text_artifact_path"]).read_text(encoding="utf-8"),
@ -1638,7 +1647,6 @@ class PageIndexFileSystem:
text=text,
external_id=entry.external_id,
folder_path=entry.folder_path,
source_path=entry.source_path,
)
def _open_all(self, file_ref: str) -> OpenResult:
@ -1652,7 +1660,6 @@ class PageIndexFileSystem:
text=text,
external_id=entry.external_id,
folder_path=entry.folder_path,
source_path=entry.source_path,
)
@classmethod
@ -1671,7 +1678,6 @@ class PageIndexFileSystem:
"mode": mode,
"file_ref": entry.file_ref,
"external_id": entry.external_id,
"source_path": entry.source_path,
"status": entry.pageindex_tree_status,
"available": False,
"message": message,
@ -1744,19 +1750,30 @@ class PageIndexFileSystem:
separators=(",", ":"),
)
def _stable_file_locator(self, file_ref: str, entry: Any) -> str:
source_path = str(getattr(entry, "source_path", "") or "").strip()
if source_path:
target = "/" + source_path.strip("/")
try:
if self.store.resolve_file_ref(target) == file_ref:
return target
except KeyError:
pass
external_id = str(getattr(entry, "external_id", "") or "").strip()
if external_id:
return external_id
return file_ref
def _stable_file_locator(
    self,
    file_ref: str,
    entry: Any,
    *,
    folder_path: str | None = None,
) -> str:
    """Build the virtual path for *entry* and verify it resolves to *file_ref*.

    The folder is taken from *folder_path*, falling back to
    ``entry.folder_path`` and then ``"/"``; it is normalised and joined with
    the entry's stripped title.

    Returns:
        The virtual path, once the store has resolved it back to *file_ref*.

    Raises:
        RuntimeError: If the entry has no title, if the store raises
            ``KeyError`` while resolving the path, or if the path resolves
            to a different file reference.
    """
    chosen_folder = folder_path or getattr(entry, "folder_path", None) or "/"
    folder_path = normalize_path(chosen_folder)
    raw_title = getattr(entry, "title", "") or ""
    title = str(raw_title).strip()
    if not title:
        raise RuntimeError(f"browse cannot build a virtual path for {file_ref}: missing title")
    target = self._join_virtual_file_path(folder_path, title.strip("/"))
    try:
        resolved_file_ref = self.store.resolve_file_ref(target)
    except KeyError as exc:
        raise RuntimeError(
            f"browse produced an unresolved virtual path for {file_ref}: {target}"
        ) from exc
    else:
        # The path must round-trip to the same file to be a usable locator.
        if resolved_file_ref == file_ref:
            return target
    raise RuntimeError(
        "browse produced a non-idempotent virtual path: "
        f"{target} resolved to {resolved_file_ref}, expected {file_ref}"
    )
@staticmethod
def _build_descriptor(title: str, metadata: dict[str, Any]) -> str:
@ -2011,11 +2028,6 @@ class PageIndexFileSystem:
return "number"
return "string"
@staticmethod
def _infer_source_type(source_path: str) -> Optional[str]:
parts = [part for part in Path(source_path).parts if part not in ("", ".")]
return parts[0] if parts else None
@staticmethod
def _scope_folder_path(scope: Optional[dict[str, Any]]) -> Optional[str]:
if not scope:

View file

@ -18,7 +18,6 @@ class MetadataGenerationInput:
file_ref: str
external_id: str | None
title: str
source_path: str
content_type: str
source_type: str | None
text: str

View file

@ -21,7 +21,6 @@ class SemanticIndexRecord:
text: str
external_id: str | None = None
source_type: str = ""
source_path: str = ""
title: str = ""
metadata: dict[str, Any] | None = None
@ -32,7 +31,6 @@ class SemanticSearchResult:
distance: float
external_id: str | None
source_type: str
source_path: str
title: str
text_hash: str
metadata: dict[str, Any]
@ -88,7 +86,6 @@ class SQLiteVecSemanticIndex:
file_ref TEXT NOT NULL UNIQUE,
external_id TEXT,
source_type TEXT NOT NULL DEFAULT '',
source_path TEXT NOT NULL DEFAULT '',
title TEXT NOT NULL DEFAULT '',
text_hash TEXT NOT NULL,
text_chars INTEGER NOT NULL DEFAULT 0,
@ -215,7 +212,6 @@ class SQLiteVecSemanticIndex:
d.file_ref,
d.external_id,
d.source_type,
d.source_path,
d.title,
d.text_hash,
d.metadata_json,
@ -245,7 +241,6 @@ class SQLiteVecSemanticIndex:
d.file_ref,
d.external_id,
d.source_type,
d.source_path,
d.title,
d.text_hash,
d.metadata_json,
@ -269,7 +264,6 @@ class SQLiteVecSemanticIndex:
distance=float(row["distance"]),
external_id=row["external_id"],
source_type=row["source_type"],
source_path=row["source_path"],
title=row["title"],
text_hash=row["text_hash"],
metadata=metadata,
@ -361,15 +355,14 @@ class SQLiteVecSemanticIndex:
cursor = conn.execute(
"""
INSERT INTO semantic_index_docs(
file_ref, external_id, source_type, source_path, title,
file_ref, external_id, source_type, title,
text_hash, text_chars, metadata_json
) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
) VALUES (?, ?, ?, ?, ?, ?, ?)
""",
(
record.file_ref,
record.external_id,
record.source_type,
record.source_path,
record.title,
text_hash,
len(record.text),
@ -381,10 +374,9 @@ class SQLiteVecSemanticIndex:
conn.execute(
"""
UPDATE semantic_index_docs
SET external_id = ?,
source_type = ?,
source_path = ?,
title = ?,
SET external_id = ?,
source_type = ?,
title = ?,
text_hash = ?,
text_chars = ?,
metadata_json = ?,
@ -394,7 +386,6 @@ class SQLiteVecSemanticIndex:
(
record.external_id,
record.source_type,
record.source_path,
record.title,
text_hash,
len(record.text),

View file

@ -39,7 +39,6 @@ class SemanticProjectionCandidate:
score: float
sources: list[dict[str, Any]]
source_type: str
source_path: str
title: str
metadata: dict[str, Any]
snippet: str
@ -261,7 +260,6 @@ class SummaryProjectionIndexer:
text=summary,
external_id=record.get("external_id"),
source_type=str(record.get("source_type") or ""),
source_path=str(record.get("source_path") or ""),
title=str(record.get("title") or ""),
metadata=metadata,
)
@ -493,7 +491,6 @@ def rank_single_semantic_channel(
score=1 / (60 + rank),
sources=[{"channel": channel, "rank": rank, "distance": result.distance}],
source_type=result.source_type,
source_path=result.source_path,
title=result.title,
metadata=result.metadata,
snippet=f"{channel}_vector rank={rank}",

View file

@ -43,7 +43,6 @@ class SQLiteFileSystemStore:
file_ref TEXT PRIMARY KEY,
external_id TEXT,
storage_uri TEXT NOT NULL,
source_path TEXT NOT NULL,
title TEXT NOT NULL,
descriptor TEXT NOT NULL,
content_type TEXT NOT NULL,
@ -124,7 +123,6 @@ class SQLiteFileSystemStore:
USING fts5(file_ref UNINDEXED, title, body, metadata_text);
CREATE INDEX IF NOT EXISTS idx_files_external_id ON files(external_id);
CREATE INDEX IF NOT EXISTS idx_files_source_path ON files(source_path);
CREATE INDEX IF NOT EXISTS idx_files_source_type ON files(source_type);
CREATE INDEX IF NOT EXISTS idx_folders_path ON folders(path);
CREATE INDEX IF NOT EXISTS idx_folders_parent_id ON folders(parent_id);
@ -168,6 +166,7 @@ class SQLiteFileSystemStore:
fts_file_ref_rows = []
fts_rows = []
metadata_rows = []
pending_folder_titles: dict[tuple[str, str], str] = {}
metadata_field_ids = {
row["name"]: row["field_id"]
for row in conn.execute(
@ -184,6 +183,18 @@ class SQLiteFileSystemStore:
kind=record.get("folder_kind", "manual"),
)
folder_cache[folder_cache_key] = folder_id
self._ensure_title_available_in_folder(
conn,
folder_id=folder_id,
file_ref=record["file_ref"],
title=record["title"],
)
title_key = (folder_id, str(record["title"]))
existing_file_ref = pending_folder_titles.get(title_key)
if existing_file_ref and existing_file_ref != record["file_ref"]:
target = self._virtual_file_target(conn, folder_id, str(record["title"]))
raise FileExistsError(f"File already exists at {target}")
pending_folder_titles[title_key] = record["file_ref"]
file_rows.append(self._file_insert_values(record))
membership_rows.append(
(
@ -244,7 +255,6 @@ class SQLiteFileSystemStore:
"file_ref",
"external_id",
"storage_uri",
"source_path",
"title",
"descriptor",
"content_type",
@ -270,7 +280,6 @@ class SQLiteFileSystemStore:
record["file_ref"],
record["external_id"],
record["storage_uri"],
record["source_path"],
record["title"],
record["descriptor"],
record["content_type"],
@ -338,6 +347,12 @@ class SQLiteFileSystemStore:
with self.connect() as conn:
resolved_file_ref = self._resolve_file_ref(conn, file_ref)
folder_id = self._resolve_or_create_folder(conn, folder_path_or_id)
self._ensure_title_available_in_folder(
conn,
folder_id=folder_id,
file_ref=resolved_file_ref,
title=self._file_title(conn, resolved_file_ref),
)
conn.execute(
"""
INSERT INTO file_folders(file_ref, folder_id, metadata_json)
@ -357,6 +372,12 @@ class SQLiteFileSystemStore:
for item in items:
resolved_file_ref = self._resolve_file_ref(conn, item["file_ref"])
folder_id = self._resolve_or_create_folder(conn, item["folder"])
self._ensure_title_available_in_folder(
conn,
folder_id=folder_id,
file_ref=resolved_file_ref,
title=self._file_title(conn, resolved_file_ref),
)
conn.execute(
"""
INSERT INTO file_folders(file_ref, folder_id, metadata_json)
@ -371,6 +392,56 @@ class SQLiteFileSystemStore:
),
)
def _ensure_title_available_in_folder(
self,
conn: sqlite3.Connection,
*,
folder_id: str,
file_ref: str,
title: str,
) -> None:
row = conn.execute(
"""
SELECT f.file_ref, fo.path
FROM files f
JOIN file_folders ff ON ff.file_ref = f.file_ref
JOIN folders fo ON fo.folder_id = ff.folder_id
WHERE f.deleted_at IS NULL
AND ff.folder_id = ?
AND f.title = ?
AND f.file_ref != ?
LIMIT 1
""",
(folder_id, title, file_ref),
).fetchone()
if row:
raise FileExistsError(
f"File already exists at {self._virtual_file_target(conn, folder_id, title)}"
)
@staticmethod
def _virtual_file_target(
    conn: sqlite3.Connection,
    folder_id: str,
    title: str,
) -> str:
    """Return the virtual path a file titled ``title`` has inside a folder.

    Args:
        conn: Open SQLite connection; the fetched row is read by column
            name, so rows must support ``row["path"]``.
        folder_id: Identifier of the folder whose path is looked up.
        title: File title appended to the folder path.

    Returns:
        ``/<title>`` when the folder path normalizes to the root, otherwise
        ``<folder path>/<title>``. A ``folder_id`` with no row in
        ``folders`` is treated as the root folder.
    """
    folder_row = conn.execute(
        "SELECT path FROM folders WHERE folder_id = ?",
        (folder_id,),
    ).fetchone()
    # Path normalization and joining live in ``_virtual_file_path``; reuse
    # it so error targets and listed paths cannot drift apart.
    return SQLiteFileSystemStore._virtual_file_path(
        folder_row["path"] if folder_row else "/",
        title,
    )
@staticmethod
def _file_title(conn: sqlite3.Connection, file_ref: str) -> str:
row = conn.execute(
"SELECT title FROM files WHERE file_ref = ? AND deleted_at IS NULL",
(file_ref,),
).fetchone()
if row is None:
raise KeyError(f"Unknown file target: {file_ref}")
return str(row["title"])
def replace_metadata_values(
self,
conn: sqlite3.Connection,
@ -791,7 +862,6 @@ class SQLiteFileSystemStore:
selects = [
"f.file_ref",
"f.external_id",
"f.source_path",
"f.title",
"f.descriptor",
"f.pageindex_tree_status",
@ -984,7 +1054,6 @@ class SQLiteFileSystemStore:
f.file_ref,
f.external_id,
f.storage_uri,
f.source_path,
f.title,
f.descriptor,
f.content_type,
@ -1125,30 +1194,6 @@ class SQLiteFileSystemStore:
).fetchone()
if row:
return row["file_ref"]
stripped = target.strip("/")
rows = conn.execute(
"""
SELECT
f.file_ref,
f.external_id,
f.title,
f.source_path,
COALESCE(MIN(fo.path), '/') AS folder_path
FROM files f
LEFT JOIN file_folders ff ON ff.file_ref = f.file_ref
LEFT JOIN folders fo ON fo.folder_id = ff.folder_id
WHERE f.source_path = ? AND f.deleted_at IS NULL
GROUP BY f.file_ref, f.external_id, f.title, f.source_path
ORDER BY f.file_ref
LIMIT 2
""",
(stripped,),
).fetchall()
if len(rows) > 1:
matches = "; ".join(self._virtual_match_summary(row) for row in rows)
raise KeyError(f"Ambiguous file target: {target}. Matches: {matches}")
if rows:
return rows[0]["file_ref"]
virtual_file_ref = self._resolve_virtual_file_ref(conn, target)
if virtual_file_ref:
return virtual_file_ref
@ -1163,12 +1208,9 @@ class SQLiteFileSystemStore:
f.file_ref,
f.external_id,
f.title,
f.source_path,
pf.path AS folder_path,
(CASE WHEN pf.path = '/' THEN '/' ELSE pf.path || '/' END)
|| ltrim(f.title, '/') AS title_virtual_path,
(CASE WHEN pf.path = '/' THEN '/' ELSE pf.path || '/' END)
|| ltrim(f.source_path, '/') AS source_virtual_path
|| ltrim(f.title, '/') AS title_virtual_path
FROM files f
JOIN file_folders ff ON ff.file_ref = f.file_ref
JOIN folders pf ON pf.folder_id = ff.folder_id
@ -1178,16 +1220,14 @@ class SQLiteFileSystemStore:
file_ref,
external_id,
title,
source_path,
MIN(folder_path) AS folder_path
FROM virtual_matches
WHERE title_virtual_path = ?
OR source_virtual_path = ?
GROUP BY file_ref, external_id, title, source_path
GROUP BY file_ref, external_id, title
ORDER BY file_ref
LIMIT 2
""",
(virtual_target, virtual_target),
(virtual_target,),
).fetchall()
if not rows:
return None
@ -1201,8 +1241,7 @@ class SQLiteFileSystemStore:
external_id = row["external_id"] or "-"
return (
f"file_ref={row['file_ref']} external_id={external_id} "
f"folder={row['folder_path']} title={row['title']!r} "
f"source_path={row['source_path']!r}"
f"folder={row['folder_path']} title={row['title']!r}"
)
def ensure_folder(
@ -1475,18 +1514,12 @@ class SQLiteFileSystemStore:
JOIN folders fo ON fo.folder_id = ff.folder_id
WHERE f.deleted_at IS NULL
AND fo.path = ?
AND (
f.title = ?
OR f.source_path = ?
OR f.source_path LIKE ? ESCAPE '\\'
)
AND f.title = ?
LIMIT 1
""",
(
path,
basename,
basename,
"%/" + self._like_escape(basename),
),
).fetchone()
return row is not None
@ -1548,7 +1581,6 @@ class SQLiteFileSystemStore:
f.file_ref,
f.external_id,
f.storage_uri,
f.source_path,
f.title,
f.descriptor,
f.content_type,
@ -1592,7 +1624,6 @@ class SQLiteFileSystemStore:
f.external_id,
f.title,
f.descriptor,
f.source_path,
f.pageindex_tree_status,
f.metadata_json,
f.metadata_status_json,
@ -1804,7 +1835,6 @@ class SQLiteFileSystemStore:
"pageNum": None,
"createdAt": cls._row_value(row, "created_at"),
"folderId": cls._row_value(row, "folder_id"),
"source_path": row["source_path"],
"folder_path": row["folder_path"],
"metadata": json.loads(row["metadata_json"] or "{}"),
"metadata_status": json.loads(
@ -1827,7 +1857,6 @@ class SQLiteFileSystemStore:
"pageNum": None,
"createdAt": cls._row_value(row, "created_at"),
"folderId": cls._row_value(row, "folder_id"),
"source_path": row["source_path"],
"snippet": row["snippet"] or row["title"],
"folder_path": row["folder_path"],
"metadata": json.loads(row["metadata_json"] or "{}"),
@ -1846,7 +1875,6 @@ class SQLiteFileSystemStore:
file_ref=row["file_ref"],
external_id=row["external_id"],
storage_uri=row["storage_uri"],
source_path=row["source_path"],
title=row["title"],
descriptor=row["descriptor"],
content_type=row["content_type"],
@ -1871,8 +1899,7 @@ class SQLiteFileSystemStore:
"document_id": entry.external_id,
"external_id": entry.external_id,
"name": entry.title,
"storage_uri": entry.storage_uri,
"source_path": entry.source_path,
"path": cls._virtual_file_path(entry.folder_path, entry.title),
"title": entry.title,
"description": entry.descriptor,
"status": entry.pageindex_tree_status,
@ -1881,8 +1908,6 @@ class SQLiteFileSystemStore:
"content_type": entry.content_type,
"source_type": entry.source_type,
"fingerprint": entry.fingerprint,
"text_artifact_path": entry.text_artifact_path,
"raw_artifact_path": entry.raw_artifact_path,
"pageindex_doc_id": entry.pageindex_doc_id,
"pageindex_tree_status": entry.pageindex_tree_status,
"metadata": entry.metadata,
@ -1890,6 +1915,11 @@ class SQLiteFileSystemStore:
"folder_path": entry.folder_path,
}
@staticmethod
def _virtual_file_path(folder_path: str, title: str) -> str:
    """Join a folder path and a file title into a virtual file path.

    The folder path is passed through ``normalize_path`` first. When the
    result is the root (``/``) the title is placed directly under it, so
    the returned path never starts with a double slash.
    """
    parent = normalize_path(folder_path)
    if parent == "/":
        return f"/{title}"
    return f"{parent}/{title}"
@staticmethod
def _query_text(query: str | list[str] | None) -> str:
if query is None:

View file

@ -13,7 +13,6 @@ class SearchResult:
folder_path: str
folder_paths: list[str]
metadata: dict[str, Any]
source_path: str = ""
id: Optional[str] = None
document_id: Optional[str] = None
name: str = ""
@ -33,7 +32,6 @@ class OpenResult:
text: str
external_id: Optional[str] = None
folder_path: str = ""
source_path: str = ""
@dataclass(frozen=True)
@ -50,7 +48,6 @@ class FileEntry:
file_ref: str
external_id: Optional[str]
storage_uri: str
source_path: str
title: str
descriptor: str
content_type: str

View file

@ -21,7 +21,6 @@ def test_insert_files_does_not_disable_sqlite_synchronous(tmp_path):
"file_ref": "ref_report",
"external_id": "doc_report",
"storage_uri": "file:///tmp/report.pdf",
"source_path": "documents/report.pdf",
"folder_path": "/documents",
"title": "Report",
"descriptor": "documents/report.pdf",

View file

@ -20,7 +20,6 @@ def test_metadata_generator_uses_provider_parameter():
file_ref="file_a",
external_id="doc_a",
title="A",
source_path="docs/a.txt",
content_type="text/plain",
source_type=None,
text="hello",

View file

@ -135,7 +135,6 @@ def _register_browse_file(
filesystem.metadata_generator = SummaryGenerator()
return filesystem.register_file(
storage_uri=f"file:///tmp/{external_id}.txt",
source_path=f"documents/{external_id}.txt",
folder_path=folder_path,
external_id=external_id,
title=f"{external_id}.txt",
@ -427,7 +426,7 @@ def test_browse_shell_output_uses_fixed_blocks_with_pagination_command(tmp_path)
assert "score:" not in rendered
def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp_path):
def test_browse_shell_path_uses_virtual_locator_when_source_collides(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
@ -443,7 +442,6 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp
)
first_ref = filesystem.register_file(
storage_uri="file:///tmp/first.json",
source_path="shared/source.json",
folder_path="/documents",
external_id="dsid_first",
title="First",
@ -459,7 +457,6 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp
)
filesystem.register_file(
storage_uri="file:///tmp/second.json",
source_path="shared/source.json",
folder_path="/documents",
external_id="dsid_second",
title="Second",
@ -478,13 +475,52 @@ def test_browse_shell_path_falls_back_to_unique_locator_when_source_collides(tmp
rendered = executor.execute('browse /documents "first"')
assert "path: dsid_first" in rendered
assert "path: /documents/First" in rendered
assert "path: /shared/source.json" not in rendered
assert filesystem.store.resolve_file_ref("dsid_first") == first_ref
with pytest.raises(KeyError, match="Ambiguous file target"):
assert filesystem.store.resolve_file_ref("/documents/First") == first_ref
with pytest.raises(KeyError, match="Unknown file target"):
filesystem.store.resolve_file_ref("/shared/source.json")
def test_browse_shell_path_never_returns_storage_uri_path(tmp_path):
    """Browse output shows the virtual folder/title path, not the storage URI path."""
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class SummaryGenerator:
        def generate(self, document, *, fields):
            return MetadataGenerationResult(
                values={"summary": "summary for physical source report"}
            )

    # Only the summary field is generated for this file.
    summary_only_policy = {
        "fields": {
            "summary": True,
            "doc_type": False,
            "domain": False,
            "topic": False,
        }
    }

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=SummaryGenerator(),
    )
    file_ref = filesystem.register_file(
        storage_uri="file:///Users/chengjie/Downloads/source/report.pdf",
        folder_path="/documents/reports",
        external_id="dsid_report",
        title="report.pdf",
        content="physical source report content",
        metadata_policy=summary_only_policy,
    )
    filesystem.semantic_retrieval_backend = BrowseBackend(["dsid_report"])

    shell = PIFSCommandExecutor(filesystem)
    rendered = shell.execute('browse /documents/reports "physical source"')

    assert "path: /documents/reports/report.pdf" in rendered
    # The physical location from storage_uri must not appear in the output.
    assert "/Users/chengjie/Downloads" not in rendered
    resolved = filesystem.store.resolve_file_ref("/documents/reports/report.pdf")
    assert resolved == file_ref
def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
@ -501,7 +537,6 @@ def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path
)
file_ref = filesystem.register_file(
storage_uri="file:///tmp/report.pdf",
source_path="examples/documents/report.pdf",
folder_path="/documents",
external_id="dsid_report",
title="report.pdf",
@ -525,14 +560,13 @@ def test_browse_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path
)
assert "source_type" not in backend.calls[0][2]
assert "source_path" not in backend.calls[0][2]
assert result["data"]["data"][0]["path"] == "/examples/documents/report.pdf"
assert result["data"]["data"][0]["path"] == "/documents/report.pdf"
assert result["data"]["data"][0]["summary"] == "Federal Reserve annual report summary"
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref
def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
def test_register_file_rejects_duplicate_title_in_folder(tmp_path):
from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
class SummaryGenerator:
@ -545,9 +579,8 @@ def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path):
workspace=tmp_path / "workspace",
metadata_generator=SummaryGenerator(),
)
first_ref = filesystem.register_file(
filesystem.register_file(
storage_uri="file:///tmp/first.json",
source_path="slack/dsid_first.json",
folder_path="/documents",
external_id="dsid_first",
title="announcements",
@ -561,34 +594,25 @@ def test_browse_path_is_unique_source_target_when_titles_collide(tmp_path):
}
},
)
filesystem.register_file(
storage_uri="file:///tmp/second.json",
source_path="slack/dsid_second.json",
folder_path="/documents",
external_id="dsid_second",
title="announcements",
content="second announcement mentions unrelated maintenance.",
metadata_policy={
"fields": {
"summary": True,
"doc_type": False,
"domain": False,
"topic": False,
}
},
)
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
executor = PIFSCommandExecutor(filesystem, json_output=True)
result = json.loads(executor.execute('browse /documents "H200 reservations"'))
assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json"
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
with pytest.raises(KeyError, match="Ambiguous file target"):
filesystem.store.resolve_file_ref("/documents/announcements")
with pytest.raises(FileExistsError, match="File already exists at /documents/announcements"):
filesystem.register_file(
storage_uri="file:///tmp/second.json",
folder_path="/documents",
external_id="dsid_second",
title="announcements",
content="second announcement mentions unrelated maintenance.",
metadata_policy={
"fields": {
"summary": True,
"doc_type": False,
"domain": False,
"topic": False,
}
},
)
def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path):
def test_browse_path_uses_virtual_title_when_storage_paths_are_unrelated(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
@ -604,7 +628,6 @@ def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path):
)
first_ref = filesystem.register_file(
storage_uri="file:///tmp/first.json",
source_path="shared/source.json",
folder_path="/documents",
external_id="dsid_first",
title="First",
@ -620,7 +643,6 @@ def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path):
)
filesystem.register_file(
storage_uri="file:///tmp/second.json",
source_path="shared/source.json",
folder_path="/documents",
external_id="dsid_second",
title="Second",
@ -639,7 +661,7 @@ def test_browse_path_falls_back_when_source_target_is_ambiguous(tmp_path):
result = json.loads(executor.execute('browse /documents "first"'))
assert result["data"]["data"][0]["path"] == "dsid_first"
assert result["data"]["data"][0]["path"] == "/documents/First"
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
@ -663,7 +685,6 @@ def test_old_semantic_commands_are_unsupported_even_when_indexes_exist(tmp_path)
)
filesystem.register_file(
storage_uri="file:///tmp/market-note.pdf",
source_path="examples/documents/market-note.pdf",
folder_path="/documents",
external_id="dsid_market_note",
title="market-note.pdf",
@ -695,13 +716,13 @@ def test_old_semantic_commands_are_unsupported_even_when_indexes_exist(tmp_path)
executor.execute('browse /documents "Federal Reserve" --space entity')
)
assert entity["data"]["data"][0]["summary"] == "Risk and compliance summary"
assert entity["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf"
assert entity["data"]["data"][0]["path"] == "/documents/market-note.pdf"
relation = json.loads(
executor.execute('browse /documents "Disney valuation" --space relation')
)
assert relation["data"]["data"][0]["summary"] == "Risk and compliance summary"
assert relation["data"]["data"][0]["path"] == "/examples/documents/market-note.pdf"
assert relation["data"]["data"][0]["path"] == "/documents/market-note.pdf"
def test_find_name_is_lexical_and_find_relation_is_not_semantic_alias(tmp_path):
@ -711,7 +732,6 @@ def test_find_name_is_lexical_and_find_relation_is_not_semantic_alias(tmp_path):
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
filesystem.register_file(
storage_uri="file:///tmp/report.pdf",
source_path="examples/documents/report.pdf",
folder_path="/documents",
external_id="dsid_report",
title="Annual report",
@ -755,7 +775,7 @@ def test_broad_recursive_grep_suggests_browse_not_removed_semantic_commands(tmp_
assert "semantic-grep" not in rendered
def test_grep_source_file_requires_terms_on_same_line(tmp_path):
def test_grep_file_requires_terms_on_same_line(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
source_dir = tmp_path / "source" / "documents"
@ -769,11 +789,10 @@ def test_grep_source_file_requires_terms_on_same_line(tmp_path):
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
filesystem.register_file(
storage_uri=str(source),
source_path="documents/split.json",
folder_path="/documents",
external_id="doc_split_terms",
title="Split source terms",
content="registered artifact without the searched tokens",
content=source.read_text(encoding="utf-8"),
)
executor = PIFSCommandExecutor(filesystem, json_output=True)
@ -813,7 +832,6 @@ def test_existing_summary_projection_index_uses_current_config_when_dimensions_m
file_ref="file_a",
external_id="doc_a",
source_type="documents",
source_path="documents/a.pdf",
title="A",
text="summary",
vector=[1.0, 0.0, 0.0],
@ -879,7 +897,6 @@ def test_existing_summary_projection_index_dimension_mismatch_rejects_retrieval(
file_ref="file_a",
external_id="doc_a",
source_type="documents",
source_path="documents/a.pdf",
title="A",
text="summary",
vector=[1.0, 0.0, 0.0],
@ -948,7 +965,6 @@ def test_browse_semantic_files_uses_summary_projection_when_only_summary_availab
)
filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/source.txt",
folder_path="/documents",
external_id="doc_summary_only",
title="Operations note",

View file

@ -60,7 +60,6 @@ def test_pageindex_structure_options_report_failed_register_build(monkeypatch):
monkeypatch.setattr(PageIndexClient, "index", fail_index)
filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/report.md",
external_id="dsid_structural_missing",
title="Structural report",
content=source.read_text(encoding="utf-8"),
@ -152,14 +151,12 @@ def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_ft
filesystem.register_file(
storage_uri=source_pdf.as_uri(),
source_path="docs/report.pdf",
external_id="dsid_pdf_extracted",
title="PDF extracted",
content="CALLER PDF CONTENT MUST NOT REACH GENERATOR",
)
filesystem.register_file(
storage_uri=source_md.as_uri(),
source_path="docs/notes.md",
external_id="dsid_md_extracted",
title="Markdown extracted",
content="CALLER MD CONTENT MUST NOT REACH GENERATOR",
@ -167,8 +164,12 @@ def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_ft
pdf_request = generator.calls[0][0]
md_request = generator.calls[1][0]
pdf_stat = filesystem.store.file_info("dsid_pdf_extracted")
md_stat = filesystem.store.file_info("dsid_md_extracted")
pdf_entry = filesystem.store.get_file(
filesystem.store.resolve_file_ref("dsid_pdf_extracted")
)
md_entry = filesystem.store.get_file(
filesystem.store.resolve_file_ref("dsid_md_extracted")
)
assert "PageIndex PDF extracted alpha text" in pdf_request.text
assert "Second PageIndex PDF extracted beta text" in pdf_request.text
@ -176,10 +177,10 @@ def test_register_pdf_markdown_uses_pageindex_extracted_text_for_metadata_and_ft
assert "PageIndex Markdown extracted gamma text" in md_request.text
assert "CALLER MD CONTENT" not in md_request.text
assert "PageIndex PDF extracted alpha text" in Path(
pdf_stat["text_artifact_path"]
pdf_entry.text_artifact_path
).read_text(encoding="utf-8")
assert "PageIndex Markdown extracted gamma text" in Path(
md_stat["text_artifact_path"]
md_entry.text_artifact_path
).read_text(encoding="utf-8")
assert [r.external_id for r in filesystem.search("alpha beta", limit=5)] == [
"dsid_pdf_extracted"
@ -207,7 +208,6 @@ def test_register_text_metadata_generation_keeps_caller_content_without_pageinde
filesystem.register_file(
storage_uri="file:///tmp/readme.txt",
source_path="docs/readme.txt",
external_id="dsid_text_generation",
title="Text generation",
content="Plain text caller content stays authoritative.",
@ -215,11 +215,14 @@ def test_register_text_metadata_generation_keeps_caller_content_without_pageinde
)
stat = filesystem.store.file_info("dsid_text_generation")
entry = filesystem.store.get_file(
filesystem.store.resolve_file_ref("dsid_text_generation")
)
assert generator.calls[0][0].text == "Plain text caller content stays authoritative."
assert stat["pageindex_doc_id"] is None
assert stat["pageindex_tree_status"] == "not_built"
assert Path(stat["text_artifact_path"]).read_text(
assert Path(entry.text_artifact_path).read_text(
encoding="utf-8"
) == "Plain text caller content stays authoritative."
@ -261,14 +264,12 @@ def test_register_pdf_markdown_cache_miss_invokes_pageindex_client_index(monkeyp
filesystem.register_file(
storage_uri=str(source_pdf),
source_path="docs/report.pdf",
external_id="dsid_pdf_build",
title="PDF build",
content="pdf text",
)
filesystem.register_file(
storage_uri=source_md.as_uri(),
source_path="docs/notes.md",
external_id="dsid_md_build",
title="Markdown build",
content=source_md.read_text(encoding="utf-8"),
@ -332,7 +333,6 @@ def test_cat_structure_page_reuses_pageindex_client_cache_without_indexing(monke
monkeypatch.setattr(PageIndexClient, "index", fail_index)
filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/report.pdf",
external_id="dsid_structural_cached",
title="Cached structural report",
content="text artifact remains available for grep, not cat --all",
@ -370,7 +370,6 @@ def test_cat_node_is_not_supported():
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
filesystem.register_file(
storage_uri="file:///tmp/notes.md",
source_path="docs/notes.md",
external_id="dsid_md_cached",
title="Cached markdown notes",
content="# Notes\n\nBody",
@ -419,7 +418,6 @@ def test_cat_structure_page_and_text_outputs_are_hard_limited():
)
filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/report.pdf",
external_id="dsid_limited_pdf",
title="Limited structural report",
content="text artifact remains available for grep",
@ -427,7 +425,6 @@ def test_cat_structure_page_and_text_outputs_are_hard_limited():
text_content = "\n".join(f"line {index}" for index in range(1, 106))
filesystem.register_file(
storage_uri="file:///tmp/long.txt",
source_path="docs/long.txt",
external_id="dsid_long_text",
title="Long text",
content=text_content,
@ -474,7 +471,6 @@ def test_tree_folder_behavior_is_preserved():
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
filesystem.register_file(
storage_uri="file:///tmp/report.txt",
source_path="docs/report.txt",
folder_path="/docs/reports",
external_id="dsid_folder_tree",
title="Folder report",
@ -514,7 +510,6 @@ def test_tree_does_not_read_file_internal_pageindex_structure():
)
filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/report.pdf",
external_id="dsid_tree_is_folder_only",
title="Cached structural report",
content="text artifact remains available",
@ -536,28 +531,24 @@ def test_cat_all_is_limited_to_text_files():
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
filesystem.register_file(
storage_uri="file:///tmp/readme.txt",
source_path="docs/readme.txt",
external_id="dsid_text_file",
title="Text readme",
content="plain text body",
)
filesystem.register_file(
storage_uri="file:///tmp/report.pdf",
source_path="docs/report.pdf",
external_id="dsid_pdf_file",
title="PDF report",
content="extracted text should not be served through cat --all",
)
filesystem.register_file(
storage_uri="file:///tmp/notes.md",
source_path="docs/notes.md",
external_id="dsid_md_file",
title="Markdown notes",
content="markdown text should use PageIndex structure reads",
)
filesystem.register_file(
storage_uri="file:///tmp/data.json",
source_path="docs/data.json",
external_id="dsid_json_file",
title="JSON record",
content='{"body":"json"}',
@ -589,7 +580,6 @@ def test_pageindex_structure_commands_are_limited_to_pdf_and_markdown():
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
filesystem.register_file(
storage_uri="file:///tmp/readme.txt",
source_path="docs/readme.txt",
external_id="dsid_text_only",
title="Text readme",
content="plain text body",
@ -617,7 +607,6 @@ def test_existing_pageindex_status_allows_legacy_record_without_format_suffix():
filesystem = PageIndexFileSystem(workspace=Path(tmp) / "workspace")
file_ref = filesystem.register_file(
storage_uri=source.as_uri(),
source_path="uploads/uploaded",
external_id="dsid_legacy_pageindex",
title="Legacy PageIndex record",
content="text/plain is only a weak default here",
@ -665,7 +654,6 @@ def test_read_commands_do_not_link_pageindex_cache_when_pointer_is_missing(monke
monkeypatch.setattr(PageIndexClient, "index", fail_index)
filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/late.md",
external_id="dsid_late_cache",
title="Late cache",
content=source.read_text(encoding="utf-8"),

View file

@ -80,12 +80,13 @@ def test_add_text_folder_target_copies_artifact_indexes_summary_and_is_readable(
info = filesystem.add_file(str(source), "/documents/reports")
assert info["source_path"] == "documents/reports/filing.txt"
assert info["path"] == "/documents/reports/filing.txt"
assert info["folder_path"] == "/documents/reports"
assert filesystem.folder_info("/documents/reports")["path"] == "/documents/reports"
assert info["storage_uri"] != source.as_uri()
assert "/artifacts/uploads/" in info["storage_uri"]
copied_path = Path(info["storage_uri"].removeprefix("file://"))
entry = filesystem.store.get_file(info["file_ref"])
assert entry.storage_uri != source.as_uri()
assert "/artifacts/uploads/" in entry.storage_uri
copied_path = Path(entry.storage_uri.removeprefix("file://"))
assert copied_path.read_text(encoding="utf-8") == "alpha filing text for pifs add"
assert copied_path.resolve() != source.resolve()
@ -164,7 +165,7 @@ def test_add_configures_semantic_retrieval_in_same_filesystem_instance(tmp_path)
recursive=True,
page_size=5,
)
assert [item["source_path"] for item in results["data"]] == ["documents/semantic.txt"]
assert [item["path"] for item in results["data"]] == ["/documents/semantic.txt"]
def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monkeypatch):
@ -205,10 +206,11 @@ def test_add_markdown_builds_pageindex_tree_from_copied_artifact(tmp_path, monke
info = filesystem.add_file(source, "/documents")
executor = PIFSCommandExecutor(filesystem, json_output=True)
structure = json.loads(executor.execute("cat /documents/notes.md --structure"))
entry = filesystem.store.get_file(info["file_ref"])
assert structure["data"]["available"] is True
assert structure["data"]["structure"][0]["title"] == "Notes"
assert indexed_paths == [Path(info["storage_uri"].removeprefix("file://"))]
assert indexed_paths == [Path(entry.storage_uri.removeprefix("file://"))]
assert indexed_paths[0].resolve() != source.resolve()
@ -469,8 +471,6 @@ def test_cli_add_uses_workspace_and_prints_added_file(monkeypatch, capsys, tmp_p
return {
"file_ref": "file_cli",
"path": "/documents/cli.txt",
"source_path": "documents/cli.txt",
"storage_uri": "file:///workspace/artifacts/uploads/file_cli/cli.txt",
}
monkeypatch.setattr(cli, "PageIndexFileSystem", FakeAddFileSystem)
@ -482,5 +482,4 @@ def test_cli_add_uses_workspace_and_prints_added_file(monkeypatch, capsys, tmp_p
assert capsys.readouterr().out == (
"added: /documents/cli.txt\n"
"file_ref: file_cli\n"
"storage_uri: file:///workspace/artifacts/uploads/file_cli/cli.txt\n"
)

View file

@ -76,7 +76,6 @@ def test_cli_workspace_surfaces_projection_dimension_mismatch(tmp_path):
file_ref="file_a",
external_id="doc_a",
source_type="documents",
source_path="documents/a.pdf",
title="A",
text="summary",
vector=[1.0, 0.0, 0.0],
@ -226,6 +225,28 @@ def test_cli_ask_invokes_agent_with_question(monkeypatch, capsys, tmp_path):
}
def test_cli_ask_defaults_to_global_agent_model(monkeypatch, capsys, tmp_path):
    """`ask` hands model "gpt-5.4" to the agent when no model env var is set."""
    from pageindex.filesystem import cli

    recorded_kwargs = []

    def fake_run_pifs_agent(filesystem, question, **kwargs):
        recorded_kwargs.append(kwargs)
        return "agent answer"

    # Clear both model overrides so the built-in default is exercised.
    for env_name in ("PIFS_AGENT_MODEL", "PIFS_MODEL"):
        monkeypatch.delenv(env_name, raising=False)
    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
    monkeypatch.setattr(cli, "run_pifs_agent", fake_run_pifs_agent)

    workspace = tmp_path / "workspace"
    exit_code = cli.main(["ask", "--workspace", str(workspace), "What?"])

    assert exit_code == 0
    assert capsys.readouterr().out == "agent answer\n"
    assert recorded_kwargs[0]["model"] == "gpt-5.4"
def test_cli_ask_loads_env_file_before_running_agent(monkeypatch, capsys, tmp_path):
from pageindex.filesystem import cli

View file

@ -24,7 +24,6 @@ def _register_find_fixture(tmp_path: Path):
source.write_text(f"{title} fixture text", encoding="utf-8")
filesystem.register_file(
storage_uri=source.as_uri(),
source_path=f"docs/{filename}",
folder_path=folder_path,
external_id=external_id,
title=title,
@ -145,7 +144,6 @@ def test_stat_shell_output_includes_unified_metadata_status(tmp_path):
)
filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/source.txt",
folder_path="/documents",
external_id="doc_generated",
title="Generated metadata document",
@ -196,7 +194,6 @@ def test_stat_field_reads_one_metadata_field_across_multiple_targets(tmp_path):
source.write_text(f"fixture text {index}", encoding="utf-8")
filesystem.register_file(
storage_uri=source.as_uri(),
source_path=f"docs/source{index}.txt",
folder_path="/documents",
external_id=f"doc_summary_{index}",
title=f"Summary document {index}",
@ -249,7 +246,6 @@ def test_stat_field_rejects_more_than_twenty_targets(tmp_path):
source.write_text(f"fixture text {index}", encoding="utf-8")
filesystem.register_file(
storage_uri=source.as_uri(),
source_path=f"docs/source{index}.txt",
folder_path="/documents",
external_id=f"doc_{index}",
title=f"Document {index}",
@ -273,7 +269,6 @@ def test_register_rejects_pifs_owned_metadata_fields(tmp_path):
with pytest.raises(ValueError, match="PIFS-owned generated field"):
filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/source.txt",
folder_path="/documents",
external_id="doc_conflict",
title="Conflict document",
@ -299,7 +294,6 @@ def test_batch_metadata_status_generates_into_unified_metadata(tmp_path):
)
file_ref = filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/source.txt",
folder_path="/documents",
external_id="doc_batch",
title="Batch document",

View file

@ -14,7 +14,6 @@ def _register_file(
source.write_text(f"{external_id} fixture text", encoding="utf-8")
filesystem.register_file(
storage_uri=source.as_uri(),
source_path=f"docs/{filename}",
folder_path=folder_path,
external_id=external_id,
title=external_id,

View file

@ -7,7 +7,6 @@ def test_root_virtual_file_path_resolves_without_double_slash(tmp_path):
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
file_ref = filesystem.register_file(
storage_uri="file:///tmp/root-source.txt",
source_path="sources/root-source.txt",
folder_path="/",
external_id="doc_root_title",
title="Root Title",
@ -17,13 +16,12 @@ def test_root_virtual_file_path_resolves_without_double_slash(tmp_path):
assert filesystem.store.resolve_file_ref("/Root Title") == file_ref
def test_ambiguous_virtual_file_path_raises_clear_error(tmp_path):
def test_nested_virtual_file_path_resolves_by_folder_and_title(tmp_path):
from pageindex.filesystem import PageIndexFileSystem
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
first_ref = filesystem.register_file(
storage_uri="file:///tmp/first.txt",
source_path="b/file.txt",
folder_path="/a",
external_id="doc_first",
title="First",
@ -31,26 +29,23 @@ def test_ambiguous_virtual_file_path_raises_clear_error(tmp_path):
)
second_ref = filesystem.register_file(
storage_uri="file:///tmp/second.txt",
source_path="second-source.txt",
folder_path="/a/b",
external_id="doc_second",
title="file.txt",
content="second content",
)
with pytest.raises(KeyError, match="Ambiguous file target"):
filesystem.store.resolve_file_ref("/a/b/file.txt")
assert filesystem.store.resolve_file_ref("/a/b/file.txt") == second_ref
assert first_ref != second_ref
def test_duplicate_source_path_target_raises_clear_error(tmp_path):
def test_unknown_virtual_file_target_raises_clear_error(tmp_path):
from pageindex.filesystem import PageIndexFileSystem
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
first_ref = filesystem.register_file(
storage_uri="file:///tmp/first.txt",
source_path="shared/source.txt",
folder_path="/first",
external_id="doc_first",
title="First",
@ -58,14 +53,13 @@ def test_duplicate_source_path_target_raises_clear_error(tmp_path):
)
second_ref = filesystem.register_file(
storage_uri="file:///tmp/second.txt",
source_path="shared/source.txt",
folder_path="/second",
external_id="doc_second",
title="Second",
content="second content",
)
with pytest.raises(KeyError, match="Ambiguous file target"):
filesystem.store.resolve_file_ref("/shared/source.txt")
with pytest.raises(KeyError, match="Unknown file target"):
filesystem.store.resolve_file_ref("/shared/missing.txt")
assert first_ref != second_ref

View file

@ -40,7 +40,6 @@ def test_register_insert_failure_cleans_owned_artifacts_and_skips_projection(
with pytest.raises(RuntimeError, match="catalog insert failed"):
filesystem.register_file(
storage_uri=source.as_uri(),
source_path="docs/source.txt",
folder_path="/documents",
external_id="doc_insert_failure",
title="Insert failure",

View file

@ -31,7 +31,6 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path):
file_ref="file_a",
external_id="doc_a",
source_type="github",
source_path="github/a.json",
title="Multipart upload limits",
text="multipart upload limits",
vector=[1.0, 0.0, 0.0],
@ -41,7 +40,6 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path):
file_ref="file_b",
external_id="doc_b",
source_type="slack",
source_path="slack/b.json",
title="GPU cache issue",
text="gpu cache issue",
vector=[0.0, 1.0, 0.0],
@ -72,7 +70,6 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm
file_ref=f"file_off_{item:02d}",
external_id=f"doc_off_{item:02d}",
source_type="documents",
source_path=f"other/{item:02d}.pdf",
title=f"Off scope {item:02d}",
text="off scope",
vector=[1.0, 0.0],
@ -84,7 +81,6 @@ def test_sqlite_vec_semantic_index_file_ref_filter_not_limited_by_global_rank(tm
file_ref="file_in_scope",
external_id="doc_in_scope",
source_type="documents",
source_path="documents/in-scope.pdf",
title="In scope",
text="in scope",
vector=[0.0, 1.0],
@ -117,7 +113,6 @@ def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
"file_ref": "file_a",
"external_id": "doc_a",
"source_type": "documents",
"source_path": "docs/a.pdf",
"title": "A",
"metadata": {
"summary": "Unified metadata summary.",
@ -153,7 +148,6 @@ def test_summary_projection_indexer_defaults_to_1024_dimensions(tmp_path):
"file_ref": "file_a",
"external_id": "doc_a",
"source_type": "documents",
"source_path": "docs/a.pdf",
"title": "A",
"metadata": {"summary": "Default dimension summary."},
}
@ -180,7 +174,6 @@ def test_summary_projection_indexer_allows_explicit_256_dimensions(tmp_path):
"file_ref": "file_a",
"external_id": "doc_a",
"source_type": "documents",
"source_path": "docs/a.pdf",
"title": "A",
"metadata": {"summary": "Explicit 256 dimension summary."},
}
@ -304,7 +297,6 @@ def test_summary_projection_dimension_mismatch_preserves_existing_index(tmp_path
file_ref="file_a",
external_id="doc_a",
source_type="documents",
source_path="docs/a.pdf",
title="A",
text="summary",
vector=[1.0, 0.0, 0.0],