mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
fix(filesystem): restore summary vector search in cli
This commit is contained in:
parent
7e70b580f0
commit
fc0be1aeee
6 changed files with 147 additions and 3 deletions
|
|
@ -42,8 +42,10 @@ Run a command in the PageIndex FileSystem virtual shell. This is not a real
|
|||
operating-system shell. By default the tool is read-only: use ls, tree, find,
|
||||
grep, cat, stat, head, tail, sed, and any dynamically available semantic search
|
||||
commands described in the workspace context. grep -R is lexical evidence search;
|
||||
semantic search commands return candidate documents and do not guarantee literal
|
||||
text matches. Errors are returned as text prefixed with ERROR. Do not call
|
||||
semantic search commands such as search-summary return candidate documents and
|
||||
do not guarantee literal text matches. Use search-summary when the user asks for
|
||||
summary search, semantic search, or vector search and the command is listed as
|
||||
available. Errors are returned as text prefixed with ERROR. Do not call
|
||||
commands that are not listed as available. When evidence is required, inspect it
|
||||
with cat or grep before answering. Prefer shell-like target-first cat syntax
|
||||
with stable targets: cat <path> --structure, cat <path> --page 31-59, and
|
||||
|
|
@ -64,6 +66,7 @@ Tool policy:
|
|||
- Use --where only with metadata fields shown by stat --schema.
|
||||
- grep -R performs lexical evidence search.
|
||||
- Semantic search commands are candidate-discovery tools and do not guarantee literal text matches.
|
||||
- If search-summary is available and the user asks for summary search, semantic search, vector search, or "用 summary 搜", use search-summary <query> <folder>; do not translate that request into find --where.
|
||||
- Tool errors are returned as ERROR text; recover by trying an available command.
|
||||
- Use cat or grep to gather evidence before making source-backed claims.
|
||||
- Prefer target-first cat syntax with stable targets: cat <path> --structure, cat <path> --page 31-59, cat <path> --node <node_id>.
|
||||
|
|
|
|||
|
|
@ -122,7 +122,10 @@ def _parse_agent_command(
|
|||
|
||||
|
||||
def _filesystem_from_workspace(workspace: str) -> PageIndexFileSystem:
    """Open the PageIndex filesystem rooted at *workspace* for CLI use.

    The path is user-expanded (``~``) before the filesystem is constructed.
    The filesystem is then asked to attach retrieval to projection indexes
    that already exist by calling ``configure_existing_projection_retrieval()``.

    That second step is best-effort: any exception it raises is logged at
    debug level and the filesystem is returned anyway, so a broken or missing
    index never prevents the CLI from opening the workspace.
    """
    import logging

    filesystem = PageIndexFileSystem(Path(workspace).expanduser())
    try:
        filesystem.configure_existing_projection_retrieval()
    except Exception:
        # Deliberately broad: retrieval is optional here, but keep a trace so
        # an unexpectedly missing search command can be diagnosed.
        logging.getLogger(__name__).debug(
            "Could not attach existing projection retrieval for workspace %s",
            workspace,
            exc_info=True,
        )
    return filesystem
|
||||
|
||||
|
||||
def _agent_kwargs(args: argparse.Namespace) -> dict[str, object]:
|
||||
|
|
|
|||
|
|
@ -218,6 +218,64 @@ class PageIndexFileSystem:
|
|||
embedding_timeout=self.summary_projection_embedding_timeout,
|
||||
)
|
||||
|
||||
def configure_existing_projection_retrieval(self) -> bool:
    """Wire semantic retrieval to projection indexes that are already on disk.

    Index files are built at register time. When an existing workspace is
    opened, this lets the matching read commands (for example
    search-summary) work without registering again.

    If a retrieval backend is already set, nothing is reconfigured. Otherwise
    the first usable index reported by ``_existing_projection_index_config()``
    supplies the embedding provider, model and dimensions, each falling back
    to the filesystem's own summary-projection settings when the index does
    not record it.

    Returns:
        ``True`` when ``semantic_retrieval_channels()`` is non-empty after the
        call, ``False`` otherwise (including when no usable index exists).
    """
    if self.semantic_retrieval_backend is None:
        existing = self._existing_projection_index_config()
        if existing is None:
            return False

        recorded = dict(existing.get("metadata") or {})
        provider = recorded.get("embedding_provider") or self.summary_projection_embedding_provider
        model = recorded.get("embedding_model") or self.summary_projection_embedding_model
        # Prefer the dimension stored in metadata, then the index's own
        # reported dimension, then the configured default.
        dimensions = (
            recorded.get("embedding_dimensions")
            or existing.get("dimension")
            or self.summary_projection_embedding_dimensions
        )

        self.configure_hybrid_projection_retrieval(
            self.summary_projection_index_dir,
            embedding_provider=str(provider),
            embedding_model=str(model),
            embedding_dimensions=int(dimensions),
            embedding_timeout=self.summary_projection_embedding_timeout,
        )

    return bool(self.semantic_retrieval_channels())
|
||||
|
||||
def _existing_projection_index_config(self) -> dict[str, Any] | None:
    """Return ``info()`` of the first usable on-disk projection index, if any.

    Channels are probed in ``SEMANTIC_RETRIEVAL_CHANNELS`` order. A channel is
    skipped when it has no entry in ``INDEX_BY_CHANNEL``, its
    ``<index_name>.sqlite`` file is absent from the summary projection index
    directory, reading its info raises, it reports no documents, or its
    recorded ``channel`` metadata names a different channel.

    Returns:
        The info mapping of the first index that passes every check, or
        ``None`` when no channel qualifies.
    """
    from .hybrid_projection import INDEX_BY_CHANNEL
    from .semantic_index import SQLiteVecSemanticIndex

    for channel_name in SEMANTIC_RETRIEVAL_CHANNELS:
        mapped_index = INDEX_BY_CHANNEL.get(channel_name)
        if not mapped_index:
            continue

        sqlite_file = self.summary_projection_index_dir / f"{mapped_index}.sqlite"
        if not sqlite_file.exists():
            continue

        try:
            details = SQLiteVecSemanticIndex(sqlite_file).info()
        except Exception:
            # An unreadable index is treated the same as a missing one.
            continue

        if int(details.get("document_count") or 0) <= 0:
            continue

        recorded_channel = dict(details.get("metadata") or {}).get("channel")
        # An index that records no channel is accepted for whichever channel
        # mapped to its file name.
        if not recorded_channel or recorded_channel == channel_name:
            return details

    return None
|
||||
|
||||
@staticmethod
|
||||
def _register_uses_deferred_metadata(policy: Any) -> bool:
|
||||
if not isinstance(policy, dict):
|
||||
|
|
|
|||
|
|
@ -58,3 +58,60 @@ def test_semantic_search_scope_filters_explicit_source_type_facets():
|
|||
{"folder_path": "/documents"}
|
||||
) == {}
|
||||
|
||||
|
||||
def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch):
    """A populated summary index on disk is attached when the workspace opens."""
    from pageindex.filesystem import PageIndexFileSystem
    from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex

    # Seed a one-document summary index where the filesystem looks for it.
    workspace = tmp_path / "workspace"
    projection_dir = workspace / "artifacts" / "projection_indexes"
    seeded_index = SQLiteVecSemanticIndex(projection_dir / "summary_only_vector.sqlite")
    seeded_index.reset(
        dimension=3,
        metadata={
            "channel": "summary",
            "embedding_provider": "openai",
            "embedding_model": "test-embedding",
            "embedding_dimensions": 3,
        },
    )
    only_record = SemanticIndexRecord(
        file_ref="file_a",
        external_id="doc_a",
        source_type="documents",
        source_path="documents/a.pdf",
        title="A",
        text="summary",
        vector=[1.0, 0.0, 0.0],
    )
    seeded_index.upsert_many([only_record])

    filesystem = PageIndexFileSystem(workspace)
    recorded_calls = []

    def fake_configure(index_dir_arg, **kwargs):
        # Stand in for the real configuration: note the call and install a
        # backend so the filesystem reports a summary channel.
        recorded_calls.append((index_dir_arg, kwargs))
        filesystem.semantic_retrieval_backend = SummaryBackend("doc_a")
        return filesystem.semantic_retrieval_backend

    monkeypatch.setattr(filesystem, "configure_hybrid_projection_retrieval", fake_configure)

    assert filesystem.configure_existing_projection_retrieval() is True

    expected_kwargs = {
        "embedding_provider": "openai",
        "embedding_model": "test-embedding",
        "embedding_dimensions": 3,
        "embedding_timeout": 60,
    }
    assert recorded_calls == [(filesystem.summary_projection_index_dir, expected_kwargs)]
    assert filesystem.semantic_retrieval_channels() == ("summary",)
|
||||
|
|
|
|||
|
|
@ -204,6 +204,11 @@ class PIFSAgentStreamTest(unittest.TestCase):
|
|||
self.assertIn("do not infer metadata presence or absence", AGENT_TOOL_POLICY)
|
||||
self.assertIn("questions about metadata fields", BASH_TOOL_DESCRIPTION)
|
||||
|
||||
def test_prompt_routes_summary_search_to_search_summary(self):
    """Both prompts must steer summary/semantic requests to search-summary."""
    required_phrases = (
        ("search-summary when the user asks for", BASH_TOOL_DESCRIPTION),
        ("use search-summary <query> <folder>", AGENT_TOOL_POLICY),
        ("do not translate that request into find --where", AGENT_TOOL_POLICY),
    )
    for phrase, prompt_text in required_phrases:
        self.assertIn(phrase, prompt_text)
|
||||
|
||||
def test_system_prompt_sets_workspace_identity_and_scope(self):
|
||||
self.assertIn("PageIndex FileSystem Demo Agent", AGENT_SYSTEM_PROMPT)
|
||||
self.assertIn("VectifyAI Team", AGENT_SYSTEM_PROMPT)
|
||||
|
|
|
|||
|
|
@ -5,6 +5,24 @@ from pathlib import Path
|
|||
class FakeFileSystem:
    """Minimal stand-in for the real filesystem used by the CLI tests.

    It remembers the workspace it was opened with and whether
    ``configure_existing_projection_retrieval`` has been called.
    """

    def __init__(self, workspace) -> None:
        # Not configured until configure_existing_projection_retrieval() runs.
        self.projection_retrieval_configured = False
        self.workspace = Path(workspace)

    def configure_existing_projection_retrieval(self) -> bool:
        """Record that retrieval configuration was requested and report success."""
        self.projection_retrieval_configured = True
        return True
|
||||
|
||||
|
||||
def test_cli_workspace_configures_existing_projection_retrieval(monkeypatch, tmp_path):
    """Opening a workspace through the CLI helper triggers retrieval setup."""
    from pageindex.filesystem import cli

    monkeypatch.setattr(cli, "PageIndexFileSystem", FakeFileSystem)
    workspace_dir = tmp_path / "workspace"

    opened = cli._filesystem_from_workspace(str(workspace_dir))

    # The helper must pass the path through and call the configure hook.
    assert opened.workspace == workspace_dir
    assert opened.projection_retrieval_configured is True
|
||||
|
||||
|
||||
def test_cli_passthrough_invokes_pifs_command_executor(monkeypatch, capsys, tmp_path):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue