fix(filesystem): return stable semantic search targets

This commit is contained in:
BukeLy 2026-05-27 03:48:44 +08:00
parent 0243732e22
commit 977f596666
4 changed files with 186 additions and 12 deletions

View file

@ -1337,6 +1337,23 @@ class PIFSCommandExecutor:
return f"{folder}/{title}" if folder else f"/{title}"
return str(item.get("source_path") or item.get("external_id") or file_ref or "-")
def _stable_file_target_path(self, item: dict[str, Any]) -> str:
file_ref = str(item.get("file_ref") or "").strip()
source_path = str(item.get("source_path") or "").strip()
if source_path:
target = "/" + source_path.strip("/")
try:
if not file_ref or self.filesystem.store.resolve_file_ref(target) == file_ref:
return target
except KeyError:
pass
external_id = str(item.get("external_id") or "").strip()
if external_id:
return external_id
if file_ref:
return file_ref
return str(item.get("external_id") or item.get("file_ref") or "-")
def _semantic_retrieval_query(self, query: str) -> str:
query = str(query or "").strip()
context = str(self.query_context or "").strip()
@ -1464,7 +1481,7 @@ class PIFSCommandExecutor:
if text:
line_text = f"{line}: {self._compact_text(text, max_chars=220)}"
hit = {
"path": self._file_target_path(
"path": self._stable_file_target_path(
{
"file_ref": result.file_ref,
"title": result.title,

View file

@ -1050,12 +1050,29 @@ class SQLiteFileSystemStore:
if row:
return row["file_ref"]
stripped = target.strip("/")
row = conn.execute(
"SELECT file_ref FROM files WHERE source_path = ? AND deleted_at IS NULL",
rows = conn.execute(
"""
SELECT
f.file_ref,
f.external_id,
f.title,
f.source_path,
COALESCE(MIN(fo.path), '/') AS folder_path
FROM files f
LEFT JOIN file_folders ff ON ff.file_ref = f.file_ref
LEFT JOIN folders fo ON fo.folder_id = ff.folder_id
WHERE f.source_path = ? AND f.deleted_at IS NULL
GROUP BY f.file_ref, f.external_id, f.title, f.source_path
ORDER BY f.file_ref
LIMIT 2
""",
(stripped,),
).fetchone()
if row:
return row["file_ref"]
).fetchall()
if len(rows) > 1:
matches = "; ".join(self._virtual_match_summary(row) for row in rows)
raise KeyError(f"Ambiguous file target: {target}. Matches: {matches}")
if rows:
return rows[0]["file_ref"]
virtual_file_ref = self._resolve_virtual_file_ref(conn, target)
if virtual_file_ref:
return virtual_file_ref

View file

@ -83,7 +83,7 @@ def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters
workspace=tmp_path / "workspace",
metadata_generator=SummaryGenerator(),
)
filesystem.register_file(
file_ref = filesystem.register_file(
storage_uri="file:///tmp/report.pdf",
source_path="examples/documents/report.pdf",
folder_path="/documents",
@ -110,20 +110,133 @@ def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters
assert backend.calls[0][2] == {}
assert result["data"]["data"][0] == {
"path": "/documents/report.pdf",
"path": "/examples/documents/report.pdf",
"summary": "Federal Reserve annual report summary",
"line_text": "1: Federal Reserve supervision and regulation annual report.",
}
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref
executor.json_output = False
rendered = executor.execute('search-summary "Federal Reserve annual report" /documents')
assert "path: /documents/report.pdf" in rendered
assert "path: /examples/documents/report.pdf" in rendered
assert "summary: Federal Reserve annual report summary" in rendered
assert "line_text: 1: Federal Reserve supervision and regulation annual report." in rendered
assert "id=dsid_report" not in rendered
assert "file_ref=" not in rendered
def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
class SummaryGenerator:
def generate(self, document, *, fields):
return MetadataGenerationResult(
values={"summary": f"summary for {document.external_id}"}
)
filesystem = PageIndexFileSystem(
workspace=tmp_path / "workspace",
metadata_generator=SummaryGenerator(),
)
first_ref = filesystem.register_file(
storage_uri="file:///tmp/first.json",
source_path="slack/dsid_first.json",
folder_path="/documents",
external_id="dsid_first",
title="announcements",
content="first announcement mentions H200 reservations.",
metadata_policy={
"fields": {
"summary": True,
"doc_type": False,
"domain": False,
"topic": False,
}
},
)
filesystem.register_file(
storage_uri="file:///tmp/second.json",
source_path="slack/dsid_second.json",
folder_path="/documents",
external_id="dsid_second",
title="announcements",
content="second announcement mentions unrelated maintenance.",
metadata_policy={
"fields": {
"summary": True,
"doc_type": False,
"domain": False,
"topic": False,
}
},
)
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
executor = PIFSCommandExecutor(filesystem, json_output=True)
result = json.loads(executor.execute('search-summary "H200 reservations" /documents'))
assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json"
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
with pytest.raises(KeyError, match="Ambiguous file target"):
filesystem.store.resolve_file_ref("/documents/announcements")
def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
class SummaryGenerator:
def generate(self, document, *, fields):
return MetadataGenerationResult(
values={"summary": f"summary for {document.external_id}"}
)
filesystem = PageIndexFileSystem(
workspace=tmp_path / "workspace",
metadata_generator=SummaryGenerator(),
)
first_ref = filesystem.register_file(
storage_uri="file:///tmp/first.json",
source_path="shared/source.json",
folder_path="/documents",
external_id="dsid_first",
title="First",
content="first content",
metadata_policy={
"fields": {
"summary": True,
"doc_type": False,
"domain": False,
"topic": False,
}
},
)
filesystem.register_file(
storage_uri="file:///tmp/second.json",
source_path="shared/source.json",
folder_path="/documents",
external_id="dsid_second",
title="Second",
content="second content",
metadata_policy={
"fields": {
"summary": True,
"doc_type": False,
"domain": False,
"topic": False,
}
},
)
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
executor = PIFSCommandExecutor(filesystem, json_output=True)
result = json.loads(executor.execute('search-summary "first" /documents'))
assert result["data"]["data"][0]["path"] == "dsid_first"
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
@ -164,7 +277,7 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents'))
assert entity["data"]["data"][0] == {
"path": "/documents/market-note.pdf",
"path": "/examples/documents/market-note.pdf",
"summary": "Risk and compliance summary",
"line_text": "1: Federal Reserve policy affects Disney valuation.",
"entity": "Federal Reserve; Disney",
@ -172,7 +285,7 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
relation = json.loads(executor.execute('search-relation "Disney valuation" /documents'))
assert relation["data"]["data"][0] == {
"path": "/documents/market-note.pdf",
"path": "/examples/documents/market-note.pdf",
"summary": "Risk and compliance summary",
"line_text": "1: Federal Reserve policy affects Disney valuation.",
"relation": "Federal Reserve affects Disney valuation",
@ -180,7 +293,7 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
executor.json_output = False
rendered = executor.execute('search-entity "Federal Reserve" /documents')
assert "path: /documents/market-note.pdf" in rendered
assert "path: /examples/documents/market-note.pdf" in rendered
assert "summary: Risk and compliance summary" in rendered
assert "entity: Federal Reserve; Disney" in rendered
assert "file_ref=" not in rendered

View file

@ -42,3 +42,30 @@ def test_ambiguous_virtual_file_path_raises_clear_error(tmp_path):
filesystem.store.resolve_file_ref("/a/b/file.txt")
assert first_ref != second_ref
def test_duplicate_source_path_target_raises_clear_error(tmp_path):
from pageindex.filesystem import PageIndexFileSystem
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
first_ref = filesystem.register_file(
storage_uri="file:///tmp/first.txt",
source_path="shared/source.txt",
folder_path="/first",
external_id="doc_first",
title="First",
content="first content",
)
second_ref = filesystem.register_file(
storage_uri="file:///tmp/second.txt",
source_path="shared/source.txt",
folder_path="/second",
external_id="doc_second",
title="Second",
content="second content",
)
with pytest.raises(KeyError, match="Ambiguous file target"):
filesystem.store.resolve_file_ref("/shared/source.txt")
assert first_ref != second_ref