mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
fix(filesystem): return stable semantic search targets
This commit is contained in:
parent
0243732e22
commit
977f596666
4 changed files with 186 additions and 12 deletions
|
|
@ -1337,6 +1337,23 @@ class PIFSCommandExecutor:
|
|||
return f"{folder}/{title}" if folder else f"/{title}"
|
||||
return str(item.get("source_path") or item.get("external_id") or file_ref or "-")
|
||||
|
||||
def _stable_file_target_path(self, item: dict[str, Any]) -> str:
|
||||
file_ref = str(item.get("file_ref") or "").strip()
|
||||
source_path = str(item.get("source_path") or "").strip()
|
||||
if source_path:
|
||||
target = "/" + source_path.strip("/")
|
||||
try:
|
||||
if not file_ref or self.filesystem.store.resolve_file_ref(target) == file_ref:
|
||||
return target
|
||||
except KeyError:
|
||||
pass
|
||||
external_id = str(item.get("external_id") or "").strip()
|
||||
if external_id:
|
||||
return external_id
|
||||
if file_ref:
|
||||
return file_ref
|
||||
return str(item.get("external_id") or item.get("file_ref") or "-")
|
||||
|
||||
def _semantic_retrieval_query(self, query: str) -> str:
|
||||
query = str(query or "").strip()
|
||||
context = str(self.query_context or "").strip()
|
||||
|
|
@ -1464,7 +1481,7 @@ class PIFSCommandExecutor:
|
|||
if text:
|
||||
line_text = f"{line}: {self._compact_text(text, max_chars=220)}"
|
||||
hit = {
|
||||
"path": self._file_target_path(
|
||||
"path": self._stable_file_target_path(
|
||||
{
|
||||
"file_ref": result.file_ref,
|
||||
"title": result.title,
|
||||
|
|
|
|||
|
|
@ -1050,12 +1050,29 @@ class SQLiteFileSystemStore:
|
|||
if row:
|
||||
return row["file_ref"]
|
||||
stripped = target.strip("/")
|
||||
row = conn.execute(
|
||||
"SELECT file_ref FROM files WHERE source_path = ? AND deleted_at IS NULL",
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT
|
||||
f.file_ref,
|
||||
f.external_id,
|
||||
f.title,
|
||||
f.source_path,
|
||||
COALESCE(MIN(fo.path), '/') AS folder_path
|
||||
FROM files f
|
||||
LEFT JOIN file_folders ff ON ff.file_ref = f.file_ref
|
||||
LEFT JOIN folders fo ON fo.folder_id = ff.folder_id
|
||||
WHERE f.source_path = ? AND f.deleted_at IS NULL
|
||||
GROUP BY f.file_ref, f.external_id, f.title, f.source_path
|
||||
ORDER BY f.file_ref
|
||||
LIMIT 2
|
||||
""",
|
||||
(stripped,),
|
||||
).fetchone()
|
||||
if row:
|
||||
return row["file_ref"]
|
||||
).fetchall()
|
||||
if len(rows) > 1:
|
||||
matches = "; ".join(self._virtual_match_summary(row) for row in rows)
|
||||
raise KeyError(f"Ambiguous file target: {target}. Matches: {matches}")
|
||||
if rows:
|
||||
return rows[0]["file_ref"]
|
||||
virtual_file_ref = self._resolve_virtual_file_ref(conn, target)
|
||||
if virtual_file_ref:
|
||||
return virtual_file_ref
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters
|
|||
workspace=tmp_path / "workspace",
|
||||
metadata_generator=SummaryGenerator(),
|
||||
)
|
||||
filesystem.register_file(
|
||||
file_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/report.pdf",
|
||||
source_path="examples/documents/report.pdf",
|
||||
folder_path="/documents",
|
||||
|
|
@ -110,20 +110,133 @@ def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters
|
|||
|
||||
assert backend.calls[0][2] == {}
|
||||
assert result["data"]["data"][0] == {
|
||||
"path": "/documents/report.pdf",
|
||||
"path": "/examples/documents/report.pdf",
|
||||
"summary": "Federal Reserve annual report summary",
|
||||
"line_text": "1: Federal Reserve supervision and regulation annual report.",
|
||||
}
|
||||
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref
|
||||
|
||||
executor.json_output = False
|
||||
rendered = executor.execute('search-summary "Federal Reserve annual report" /documents')
|
||||
assert "path: /documents/report.pdf" in rendered
|
||||
assert "path: /examples/documents/report.pdf" in rendered
|
||||
assert "summary: Federal Reserve annual report summary" in rendered
|
||||
assert "line_text: 1: Federal Reserve supervision and regulation annual report." in rendered
|
||||
assert "id=dsid_report" not in rendered
|
||||
assert "file_ref=" not in rendered
|
||||
|
||||
|
||||
def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
class SummaryGenerator:
|
||||
def generate(self, document, *, fields):
|
||||
return MetadataGenerationResult(
|
||||
values={"summary": f"summary for {document.external_id}"}
|
||||
)
|
||||
|
||||
filesystem = PageIndexFileSystem(
|
||||
workspace=tmp_path / "workspace",
|
||||
metadata_generator=SummaryGenerator(),
|
||||
)
|
||||
first_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/first.json",
|
||||
source_path="slack/dsid_first.json",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_first",
|
||||
title="announcements",
|
||||
content="first announcement mentions H200 reservations.",
|
||||
metadata_policy={
|
||||
"fields": {
|
||||
"summary": True,
|
||||
"doc_type": False,
|
||||
"domain": False,
|
||||
"topic": False,
|
||||
}
|
||||
},
|
||||
)
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/second.json",
|
||||
source_path="slack/dsid_second.json",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_second",
|
||||
title="announcements",
|
||||
content="second announcement mentions unrelated maintenance.",
|
||||
metadata_policy={
|
||||
"fields": {
|
||||
"summary": True,
|
||||
"doc_type": False,
|
||||
"domain": False,
|
||||
"topic": False,
|
||||
}
|
||||
},
|
||||
)
|
||||
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
result = json.loads(executor.execute('search-summary "H200 reservations" /documents'))
|
||||
|
||||
assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json"
|
||||
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
|
||||
with pytest.raises(KeyError, match="Ambiguous file target"):
|
||||
filesystem.store.resolve_file_ref("/documents/announcements")
|
||||
|
||||
|
||||
def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
class SummaryGenerator:
|
||||
def generate(self, document, *, fields):
|
||||
return MetadataGenerationResult(
|
||||
values={"summary": f"summary for {document.external_id}"}
|
||||
)
|
||||
|
||||
filesystem = PageIndexFileSystem(
|
||||
workspace=tmp_path / "workspace",
|
||||
metadata_generator=SummaryGenerator(),
|
||||
)
|
||||
first_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/first.json",
|
||||
source_path="shared/source.json",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_first",
|
||||
title="First",
|
||||
content="first content",
|
||||
metadata_policy={
|
||||
"fields": {
|
||||
"summary": True,
|
||||
"doc_type": False,
|
||||
"domain": False,
|
||||
"topic": False,
|
||||
}
|
||||
},
|
||||
)
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/second.json",
|
||||
source_path="shared/source.json",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_second",
|
||||
title="Second",
|
||||
content="second content",
|
||||
metadata_policy={
|
||||
"fields": {
|
||||
"summary": True,
|
||||
"doc_type": False,
|
||||
"domain": False,
|
||||
"topic": False,
|
||||
}
|
||||
},
|
||||
)
|
||||
filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first")
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
result = json.loads(executor.execute('search-summary "first" /documents'))
|
||||
|
||||
assert result["data"]["data"][0]["path"] == "dsid_first"
|
||||
assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref
|
||||
|
||||
|
||||
def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
|
@ -164,7 +277,7 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
|
|||
|
||||
entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents'))
|
||||
assert entity["data"]["data"][0] == {
|
||||
"path": "/documents/market-note.pdf",
|
||||
"path": "/examples/documents/market-note.pdf",
|
||||
"summary": "Risk and compliance summary",
|
||||
"line_text": "1: Federal Reserve policy affects Disney valuation.",
|
||||
"entity": "Federal Reserve; Disney",
|
||||
|
|
@ -172,7 +285,7 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
|
|||
|
||||
relation = json.loads(executor.execute('search-relation "Disney valuation" /documents'))
|
||||
assert relation["data"]["data"][0] == {
|
||||
"path": "/documents/market-note.pdf",
|
||||
"path": "/examples/documents/market-note.pdf",
|
||||
"summary": "Risk and compliance summary",
|
||||
"line_text": "1: Federal Reserve policy affects Disney valuation.",
|
||||
"relation": "Federal Reserve affects Disney valuation",
|
||||
|
|
@ -180,7 +293,7 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
|
|||
|
||||
executor.json_output = False
|
||||
rendered = executor.execute('search-entity "Federal Reserve" /documents')
|
||||
assert "path: /documents/market-note.pdf" in rendered
|
||||
assert "path: /examples/documents/market-note.pdf" in rendered
|
||||
assert "summary: Risk and compliance summary" in rendered
|
||||
assert "entity: Federal Reserve; Disney" in rendered
|
||||
assert "file_ref=" not in rendered
|
||||
|
|
|
|||
|
|
@ -42,3 +42,30 @@ def test_ambiguous_virtual_file_path_raises_clear_error(tmp_path):
|
|||
filesystem.store.resolve_file_ref("/a/b/file.txt")
|
||||
|
||||
assert first_ref != second_ref
|
||||
|
||||
|
||||
def test_duplicate_source_path_target_raises_clear_error(tmp_path):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
|
||||
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
|
||||
first_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/first.txt",
|
||||
source_path="shared/source.txt",
|
||||
folder_path="/first",
|
||||
external_id="doc_first",
|
||||
title="First",
|
||||
content="first content",
|
||||
)
|
||||
second_ref = filesystem.register_file(
|
||||
storage_uri="file:///tmp/second.txt",
|
||||
source_path="shared/source.txt",
|
||||
folder_path="/second",
|
||||
external_id="doc_second",
|
||||
title="Second",
|
||||
content="second content",
|
||||
)
|
||||
|
||||
with pytest.raises(KeyError, match="Ambiguous file target"):
|
||||
filesystem.store.resolve_file_ref("/shared/source.txt")
|
||||
|
||||
assert first_ref != second_ref
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue