diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py index 2ec5920..18a85cc 100644 --- a/pageindex/filesystem/commands.py +++ b/pageindex/filesystem/commands.py @@ -1337,6 +1337,23 @@ class PIFSCommandExecutor: return f"{folder}/{title}" if folder else f"/{title}" return str(item.get("source_path") or item.get("external_id") or file_ref or "-") + def _stable_file_target_path(self, item: dict[str, Any]) -> str: + file_ref = str(item.get("file_ref") or "").strip() + source_path = str(item.get("source_path") or "").strip() + if source_path: + target = "/" + source_path.strip("/") + try: + if not file_ref or self.filesystem.store.resolve_file_ref(target) == file_ref: + return target + except KeyError: + pass + external_id = str(item.get("external_id") or "").strip() + if external_id: + return external_id + if file_ref: + return file_ref + return str(item.get("external_id") or item.get("file_ref") or "-") + def _semantic_retrieval_query(self, query: str) -> str: query = str(query or "").strip() context = str(self.query_context or "").strip() @@ -1464,7 +1481,7 @@ class PIFSCommandExecutor: if text: line_text = f"{line}: {self._compact_text(text, max_chars=220)}" hit = { - "path": self._file_target_path( + "path": self._stable_file_target_path( { "file_ref": result.file_ref, "title": result.title, diff --git a/pageindex/filesystem/store.py b/pageindex/filesystem/store.py index b88b54e..7517d70 100644 --- a/pageindex/filesystem/store.py +++ b/pageindex/filesystem/store.py @@ -1050,12 +1050,29 @@ class SQLiteFileSystemStore: if row: return row["file_ref"] stripped = target.strip("/") - row = conn.execute( - "SELECT file_ref FROM files WHERE source_path = ? AND deleted_at IS NULL", + rows = conn.execute( + """ + SELECT + f.file_ref, + f.external_id, + f.title, + f.source_path, + COALESCE(MIN(fo.path), '/') AS folder_path + FROM files f + LEFT JOIN file_folders ff ON ff.file_ref = f.file_ref + LEFT JOIN folders fo ON fo.folder_id = ff.folder_id + WHERE f.source_path = ? AND f.deleted_at IS NULL + GROUP BY f.file_ref, f.external_id, f.title, f.source_path + ORDER BY f.file_ref + LIMIT 2 + """, (stripped,), - ).fetchone() - if row: - return row["file_ref"] + ).fetchall() + if len(rows) > 1: + matches = "; ".join(self._virtual_match_summary(row) for row in rows) + raise KeyError(f"Ambiguous file target: {target}. Matches: {matches}") + if rows: + return rows[0]["file_ref"] virtual_file_ref = self._resolve_virtual_file_ref(conn, target) if virtual_file_ref: return virtual_file_ref diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py index 7c9e31b..087473a 100644 --- a/tests/test_pageindex_filesystem_scope.py +++ b/tests/test_pageindex_filesystem_scope.py @@ -83,7 +83,7 @@ def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters workspace=tmp_path / "workspace", metadata_generator=SummaryGenerator(), ) - filesystem.register_file( + file_ref = filesystem.register_file( storage_uri="file:///tmp/report.pdf", source_path="examples/documents/report.pdf", folder_path="/documents", @@ -110,20 +110,133 @@ def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters assert backend.calls[0][2] == {} assert result["data"]["data"][0] == { - "path": "/documents/report.pdf", + "path": "/examples/documents/report.pdf", "summary": "Federal Reserve annual report summary", "line_text": "1: Federal Reserve supervision and regulation annual report.", } + assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == file_ref executor.json_output = False rendered = executor.execute('search-summary "Federal Reserve annual report" /documents') - assert "path: /documents/report.pdf" in rendered + assert "path: /examples/documents/report.pdf" in rendered assert "summary: Federal Reserve annual report summary" in rendered assert "line_text: 1: Federal Reserve supervision and regulation annual report." in rendered assert "id=dsid_report" not in rendered assert "file_ref=" not in rendered +def test_semantic_search_path_is_unique_source_target_when_titles_collide(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + + class SummaryGenerator: + def generate(self, document, *, fields): + return MetadataGenerationResult( + values={"summary": f"summary for {document.external_id}"} + ) + + filesystem = PageIndexFileSystem( + workspace=tmp_path / "workspace", + metadata_generator=SummaryGenerator(), + ) + first_ref = filesystem.register_file( + storage_uri="file:///tmp/first.json", + source_path="slack/dsid_first.json", + folder_path="/documents", + external_id="dsid_first", + title="announcements", + content="first announcement mentions H200 reservations.", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + filesystem.register_file( + storage_uri="file:///tmp/second.json", + source_path="slack/dsid_second.json", + folder_path="/documents", + external_id="dsid_second", + title="announcements", + content="second announcement mentions unrelated maintenance.", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first") + executor = PIFSCommandExecutor(filesystem, json_output=True) + + result = json.loads(executor.execute('search-summary "H200 reservations" /documents')) + + assert result["data"]["data"][0]["path"] == "/slack/dsid_first.json" + assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref + with pytest.raises(KeyError, match="Ambiguous file target"): + filesystem.store.resolve_file_ref("/documents/announcements") + + +def test_semantic_search_path_falls_back_when_source_target_is_ambiguous(tmp_path): + from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem + from pageindex.filesystem.metadata_generation import MetadataGenerationResult + + class SummaryGenerator: + def generate(self, document, *, fields): + return MetadataGenerationResult( + values={"summary": f"summary for {document.external_id}"} + ) + + filesystem = PageIndexFileSystem( + workspace=tmp_path / "workspace", + metadata_generator=SummaryGenerator(), + ) + first_ref = filesystem.register_file( + storage_uri="file:///tmp/first.json", + source_path="shared/source.json", + folder_path="/documents", + external_id="dsid_first", + title="First", + content="first content", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + filesystem.register_file( + storage_uri="file:///tmp/second.json", + source_path="shared/source.json", + folder_path="/documents", + external_id="dsid_second", + title="Second", + content="second content", + metadata_policy={ + "fields": { + "summary": True, + "doc_type": False, + "domain": False, + "topic": False, + } + }, + ) + filesystem.semantic_retrieval_backend = SummaryBackend("dsid_first") + executor = PIFSCommandExecutor(filesystem, json_output=True) + + result = json.loads(executor.execute('search-summary "first" /documents')) + + assert result["data"]["data"][0]["path"] == "dsid_first" + assert filesystem.store.resolve_file_ref(result["data"]["data"][0]["path"]) == first_ref + + def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path): from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem from pageindex.filesystem.metadata_generation import MetadataGenerationResult @@ -164,7 +277,7 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path): entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents')) assert entity["data"]["data"][0] == { - "path": "/documents/market-note.pdf", + "path": "/examples/documents/market-note.pdf", "summary": "Risk and compliance summary", "line_text": "1: Federal Reserve policy affects Disney valuation.", "entity": "Federal Reserve; Disney", @@ -172,7 +285,7 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path): relation = json.loads(executor.execute('search-relation "Disney valuation" /documents')) assert relation["data"]["data"][0] == { - "path": "/documents/market-note.pdf", + "path": "/examples/documents/market-note.pdf", "summary": "Risk and compliance summary", "line_text": "1: Federal Reserve policy affects Disney valuation.", "relation": "Federal Reserve affects Disney valuation", @@ -180,7 +293,7 @@ def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path): executor.json_output = False rendered = executor.execute('search-entity "Federal Reserve" /documents') - assert "path: /documents/market-note.pdf" in rendered + assert "path: /examples/documents/market-note.pdf" in rendered assert "summary: Risk and compliance summary" in rendered assert "entity: Federal Reserve; Disney" in rendered assert "file_ref=" not in rendered diff --git a/tests/test_pifs_path_resolution.py b/tests/test_pifs_path_resolution.py index 08cf28f..184fc53 100644 --- a/tests/test_pifs_path_resolution.py +++ b/tests/test_pifs_path_resolution.py @@ -42,3 +42,30 @@ def test_ambiguous_virtual_file_path_raises_clear_error(tmp_path): filesystem.store.resolve_file_ref("/a/b/file.txt") assert first_ref != second_ref + + +def test_duplicate_source_path_target_raises_clear_error(tmp_path): + from pageindex.filesystem import PageIndexFileSystem + + filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace") + first_ref = filesystem.register_file( + storage_uri="file:///tmp/first.txt", + source_path="shared/source.txt", + folder_path="/first", + external_id="doc_first", + title="First", + content="first content", + ) + second_ref = filesystem.register_file( + storage_uri="file:///tmp/second.txt", + source_path="shared/source.txt", + folder_path="/second", + external_id="doc_second", + title="Second", + content="second content", + ) + + with pytest.raises(KeyError, match="Ambiguous file target"): + filesystem.store.resolve_file_ref("/shared/source.txt") + + assert first_ref != second_ref