From 7104602a70ab1cd7604d63446565f9ca3c733069 Mon Sep 17 00:00:00 2001
From: BukeLy
Date: Tue, 26 May 2026 20:39:25 +0800
Subject: [PATCH] refactor(filesystem): simplify semantic search result fields

The channel searches (search-summary, search-entity, search-relation)
stop reusing the grep hit builder and the grep renderer.

* The "data" list of a channel search result is now built by the new
  _semantic_channel_hits_from_results(). Each hit is a dict with "path"
  (from _file_target_path), "summary" (result metadata, "" when absent)
  and "line_text". "line_text" is "LINE: TEXT", using the pair returned
  by _first_matching_line() with the text passed through
  _compact_text(max_chars=220); it is "" when no text is returned.
  For the entity and relation channels one extra key named after the
  channel carries the matching metadata value ("" when absent).

* Text output for the three commands goes through the new
  _render_semantic_search(). Non-dict data is rendered with str(); any
  mode other than "files" is delegated to _render_grep(); an empty
  result prints "# no matches for: QUERY". Otherwise each hit prints
  its path followed by "summary:", optional "entity:" / "relation:",
  and "line_text:" lines ("-" when the value is empty), with a blank
  line between hits.

* Tests pin the exact hit dict for all three channels and assert that
  the rendered output no longer contains "id=..." or "file_ref="
  fields. A ChannelBackend test double is added, and the fixtures now
  register files with a metadata generator and metadata_policy so that
  summary / entity / relation values are present.

---
 pageindex/filesystem/commands.py         |  55 ++++++++++-
 tests/test_pageindex_filesystem_scope.py | 114 ++++++++++++++++++++++-
 2 files changed, 164 insertions(+), 5 deletions(-)

diff --git a/pageindex/filesystem/commands.py b/pageindex/filesystem/commands.py
index cc4535f..533e88a 100644
--- a/pageindex/filesystem/commands.py
+++ b/pageindex/filesystem/commands.py
@@ -786,7 +786,7 @@ class PIFSCommandExecutor:
             "query": query,
             "scope": normalized,
             "retrieval": f"{channel}_vector",
-            "data": self._grep_file_hits_from_results(results, query),
+            "data": self._semantic_channel_hits_from_results(channel, results, query),
         }
 
     def _semantic_recursive_grep(
@@ -1072,7 +1072,7 @@ class PIFSCommandExecutor:
         if command_name in {"grep", "semantic-grep"}:
             return self._render_grep(data)
         if command_name in {"search-summary", "search-entity", "search-relation"}:
-            return self._render_grep(data)
+            return self._render_semantic_search(data)
         if command_name == "find":
             return self._render_find(data)
         if command_name == "stat":
@@ -1195,6 +1195,26 @@ class PIFSCommandExecutor:
             )
         return str(data)
 
+    def _render_semantic_search(self, data: Any) -> str:
+        if not isinstance(data, dict):
+            return str(data)
+        if data.get("mode") != "files":
+            return self._render_grep(data)
+        if not data.get("data", []):
+            return f"# no matches for: {data.get('query', '')}"
+        lines: list[str] = []
+        for item in data.get("data", []):
+            lines.append(str(item.get("path") or "-"))
+            lines.append(f"summary: {self._one_line_value(item.get('summary') or '')}")
+            if "entity" in item:
+                lines.append(f"entity: {self._one_line_value(item.get('entity') or '')}")
+            if "relation" in item:
+                lines.append(f"relation: {self._one_line_value(item.get('relation') or '')}")
+            line_text = self._one_line_value(item.get("line_text") or "")
+            lines.append(f"line_text: {line_text or '-'}")
+            lines.append("")
+        return "\n".join(lines).rstrip()
+
     def _render_find(self, data: Any) -> str:
         if not isinstance(data, list):
             return str(data)
@@ -1422,6 +1442,37 @@ class PIFSCommandExecutor:
                     break
         return hits
 
+    def _semantic_channel_hits_from_results(
+        self,
+        channel: str,
+        results: list[Any],
+        query: str,
+    ) -> list[dict[str, Any]]:
+        hits = []
+        for result in results:
+            metadata = result.metadata or {}
+            line, text = self._first_matching_line(result.file_ref, query)
+            line_text = ""
+            if text:
+                line_text = f"{line}: {self._compact_text(text, max_chars=220)}"
+            hit = {
+                "path": self._file_target_path(
+                    {
+                        "file_ref": result.file_ref,
+                        "title": result.title,
+                        "folder_paths": result.folder_paths,
+                        "source_path": result.source_path,
+                        "external_id": result.external_id,
+                    }
+                ),
+                "summary": metadata.get("summary") or "",
+                "line_text": line_text,
+            }
+            if channel in {"entity", "relation"}:
+                hit[channel] = metadata.get(channel) or ""
+            hits.append(hit)
+        return hits
+
     def _rank_child_folders_from_source(
         self,
         *,
diff --git a/tests/test_pageindex_filesystem_scope.py b/tests/test_pageindex_filesystem_scope.py
index dd35027..1ee57cc 100644
--- a/tests/test_pageindex_filesystem_scope.py
+++ b/tests/test_pageindex_filesystem_scope.py
@@ -22,18 +22,53 @@ class SummaryBackend:
         ]
 
 
+class ChannelBackend:
+    def __init__(self, document_id, channels=("summary", "entity", "relation")):
+        self.document_id = document_id
+        self.channels = channels
+
+    def available_channels(self):
+        return self.channels
+
+    def search_channel(self, channel, query, *, limit=10, filters=None):
+        return [
+            SimpleNamespace(
+                document_id=self.document_id,
+                snippet=f"{channel} candidate: {query}",
+            )
+        ]
+
+
 def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
     from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
 
-    filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
+    class SummaryGenerator:
+        def generate(self, document, *, fields):
+            return MetadataGenerationResult(
+                values={"summary": "Federal Reserve annual report summary"}
+            )
+
+    filesystem = PageIndexFileSystem(
+        workspace=tmp_path / "workspace",
+        metadata_generator=SummaryGenerator(),
+    )
     filesystem.register_file(
         storage_uri="file:///tmp/report.pdf",
         source_path="examples/documents/report.pdf",
         folder_path="/documents",
         external_id="dsid_report",
-        title="Annual report",
+        title="report.pdf",
         metadata={"source_type": "examples-documents"},
         content="Federal Reserve supervision and regulation annual report.",
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+            }
+        },
     )
     backend = SummaryBackend("dsid_report")
     filesystem.semantic_retrieval_backend = backend
@@ -44,7 +79,80 @@ def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters
     )
 
     assert backend.calls[0][2] == {}
-    assert result["data"]["data"][0]["external_id"] == "dsid_report"
+    assert result["data"]["data"][0] == {
+        "path": "/documents/report.pdf",
+        "summary": "Federal Reserve annual report summary",
+        "line_text": "1: Federal Reserve supervision and regulation annual report.",
+    }
+
+    executor.json_output = False
+    rendered = executor.execute('search-summary "Federal Reserve annual report" /documents')
+    assert "/documents/report.pdf" in rendered
+    assert "summary: Federal Reserve annual report summary" in rendered
+    assert "line_text: 1: Federal Reserve supervision and regulation annual report." in rendered
+    assert "id=dsid_report" not in rendered
+    assert "file_ref=" not in rendered
+
+
+def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
+    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
+    from pageindex.filesystem.metadata_generation import MetadataGenerationResult
+
+    class MetadataGenerator:
+        def generate(self, document, *, fields):
+            values = {
+                "summary": "Risk and compliance summary",
+                "entity": "Federal Reserve; Disney",
+                "relation": "Federal Reserve affects Disney valuation",
+            }
+            return MetadataGenerationResult(values={field: values[field] for field in fields})
+
+    filesystem = PageIndexFileSystem(
+        workspace=tmp_path / "workspace",
+        metadata_generator=MetadataGenerator(),
+    )
+    filesystem.register_file(
+        storage_uri="file:///tmp/market-note.pdf",
+        source_path="examples/documents/market-note.pdf",
+        folder_path="/documents",
+        external_id="dsid_market_note",
+        title="market-note.pdf",
+        content="Federal Reserve policy affects Disney valuation.",
+        metadata_policy={
+            "fields": {
+                "summary": True,
+                "doc_type": False,
+                "domain": False,
+                "topic": False,
+                "entity": True,
+                "relation": True,
+            }
+        },
+    )
+    filesystem.semantic_retrieval_backend = ChannelBackend("dsid_market_note")
+    executor = PIFSCommandExecutor(filesystem, json_output=True)
+
+    entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents'))
+    assert entity["data"]["data"][0] == {
+        "path": "/documents/market-note.pdf",
+        "summary": "Risk and compliance summary",
+        "line_text": "1: Federal Reserve policy affects Disney valuation.",
+        "entity": "Federal Reserve; Disney",
+    }
+
+    relation = json.loads(executor.execute('search-relation "Disney valuation" /documents'))
+    assert relation["data"]["data"][0] == {
+        "path": "/documents/market-note.pdf",
+        "summary": "Risk and compliance summary",
+        "line_text": "1: Federal Reserve policy affects Disney valuation.",
+        "relation": "Federal Reserve affects Disney valuation",
+    }
+
+    executor.json_output = False
+    rendered = executor.execute('search-entity "Federal Reserve" /documents')
+    assert "summary: Risk and compliance summary" in rendered
+    assert "entity: Federal Reserve; Disney" in rendered
+    assert "file_ref=" not in rendered
 
 
 def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path):