mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
refactor(filesystem): simplify semantic search result fields
This commit is contained in:
parent
d139181c86
commit
7104602a70
2 changed files with 164 additions and 5 deletions
|
|
@ -786,7 +786,7 @@ class PIFSCommandExecutor:
|
|||
"query": query,
|
||||
"scope": normalized,
|
||||
"retrieval": f"{channel}_vector",
|
||||
"data": self._grep_file_hits_from_results(results, query),
|
||||
"data": self._semantic_channel_hits_from_results(channel, results, query),
|
||||
}
|
||||
|
||||
def _semantic_recursive_grep(
|
||||
|
|
@ -1072,7 +1072,7 @@ class PIFSCommandExecutor:
|
|||
if command_name in {"grep", "semantic-grep"}:
|
||||
return self._render_grep(data)
|
||||
if command_name in {"search-summary", "search-entity", "search-relation"}:
|
||||
return self._render_grep(data)
|
||||
return self._render_semantic_search(data)
|
||||
if command_name == "find":
|
||||
return self._render_find(data)
|
||||
if command_name == "stat":
|
||||
|
|
@ -1195,6 +1195,26 @@ class PIFSCommandExecutor:
|
|||
)
|
||||
return str(data)
|
||||
|
||||
def _render_semantic_search(self, data: Any) -> str:
|
||||
if not isinstance(data, dict):
|
||||
return str(data)
|
||||
if data.get("mode") != "files":
|
||||
return self._render_grep(data)
|
||||
if not data.get("data", []):
|
||||
return f"# no matches for: {data.get('query', '')}"
|
||||
lines: list[str] = []
|
||||
for item in data.get("data", []):
|
||||
lines.append(str(item.get("path") or "-"))
|
||||
lines.append(f"summary: {self._one_line_value(item.get('summary') or '')}")
|
||||
if "entity" in item:
|
||||
lines.append(f"entity: {self._one_line_value(item.get('entity') or '')}")
|
||||
if "relation" in item:
|
||||
lines.append(f"relation: {self._one_line_value(item.get('relation') or '')}")
|
||||
line_text = self._one_line_value(item.get("line_text") or "")
|
||||
lines.append(f"line_text: {line_text or '-'}")
|
||||
lines.append("")
|
||||
return "\n".join(lines).rstrip()
|
||||
|
||||
def _render_find(self, data: Any) -> str:
|
||||
if not isinstance(data, list):
|
||||
return str(data)
|
||||
|
|
@ -1422,6 +1442,37 @@ class PIFSCommandExecutor:
|
|||
break
|
||||
return hits
|
||||
|
||||
def _semantic_channel_hits_from_results(
|
||||
self,
|
||||
channel: str,
|
||||
results: list[Any],
|
||||
query: str,
|
||||
) -> list[dict[str, Any]]:
|
||||
hits = []
|
||||
for result in results:
|
||||
metadata = result.metadata or {}
|
||||
line, text = self._first_matching_line(result.file_ref, query)
|
||||
line_text = ""
|
||||
if text:
|
||||
line_text = f"{line}: {self._compact_text(text, max_chars=220)}"
|
||||
hit = {
|
||||
"path": self._file_target_path(
|
||||
{
|
||||
"file_ref": result.file_ref,
|
||||
"title": result.title,
|
||||
"folder_paths": result.folder_paths,
|
||||
"source_path": result.source_path,
|
||||
"external_id": result.external_id,
|
||||
}
|
||||
),
|
||||
"summary": metadata.get("summary") or "",
|
||||
"line_text": line_text,
|
||||
}
|
||||
if channel in {"entity", "relation"}:
|
||||
hit[channel] = metadata.get(channel) or ""
|
||||
hits.append(hit)
|
||||
return hits
|
||||
|
||||
def _rank_child_folders_from_source(
|
||||
self,
|
||||
*,
|
||||
|
|
|
|||
|
|
@ -22,18 +22,53 @@ class SummaryBackend:
|
|||
]
|
||||
|
||||
|
||||
class ChannelBackend:
    """Stand-in retrieval backend that returns one canned hit per search.

    Every call to ``search_channel`` yields a single result pointing at the
    document id given at construction time, whatever the query is.
    """

    def __init__(self, document_id, channels=("summary", "entity", "relation")):
        # document_id: id reported on every returned hit.
        # channels: channel names reported by available_channels().
        self.document_id = document_id
        self.channels = channels

    def available_channels(self):
        """Return the channel names this backend was configured with."""
        return self.channels

    def search_channel(self, channel, query, *, limit=10, filters=None):
        """Return a one-element list holding a canned hit.

        ``limit`` and ``filters`` are accepted but not used; the snippet
        echoes the channel name and the query.
        """
        return [
            SimpleNamespace(
                document_id=self.document_id,
                snippet=f"{channel} candidate: {query}",
            )
        ]
|
||||
|
||||
|
||||
def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
|
||||
class SummaryGenerator:
|
||||
def generate(self, document, *, fields):
|
||||
return MetadataGenerationResult(
|
||||
values={"summary": "Federal Reserve annual report summary"}
|
||||
)
|
||||
|
||||
filesystem = PageIndexFileSystem(
|
||||
workspace=tmp_path / "workspace",
|
||||
metadata_generator=SummaryGenerator(),
|
||||
)
|
||||
filesystem.register_file(
|
||||
storage_uri="file:///tmp/report.pdf",
|
||||
source_path="examples/documents/report.pdf",
|
||||
folder_path="/documents",
|
||||
external_id="dsid_report",
|
||||
title="Annual report",
|
||||
title="report.pdf",
|
||||
metadata={"source_type": "examples-documents"},
|
||||
content="Federal Reserve supervision and regulation annual report.",
|
||||
metadata_policy={
|
||||
"fields": {
|
||||
"summary": True,
|
||||
"doc_type": False,
|
||||
"domain": False,
|
||||
"topic": False,
|
||||
}
|
||||
},
|
||||
)
|
||||
backend = SummaryBackend("dsid_report")
|
||||
filesystem.semantic_retrieval_backend = backend
|
||||
|
|
@ -44,7 +79,80 @@ def test_semantic_search_scope_keeps_ordinary_folders_out_of_source_type_filters
|
|||
)
|
||||
|
||||
assert backend.calls[0][2] == {}
|
||||
assert result["data"]["data"][0]["external_id"] == "dsid_report"
|
||||
assert result["data"]["data"][0] == {
|
||||
"path": "/documents/report.pdf",
|
||||
"summary": "Federal Reserve annual report summary",
|
||||
"line_text": "1: Federal Reserve supervision and regulation annual report.",
|
||||
}
|
||||
|
||||
executor.json_output = False
|
||||
rendered = executor.execute('search-summary "Federal Reserve annual report" /documents')
|
||||
assert "/documents/report.pdf" in rendered
|
||||
assert "summary: Federal Reserve annual report summary" in rendered
|
||||
assert "line_text: 1: Federal Reserve supervision and regulation annual report." in rendered
|
||||
assert "id=dsid_report" not in rendered
|
||||
assert "file_ref=" not in rendered
|
||||
|
||||
|
||||
def test_entity_relation_search_return_minimal_fields_with_summary(tmp_path):
    """Entity and relation searches return only path, summary, line text and the channel field.

    Registers one file whose generated metadata includes summary, entity and
    relation values, then checks the JSON hits of ``search-entity`` and
    ``search-relation`` as well as the plain-text rendering of
    ``search-entity``.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
    from pageindex.filesystem.metadata_generation import MetadataGenerationResult

    class MetadataGenerator:
        def generate(self, document, *, fields):
            values = {
                "summary": "Risk and compliance summary",
                "entity": "Federal Reserve; Disney",
                "relation": "Federal Reserve affects Disney valuation",
            }
            # Only hand back the fields that were requested.
            return MetadataGenerationResult(values={field: values[field] for field in fields})

    filesystem = PageIndexFileSystem(
        workspace=tmp_path / "workspace",
        metadata_generator=MetadataGenerator(),
    )
    filesystem.register_file(
        storage_uri="file:///tmp/market-note.pdf",
        source_path="examples/documents/market-note.pdf",
        folder_path="/documents",
        external_id="dsid_market_note",
        title="market-note.pdf",
        content="Federal Reserve policy affects Disney valuation.",
        metadata_policy={
            "fields": {
                "summary": True,
                "doc_type": False,
                "domain": False,
                "topic": False,
                "entity": True,
                "relation": True,
            }
        },
    )
    filesystem.semantic_retrieval_backend = ChannelBackend("dsid_market_note")
    executor = PIFSCommandExecutor(filesystem, json_output=True)

    entity = json.loads(executor.execute('search-entity "Federal Reserve" /documents'))
    assert entity["data"]["data"][0] == {
        "path": "/documents/market-note.pdf",
        "summary": "Risk and compliance summary",
        "line_text": "1: Federal Reserve policy affects Disney valuation.",
        "entity": "Federal Reserve; Disney",
    }

    relation = json.loads(executor.execute('search-relation "Disney valuation" /documents'))
    assert relation["data"]["data"][0] == {
        "path": "/documents/market-note.pdf",
        "summary": "Risk and compliance summary",
        "line_text": "1: Federal Reserve policy affects Disney valuation.",
        "relation": "Federal Reserve affects Disney valuation",
    }

    # Plain-text output must show the fields and no internal identifiers.
    executor.json_output = False
    rendered = executor.execute('search-entity "Federal Reserve" /documents')
    assert "summary: Risk and compliance summary" in rendered
    assert "entity: Federal Reserve; Disney" in rendered
    assert "file_ref=" not in rendered
|
||||
|
||||
|
||||
def test_semantic_search_rejects_unquoted_multi_word_query(tmp_path):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue