mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
fix(filesystem): require real grep source lines
Do not emit source-file grep fallback candidates unless an actual source line matches the query.
This commit is contained in:
parent
9734bf6914
commit
70eece52e5
2 changed files with 39 additions and 3 deletions
|
|
@@ -1466,6 +1466,8 @@ class PIFSCommandExecutor:
|
|||
if direct_only and self._folder_path_for_source_path(file_row["source_path"]) != folder_path:
|
||||
continue
|
||||
line_number, text = self._first_matching_source_line(path, query)
|
||||
if line_number is None:
|
||||
continue
|
||||
hits.append(
|
||||
{
|
||||
"file_ref": file_row["file_ref"],
|
||||
|
|
@@ -1560,15 +1562,15 @@ class PIFSCommandExecutor:
|
|||
break
|
||||
return filtered
|
||||
|
||||
def _first_matching_source_line(self, path: Path, query: str) -> tuple[int, str]:
|
||||
def _first_matching_source_line(self, path: Path, query: str) -> tuple[int | None, str]:
|
||||
try:
|
||||
lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
|
||||
except OSError:
|
||||
return 1, ""
|
||||
return None, ""
|
||||
for line_number, line in enumerate(lines, 1):
|
||||
if self._line_matches(line, query):
|
||||
return line_number, self._compact_text(line, max_chars=220)
|
||||
return 1, self._compact_text(lines[0], max_chars=220) if lines else ""
|
||||
return None, ""
|
||||
|
||||
def _source_root(self) -> Path | None:
|
||||
with self.filesystem.store.connect() as conn:
|
||||
|
|
|
|||
|
|
@@ -87,6 +87,40 @@ def test_semantic_search_scope_filters_explicit_source_type_facets():
|
|||
) == {}
|
||||
|
||||
|
||||
def test_grep_source_file_requires_terms_on_same_line(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
|
||||
source_dir = tmp_path / "source" / "documents"
|
||||
source_dir.mkdir(parents=True)
|
||||
source = source_dir / "split.json"
|
||||
source.write_text(
|
||||
'{\n "first": "alpha evidence lives here",\n'
|
||||
' "second": "omega evidence lives there"\n}\n',
|
||||
encoding="utf-8",
|
||||
)
|
||||
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
|
||||
filesystem.register_file(
|
||||
storage_uri=str(source),
|
||||
source_path="documents/split.json",
|
||||
folder_path="/documents",
|
||||
external_id="doc_split_terms",
|
||||
title="Split source terms",
|
||||
content="registered artifact without the searched tokens",
|
||||
)
|
||||
executor = PIFSCommandExecutor(filesystem, json_output=True)
|
||||
|
||||
result = json.loads(executor.execute('grep -R "alpha omega" /documents'))
|
||||
|
||||
assert result["data"]["mode"] == "files"
|
||||
assert result["data"]["data"] == []
|
||||
|
||||
matched = json.loads(executor.execute('grep -R "alpha evidence" /documents'))
|
||||
|
||||
assert matched["data"]["data"][0]["external_id"] == "doc_split_terms"
|
||||
assert matched["data"]["data"][0]["line"] == 2
|
||||
assert "alpha evidence" in matched["data"]["data"][0]["text"]
|
||||
|
||||
|
||||
def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue