fix(filesystem): require real grep source lines

Do not emit source-file grep fallback candidates unless an actual source line matches the query.
This commit is contained in:
Bukely_ 2026-05-26 20:29:51 +08:00 committed by BukeLy
parent 9734bf6914
commit 70eece52e5
2 changed files with 39 additions and 3 deletions

View file

@ -1466,6 +1466,8 @@ class PIFSCommandExecutor:
if direct_only and self._folder_path_for_source_path(file_row["source_path"]) != folder_path:
continue
line_number, text = self._first_matching_source_line(path, query)
if line_number is None:
continue
hits.append(
{
"file_ref": file_row["file_ref"],
@ -1560,15 +1562,15 @@ class PIFSCommandExecutor:
break
return filtered
def _first_matching_source_line(self, path: Path, query: str) -> tuple[int, str]:
def _first_matching_source_line(self, path: Path, query: str) -> tuple[int | None, str]:
try:
lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
except OSError:
return 1, ""
return None, ""
for line_number, line in enumerate(lines, 1):
if self._line_matches(line, query):
return line_number, self._compact_text(line, max_chars=220)
return 1, self._compact_text(lines[0], max_chars=220) if lines else ""
return None, ""
def _source_root(self) -> Path | None:
with self.filesystem.store.connect() as conn:

View file

@ -87,6 +87,40 @@ def test_semantic_search_scope_filters_explicit_source_type_facets():
) == {}
def test_grep_source_file_requires_terms_on_same_line(tmp_path):
    """Check recursive grep against a source file whose terms are split across lines.

    The registered source file has "alpha" on one line and "omega" on another,
    while the registered ``content`` contains neither term. The test asserts
    that the query "alpha omega" yields no results, and that "alpha evidence"
    yields the file with the match reported on line 2.
    """
    from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem

    documents_dir = tmp_path / "source" / "documents"
    documents_dir.mkdir(parents=True)
    split_file = documents_dir / "split.json"
    split_file.write_text(
        '{\n "first": "alpha evidence lives here",\n'
        ' "second": "omega evidence lives there"\n}\n',
        encoding="utf-8",
    )

    pifs = PageIndexFileSystem(workspace=tmp_path / "workspace")
    pifs.register_file(
        storage_uri=str(split_file),
        source_path="documents/split.json",
        folder_path="/documents",
        external_id="doc_split_terms",
        title="Split source terms",
        content="registered artifact without the searched tokens",
    )
    runner = PIFSCommandExecutor(pifs, json_output=True)

    def run(command):
        # Execute one command and decode its JSON output.
        return json.loads(runner.execute(command))

    # Terms that only appear on different lines must not produce a hit.
    split_terms = run('grep -R "alpha omega" /documents')
    assert split_terms["data"]["mode"] == "files"
    assert split_terms["data"]["data"] == []

    # Terms that share a line produce a hit that points at that line.
    same_line = run('grep -R "alpha evidence" /documents')
    first_hit = same_line["data"]["data"][0]
    assert first_hit["external_id"] == "doc_split_terms"
    assert first_hit["line"] == 2
    assert "alpha evidence" in first_hit["text"]
def test_existing_summary_projection_index_configures_retrieval_backend(tmp_path, monkeypatch):
from pageindex.filesystem import PageIndexFileSystem
from pageindex.filesystem.semantic_index import SemanticIndexRecord, SQLiteVecSemanticIndex