mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-20 21:18:13 +02:00
test: span-aware paragraph chunker fixture
This commit is contained in:
parent
a7cf9bd946
commit
03012c3077
1 changed file with 13 additions and 7 deletions
|
|
@ -18,16 +18,22 @@ _V1 = "Intro paragraph.\n\nBody paragraph.\n\nOutro paragraph."
|
|||
|
||||
@pytest.fixture
def paragraph_chunker(monkeypatch):
    """Replace the cached-indexing chunker with a paragraph splitter.

    While the fixture is active,
    ``app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans``
    is patched to return one ``ChunkSlice`` per non-blank markdown paragraph
    (paragraphs are separated by a blank line, i.e. ``"\\n\\n"``), so edits to
    a document map to chunk-level diffs.
    """
    # Imported locally so the project module is only loaded when the fixture
    # is actually requested.
    from app.indexing_pipeline.document_chunker import ChunkSlice

    def _split(markdown, *_args, **_kwargs):
        """Return a ``ChunkSlice`` for every non-blank paragraph of *markdown*.

        Extra positional and keyword arguments are accepted and ignored so the
        stub can be called like the function it replaces.
        """
        slices = []
        cursor = 0  # offset just past the previously located paragraph
        for para in markdown.split("\n\n"):
            # Search from the cursor so a paragraph whose text repeats earlier
            # in the document still resolves to its own position.
            start = markdown.index(para, cursor)
            cursor = start + len(para)
            if para.strip():
                # NOTE(review): ChunkSlice is presumably (text, start, end)
                # with an exclusive end offset -- confirm against
                # document_chunker.
                slices.append(ChunkSlice(para, start, cursor))
        return slices

    monkeypatch.setattr(
        "app.indexing_pipeline.cache.cached_indexing.chunk_markdown_with_spans",
        _split,
    )
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue