mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
retrieved_context: drop document completeness concept
This commit is contained in:
parent
6bb20df510
commit
26a1431e87
3 changed files with 10 additions and 20 deletions
|
|
@@ -21,17 +21,11 @@ class RetrievedPassage:
|
|||
|
||||
@dataclass(frozen=True)
|
||||
class RetrievedDocument:
|
||||
"""A source document and the passages retrieved from it, in order.
|
||||
|
||||
``is_complete`` is ``True`` when every chunk of the document is present in
|
||||
this block, so the model knows whether it is seeing the whole source or
|
||||
only excerpts.
|
||||
"""
|
||||
"""A source document and the passages retrieved from it, in order."""
|
||||
|
||||
document_id: int
|
||||
title: str
|
||||
source_label: str | None = None
|
||||
is_complete: bool = False
|
||||
passages: list[RetrievedPassage] = field(default_factory=list)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@@ -18,9 +18,8 @@ from .models import RetrievedDocument, RetrievedPassage
|
|||
_HEADER = (
|
||||
"These are excerpts from the user's knowledge base, selected for this query.\n"
|
||||
"A document is a full source (a file, a Slack thread, a Notion page); a chunk\n"
|
||||
"is one ordered fragment of it. Each document is tagged (partial) when only\n"
|
||||
"some of its chunks were retrieved or (complete) when all of them are shown\n"
|
||||
"here, so you know whether you have the whole source or only parts of it.\n"
|
||||
"is one ordered fragment of it. You are seeing only the chunks that matched\n"
|
||||
"this query, not the whole source.\n"
|
||||
"Cite a chunk with [n]."
|
||||
)
|
||||
|
||||
|
|
@@ -61,10 +60,9 @@ def _render_document(
|
|||
|
||||
|
||||
def _render_header(document: RetrievedDocument) -> str:
|
||||
"""``Document: "Title" (source) (partial|complete)``."""
|
||||
"""``Document: "Title" (source)``."""
|
||||
source = f" ({document.source_label})" if document.source_label else ""
|
||||
completeness = "(complete)" if document.is_complete else "(partial)"
|
||||
return f'Document: "{_clean(document.title)}"{source} {completeness}'
|
||||
return f'Document: "{_clean(document.title)}"{source}'
|
||||
|
||||
|
||||
def _render_passage(
|
||||
|
|
|
|||
|
|
@@ -23,13 +23,11 @@ def _document(
|
|||
chunk_ids: list[int],
|
||||
*,
|
||||
source_label: str | None = None,
|
||||
is_complete: bool = False,
|
||||
) -> RetrievedDocument:
|
||||
return RetrievedDocument(
|
||||
document_id=document_id,
|
||||
title=title,
|
||||
source_label=source_label,
|
||||
is_complete=is_complete,
|
||||
passages=[
|
||||
RetrievedPassage(document_id=document_id, chunk_id=cid, content=f"text {cid}")
|
||||
for cid in chunk_ids
|
||||
|
|
@@ -73,20 +71,20 @@ def test_registers_passages_with_chunk_locators() -> None:
|
|||
assert entry.display["title"] == "Doc"
|
||||
|
||||
|
||||
def test_header_shows_source_and_completeness() -> None:
|
||||
def test_header_shows_source_when_present() -> None:
|
||||
registry = CitationRegistry()
|
||||
|
||||
block = render_retrieved_context(
|
||||
[
|
||||
_document(1, "Q3", [1], source_label="Slack · #launch", is_complete=False),
|
||||
_document(2, "Plan", [2], is_complete=True),
|
||||
_document(1, "Q3", [1], source_label="Slack · #launch"),
|
||||
_document(2, "Plan", [2]),
|
||||
],
|
||||
registry,
|
||||
)
|
||||
|
||||
assert block is not None
|
||||
assert 'Document: "Q3" (Slack · #launch) (partial)' in block
|
||||
assert 'Document: "Plan" (complete)' in block
|
||||
assert 'Document: "Q3" (Slack · #launch)' in block
|
||||
assert 'Document: "Plan"' in block
|
||||
|
||||
|
||||
def test_wraps_block_and_explains_chunk_vs_document() -> None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue