mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-20 21:18:13 +02:00
feat: return chunk char spans from hybrid search
This commit is contained in:
parent
90502d21d3
commit
04b679e2bf
1 changed file with 16 additions and 2 deletions
|
|
@@ -440,8 +440,15 @@ class ChucksHybridSearchRetriever:
|
|||
chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
|
||||
|
||||
# Select only the columns we need (skip Chunk.embedding ~12KB/row).
|
||||
# start_char/end_char carry the citation span; None for legacy rows.
|
||||
chunk_query = (
|
||||
select(Chunk.id, Chunk.content, Chunk.document_id)
|
||||
select(
|
||||
Chunk.id,
|
||||
Chunk.content,
|
||||
Chunk.document_id,
|
||||
Chunk.start_char,
|
||||
Chunk.end_char,
|
||||
)
|
||||
.join(numbered, Chunk.id == numbered.c.chunk_id)
|
||||
.where(chunk_filter)
|
||||
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
|
||||
|
|
@@ -476,7 +483,14 @@ class ChucksHybridSearchRetriever:
|
|||
if doc_id not in doc_map:
|
||||
continue
|
||||
doc_entry = doc_map[doc_id]
|
||||
doc_entry["chunks"].append({"chunk_id": row.id, "content": row.content})
|
||||
doc_entry["chunks"].append(
|
||||
{
|
||||
"chunk_id": row.id,
|
||||
"content": row.content,
|
||||
"start_char": row.start_char,
|
||||
"end_char": row.end_char,
|
||||
}
|
||||
)
|
||||
if row.id in matched_chunk_ids:
|
||||
doc_entry["matched_chunk_ids"].append(row.id)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue