mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-28 21:49:40 +02:00
feat: return chunk char spans from hybrid search
This commit is contained in:
parent
90502d21d3
commit
04b679e2bf
1 changed file with 16 additions and 2 deletions
|
|
@@ -440,8 +440,15 @@ class ChucksHybridSearchRetriever:
|
||||||
chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
|
chunk_filter = numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC
|
||||||
|
|
||||||
# Select only the columns we need (skip Chunk.embedding ~12KB/row).
|
# Select only the columns we need (skip Chunk.embedding ~12KB/row).
|
||||||
|
# start_char/end_char carry the citation span; None for legacy rows.
|
||||||
chunk_query = (
|
chunk_query = (
|
||||||
select(Chunk.id, Chunk.content, Chunk.document_id)
|
select(
|
||||||
|
Chunk.id,
|
||||||
|
Chunk.content,
|
||||||
|
Chunk.document_id,
|
||||||
|
Chunk.start_char,
|
||||||
|
Chunk.end_char,
|
||||||
|
)
|
||||||
.join(numbered, Chunk.id == numbered.c.chunk_id)
|
.join(numbered, Chunk.id == numbered.c.chunk_id)
|
||||||
.where(chunk_filter)
|
.where(chunk_filter)
|
||||||
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
|
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
|
||||||
|
|
@@ -476,7 +483,14 @@ class ChucksHybridSearchRetriever:
|
||||||
if doc_id not in doc_map:
|
if doc_id not in doc_map:
|
||||||
continue
|
continue
|
||||||
doc_entry = doc_map[doc_id]
|
doc_entry = doc_map[doc_id]
|
||||||
doc_entry["chunks"].append({"chunk_id": row.id, "content": row.content})
|
doc_entry["chunks"].append(
|
||||||
|
{
|
||||||
|
"chunk_id": row.id,
|
||||||
|
"content": row.content,
|
||||||
|
"start_char": row.start_char,
|
||||||
|
"end_char": row.end_char,
|
||||||
|
}
|
||||||
|
)
|
||||||
if row.id in matched_chunk_ids:
|
if row.id in matched_chunk_ids:
|
||||||
doc_entry["matched_chunk_ids"].append(row.id)
|
doc_entry["matched_chunk_ids"].append(row.id)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue