mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-12 20:45:20 +02:00
refactor(chunks): order chunk reads by (document_id, position)
Presentation and citation ordering now uses the explicit Chunk.position column instead of Chunk.id/created_at, with Chunk.id kept as a tiebreaker. The order_by clauses used for vector and ts_rank ranking are untouched.
This commit is contained in:
parent
5a71769dba
commit
052e9ef4d1
9 changed files with 28 additions and 19 deletions
|
|
@ -508,7 +508,7 @@ class KBPostgresBackend(BackendProtocol):
|
|||
chunk_rows = await session.execute(
|
||||
select(Chunk.id, Chunk.content)
|
||||
.where(Chunk.document_id == document.id)
|
||||
.order_by(Chunk.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunks = [
|
||||
{"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
|
||||
|
|
@ -725,7 +725,7 @@ class KBPostgresBackend(BackendProtocol):
|
|||
.join(Document, Document.id == Chunk.document_id)
|
||||
.where(Document.search_space_id == self.search_space_id)
|
||||
.where(Chunk.content.ilike(f"%{pattern}%"))
|
||||
.order_by(Chunk.document_id, Chunk.id)
|
||||
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
|
||||
)
|
||||
chunk_rows = await session.execute(sub)
|
||||
per_doc: dict[int, int] = {}
|
||||
|
|
|
|||
|
|
@ -394,7 +394,10 @@ async def browse_recent_documents(
|
|||
Chunk.document_id,
|
||||
Chunk.content,
|
||||
func.row_number()
|
||||
.over(partition_by=Chunk.document_id, order_by=Chunk.id)
|
||||
.over(
|
||||
partition_by=Chunk.document_id,
|
||||
order_by=(Chunk.position, Chunk.id),
|
||||
)
|
||||
.label("rn"),
|
||||
)
|
||||
.where(Chunk.document_id.in_(doc_ids))
|
||||
|
|
@ -404,7 +407,7 @@ async def browse_recent_documents(
|
|||
chunk_query = (
|
||||
select(numbered.c.chunk_id, numbered.c.document_id, numbered.c.content)
|
||||
.where(numbered.c.rn <= _RECENCY_MAX_CHUNKS_PER_DOC)
|
||||
.order_by(numbered.c.document_id, numbered.c.chunk_id)
|
||||
.order_by(numbered.c.document_id, numbered.c.rn)
|
||||
)
|
||||
chunk_result = await session.execute(chunk_query)
|
||||
fetched_chunks = chunk_result.all()
|
||||
|
|
@ -531,7 +534,7 @@ async def fetch_mentioned_documents(
|
|||
chunk_result = await session.execute(
|
||||
select(Chunk.id, Chunk.content, Chunk.document_id)
|
||||
.where(Chunk.document_id.in_(list(docs.keys())))
|
||||
.order_by(Chunk.document_id, Chunk.id)
|
||||
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
|
||||
)
|
||||
chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs}
|
||||
for row in chunk_result.all():
|
||||
|
|
|
|||
|
|
@ -122,7 +122,7 @@ async def _browse_recent_documents(
|
|||
chunk_query = (
|
||||
select(Chunk)
|
||||
.where(Chunk.document_id.in_(doc_ids))
|
||||
.order_by(Chunk.document_id, Chunk.id)
|
||||
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
|
||||
)
|
||||
chunk_result = await session.execute(chunk_query)
|
||||
raw_chunks = chunk_result.scalars().all()
|
||||
|
|
|
|||
|
|
@ -420,7 +420,10 @@ class ChucksHybridSearchRetriever:
|
|||
select(
|
||||
Chunk.id.label("chunk_id"),
|
||||
func.row_number()
|
||||
.over(partition_by=Chunk.document_id, order_by=Chunk.id)
|
||||
.over(
|
||||
partition_by=Chunk.document_id,
|
||||
order_by=(Chunk.position, Chunk.id),
|
||||
)
|
||||
.label("rn"),
|
||||
)
|
||||
.where(Chunk.document_id.in_(doc_ids))
|
||||
|
|
@ -441,7 +444,7 @@ class ChucksHybridSearchRetriever:
|
|||
select(Chunk.id, Chunk.content, Chunk.document_id)
|
||||
.join(numbered, Chunk.id == numbered.c.chunk_id)
|
||||
.where(chunk_filter)
|
||||
.order_by(Chunk.document_id, Chunk.id)
|
||||
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
|
||||
)
|
||||
|
||||
t_fetch = time.perf_counter()
|
||||
|
|
|
|||
|
|
@ -357,7 +357,10 @@ class DocumentHybridSearchRetriever:
|
|||
select(
|
||||
Chunk.id.label("chunk_id"),
|
||||
func.row_number()
|
||||
.over(partition_by=Chunk.document_id, order_by=Chunk.id)
|
||||
.over(
|
||||
partition_by=Chunk.document_id,
|
||||
order_by=(Chunk.position, Chunk.id),
|
||||
)
|
||||
.label("rn"),
|
||||
)
|
||||
.where(Chunk.document_id.in_(doc_ids))
|
||||
|
|
@ -369,7 +372,7 @@ class DocumentHybridSearchRetriever:
|
|||
select(Chunk.id, Chunk.content, Chunk.document_id)
|
||||
.join(numbered, Chunk.id == numbered.c.chunk_id)
|
||||
.where(numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC)
|
||||
.order_by(Chunk.document_id, Chunk.id)
|
||||
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
|
||||
)
|
||||
|
||||
t_fetch = time.perf_counter()
|
||||
|
|
|
|||
|
|
@ -1014,8 +1014,8 @@ async def get_document_by_chunk_id(
|
|||
.filter(
|
||||
Chunk.document_id == document.id,
|
||||
or_(
|
||||
Chunk.created_at < chunk.created_at,
|
||||
and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id),
|
||||
Chunk.position < chunk.position,
|
||||
and_(Chunk.position == chunk.position, Chunk.id < chunk.id),
|
||||
),
|
||||
)
|
||||
)
|
||||
|
|
@ -1027,7 +1027,7 @@ async def get_document_by_chunk_id(
|
|||
windowed_result = await session.execute(
|
||||
select(Chunk)
|
||||
.filter(Chunk.document_id == document.id)
|
||||
.order_by(Chunk.created_at, Chunk.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
.offset(start)
|
||||
.limit(end - start)
|
||||
)
|
||||
|
|
@ -1137,7 +1137,7 @@ async def get_document_chunks_paginated(
|
|||
chunks_result = await session.execute(
|
||||
select(Chunk)
|
||||
.filter(Chunk.document_id == document_id)
|
||||
.order_by(Chunk.created_at, Chunk.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
.offset(offset)
|
||||
.limit(page_size)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -119,7 +119,7 @@ async def get_editor_content(
|
|||
chunk_contents_result = await session.execute(
|
||||
select(Chunk.content)
|
||||
.filter(Chunk.document_id == document_id)
|
||||
.order_by(Chunk.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunk_contents = chunk_contents_result.scalars().all()
|
||||
|
||||
|
|
@ -205,7 +205,7 @@ async def download_document_markdown(
|
|||
chunk_contents_result = await session.execute(
|
||||
select(Chunk.content)
|
||||
.filter(Chunk.document_id == document_id)
|
||||
.order_by(Chunk.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunk_contents = chunk_contents_result.scalars().all()
|
||||
if chunk_contents:
|
||||
|
|
@ -354,7 +354,7 @@ async def export_document(
|
|||
chunk_contents_result = await session.execute(
|
||||
select(Chunk.content)
|
||||
.filter(Chunk.document_id == document_id)
|
||||
.order_by(Chunk.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunk_contents = chunk_contents_result.scalars().all()
|
||||
if chunk_contents:
|
||||
|
|
|
|||
|
|
@ -156,7 +156,7 @@ async def _resolve_document_text(
|
|||
stmt = (
|
||||
select(Chunk.content)
|
||||
.where(Chunk.document_id == document.id)
|
||||
.order_by(Chunk.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
.limit(_MAX_CHUNKS_FOR_CONTEXT)
|
||||
)
|
||||
result = await session.execute(stmt)
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ async def _get_document_markdown(
|
|||
chunk_result = await session.execute(
|
||||
select(Chunk.content)
|
||||
.filter(Chunk.document_id == document.id)
|
||||
.order_by(Chunk.id)
|
||||
.order_by(Chunk.position, Chunk.id)
|
||||
)
|
||||
chunks = chunk_result.scalars().all()
|
||||
if chunks:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue