refactor(chunks): order chunk reads by (document_id, position)

Presentation and citation ordering now sorts by the explicit
Chunk.position column instead of Chunk.id/created_at, keeping Chunk.id
as a tiebreaker. The order_by clauses used for vector and ts_rank
ranking are untouched.
This commit is contained in:
CREDO23 2026-06-12 18:53:21 +02:00
parent 5a71769dba
commit 052e9ef4d1
9 changed files with 28 additions and 19 deletions

View file

@ -508,7 +508,7 @@ class KBPostgresBackend(BackendProtocol):
chunk_rows = await session.execute(
select(Chunk.id, Chunk.content)
.where(Chunk.document_id == document.id)
.order_by(Chunk.id)
.order_by(Chunk.position, Chunk.id)
)
chunks = [
{"chunk_id": row.id, "content": row.content} for row in chunk_rows.all()
@ -725,7 +725,7 @@ class KBPostgresBackend(BackendProtocol):
.join(Document, Document.id == Chunk.document_id)
.where(Document.search_space_id == self.search_space_id)
.where(Chunk.content.ilike(f"%{pattern}%"))
.order_by(Chunk.document_id, Chunk.id)
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
)
chunk_rows = await session.execute(sub)
per_doc: dict[int, int] = {}

View file

@ -394,7 +394,10 @@ async def browse_recent_documents(
Chunk.document_id,
Chunk.content,
func.row_number()
.over(partition_by=Chunk.document_id, order_by=Chunk.id)
.over(
partition_by=Chunk.document_id,
order_by=(Chunk.position, Chunk.id),
)
.label("rn"),
)
.where(Chunk.document_id.in_(doc_ids))
@ -404,7 +407,7 @@ async def browse_recent_documents(
chunk_query = (
select(numbered.c.chunk_id, numbered.c.document_id, numbered.c.content)
.where(numbered.c.rn <= _RECENCY_MAX_CHUNKS_PER_DOC)
.order_by(numbered.c.document_id, numbered.c.chunk_id)
.order_by(numbered.c.document_id, numbered.c.rn)
)
chunk_result = await session.execute(chunk_query)
fetched_chunks = chunk_result.all()
@ -531,7 +534,7 @@ async def fetch_mentioned_documents(
chunk_result = await session.execute(
select(Chunk.id, Chunk.content, Chunk.document_id)
.where(Chunk.document_id.in_(list(docs.keys())))
.order_by(Chunk.document_id, Chunk.id)
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
)
chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs}
for row in chunk_result.all():

View file

@ -122,7 +122,7 @@ async def _browse_recent_documents(
chunk_query = (
select(Chunk)
.where(Chunk.document_id.in_(doc_ids))
.order_by(Chunk.document_id, Chunk.id)
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
)
chunk_result = await session.execute(chunk_query)
raw_chunks = chunk_result.scalars().all()

View file

@ -420,7 +420,10 @@ class ChucksHybridSearchRetriever:
select(
Chunk.id.label("chunk_id"),
func.row_number()
.over(partition_by=Chunk.document_id, order_by=Chunk.id)
.over(
partition_by=Chunk.document_id,
order_by=(Chunk.position, Chunk.id),
)
.label("rn"),
)
.where(Chunk.document_id.in_(doc_ids))
@ -441,7 +444,7 @@ class ChucksHybridSearchRetriever:
select(Chunk.id, Chunk.content, Chunk.document_id)
.join(numbered, Chunk.id == numbered.c.chunk_id)
.where(chunk_filter)
.order_by(Chunk.document_id, Chunk.id)
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
)
t_fetch = time.perf_counter()

View file

@ -357,7 +357,10 @@ class DocumentHybridSearchRetriever:
select(
Chunk.id.label("chunk_id"),
func.row_number()
.over(partition_by=Chunk.document_id, order_by=Chunk.id)
.over(
partition_by=Chunk.document_id,
order_by=(Chunk.position, Chunk.id),
)
.label("rn"),
)
.where(Chunk.document_id.in_(doc_ids))
@ -369,7 +372,7 @@ class DocumentHybridSearchRetriever:
select(Chunk.id, Chunk.content, Chunk.document_id)
.join(numbered, Chunk.id == numbered.c.chunk_id)
.where(numbered.c.rn <= _MAX_FETCH_CHUNKS_PER_DOC)
.order_by(Chunk.document_id, Chunk.id)
.order_by(Chunk.document_id, Chunk.position, Chunk.id)
)
t_fetch = time.perf_counter()

View file

@ -1014,8 +1014,8 @@ async def get_document_by_chunk_id(
.filter(
Chunk.document_id == document.id,
or_(
Chunk.created_at < chunk.created_at,
and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id),
Chunk.position < chunk.position,
and_(Chunk.position == chunk.position, Chunk.id < chunk.id),
),
)
)
@ -1027,7 +1027,7 @@ async def get_document_by_chunk_id(
windowed_result = await session.execute(
select(Chunk)
.filter(Chunk.document_id == document.id)
.order_by(Chunk.created_at, Chunk.id)
.order_by(Chunk.position, Chunk.id)
.offset(start)
.limit(end - start)
)
@ -1137,7 +1137,7 @@ async def get_document_chunks_paginated(
chunks_result = await session.execute(
select(Chunk)
.filter(Chunk.document_id == document_id)
.order_by(Chunk.created_at, Chunk.id)
.order_by(Chunk.position, Chunk.id)
.offset(offset)
.limit(page_size)
)

View file

@ -119,7 +119,7 @@ async def get_editor_content(
chunk_contents_result = await session.execute(
select(Chunk.content)
.filter(Chunk.document_id == document_id)
.order_by(Chunk.id)
.order_by(Chunk.position, Chunk.id)
)
chunk_contents = chunk_contents_result.scalars().all()
@ -205,7 +205,7 @@ async def download_document_markdown(
chunk_contents_result = await session.execute(
select(Chunk.content)
.filter(Chunk.document_id == document_id)
.order_by(Chunk.id)
.order_by(Chunk.position, Chunk.id)
)
chunk_contents = chunk_contents_result.scalars().all()
if chunk_contents:
@ -354,7 +354,7 @@ async def export_document(
chunk_contents_result = await session.execute(
select(Chunk.content)
.filter(Chunk.document_id == document_id)
.order_by(Chunk.id)
.order_by(Chunk.position, Chunk.id)
)
chunk_contents = chunk_contents_result.scalars().all()
if chunk_contents:

View file

@ -156,7 +156,7 @@ async def _resolve_document_text(
stmt = (
select(Chunk.content)
.where(Chunk.document_id == document.id)
.order_by(Chunk.id)
.order_by(Chunk.position, Chunk.id)
.limit(_MAX_CHUNKS_FOR_CONTEXT)
)
result = await session.execute(stmt)

View file

@ -62,7 +62,7 @@ async def _get_document_markdown(
chunk_result = await session.execute(
select(Chunk.content)
.filter(Chunk.document_id == document.id)
.order_by(Chunk.id)
.order_by(Chunk.position, Chunk.id)
)
chunks = chunk_result.scalars().all()
if chunks: