mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
fix(chunks): set position on remaining chunk insert paths
document_converters, the GitHub size-fallback chunker, revert_service restores, and the kb-persistence middleware now write explicit positions (the middleware read path also orders by position).
This commit is contained in:
parent
7d55aaf2c1
commit
5a71769dba
4 changed files with 43 additions and 12 deletions
|
|
@ -241,8 +241,15 @@ async def _create_document(
|
||||||
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
|
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
|
||||||
session.add_all(
|
session.add_all(
|
||||||
[
|
[
|
||||||
Chunk(document_id=doc.id, content=text, embedding=embedding)
|
Chunk(
|
||||||
for text, embedding in zip(chunks, chunk_embeddings, strict=True)
|
document_id=doc.id,
|
||||||
|
content=text,
|
||||||
|
embedding=embedding,
|
||||||
|
position=i,
|
||||||
|
)
|
||||||
|
for i, (text, embedding) in enumerate(
|
||||||
|
zip(chunks, chunk_embeddings, strict=True)
|
||||||
|
)
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
return doc
|
return doc
|
||||||
|
|
@ -289,8 +296,15 @@ async def _update_document(
|
||||||
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
|
chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
|
||||||
session.add_all(
|
session.add_all(
|
||||||
[
|
[
|
||||||
Chunk(document_id=document.id, content=text, embedding=embedding)
|
Chunk(
|
||||||
for text, embedding in zip(chunks, chunk_embeddings, strict=True)
|
document_id=document.id,
|
||||||
|
content=text,
|
||||||
|
embedding=embedding,
|
||||||
|
position=i,
|
||||||
|
)
|
||||||
|
for i, (text, embedding) in enumerate(
|
||||||
|
zip(chunks, chunk_embeddings, strict=True)
|
||||||
|
)
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
return document
|
return document
|
||||||
|
|
@ -475,7 +489,9 @@ async def _load_chunks_for_snapshot(
|
||||||
session: AsyncSession, *, doc_id: int
|
session: AsyncSession, *, doc_id: int
|
||||||
) -> list[dict[str, str]]:
|
) -> list[dict[str, str]]:
|
||||||
rows = await session.execute(
|
rows = await session.execute(
|
||||||
select(Chunk.content).where(Chunk.document_id == doc_id).order_by(Chunk.id)
|
select(Chunk.content)
|
||||||
|
.where(Chunk.document_id == doc_id)
|
||||||
|
.order_by(Chunk.position, Chunk.id)
|
||||||
)
|
)
|
||||||
return [{"content": row.content} for row in rows.all() if row.content is not None]
|
return [{"content": row.content} for row in rows.all() if row.content is not None]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -238,9 +238,14 @@ async def _restore_in_place_document(
|
||||||
chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
|
chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
|
||||||
session.add_all(
|
session.add_all(
|
||||||
[
|
[
|
||||||
Chunk(document_id=doc.id, content=text, embedding=embedding)
|
Chunk(
|
||||||
for text, embedding in zip(
|
document_id=doc.id,
|
||||||
chunk_texts, chunk_embeddings, strict=True
|
content=text,
|
||||||
|
embedding=embedding,
|
||||||
|
position=i,
|
||||||
|
)
|
||||||
|
for i, (text, embedding) in enumerate(
|
||||||
|
zip(chunk_texts, chunk_embeddings, strict=True)
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
@ -336,8 +341,15 @@ async def _reinsert_document_from_revision(
|
||||||
chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
|
chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
|
||||||
session.add_all(
|
session.add_all(
|
||||||
[
|
[
|
||||||
Chunk(document_id=new_doc.id, content=text, embedding=embedding)
|
Chunk(
|
||||||
for text, embedding in zip(chunk_texts, chunk_embeddings, strict=True)
|
document_id=new_doc.id,
|
||||||
|
content=text,
|
||||||
|
embedding=embedding,
|
||||||
|
position=i,
|
||||||
|
)
|
||||||
|
for i, (text, embedding) in enumerate(
|
||||||
|
zip(chunk_texts, chunk_embeddings, strict=True)
|
||||||
|
)
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -525,6 +525,7 @@ async def _simple_chunk_content(content: str, chunk_size: int = 4000) -> list:
|
||||||
Chunk(
|
Chunk(
|
||||||
content=chunk_text,
|
content=chunk_text,
|
||||||
embedding=embed_text(chunk_text),
|
embedding=embed_text(chunk_text),
|
||||||
|
position=len(chunks),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -188,8 +188,10 @@ async def create_document_chunks(content: str) -> list[Chunk]:
|
||||||
chunk_texts = [c.text for c in config.chunker_instance.chunk(content)]
|
chunk_texts = [c.text for c in config.chunker_instance.chunk(content)]
|
||||||
chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
|
chunk_embeddings = await asyncio.to_thread(embed_texts, chunk_texts)
|
||||||
return [
|
return [
|
||||||
Chunk(content=text, embedding=emb)
|
Chunk(content=text, embedding=emb, position=i)
|
||||||
for text, emb in zip(chunk_texts, chunk_embeddings, strict=False)
|
for i, (text, emb) in enumerate(
|
||||||
|
zip(chunk_texts, chunk_embeddings, strict=False)
|
||||||
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue