2024-08-27 21:55:12 +01:00
|
|
|
|
|
|
|
|
from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double
|
|
|
|
|
from . topic import topic
|
|
|
|
|
from . types import Error
|
2024-10-02 18:14:29 +01:00
|
|
|
from . metadata import Metadata
|
2024-08-27 21:55:12 +01:00
|
|
|
|
|
|
|
|
############################################################################
|
|
|
|
|
|
|
|
|
|
# PDF docs etc.
|
|
|
|
|
class Document(Record):
    # Schema record for a raw document (e.g. a PDF) carried as binary data.

    # Project-defined Metadata record (declared in the sibling metadata
    # module) that accompanies the payload.
    metadata = Metadata()

    # The document content as raw bytes.
    data = Bytes()
|
|
|
|
|
|
|
|
|
|
# Queue identifier for document ingestion, built by topic() from the
# name 'document-load'.
# NOTE(review): presumably carries Document records -- confirm with the
# producers/consumers of this queue.
document_ingest_queue = topic('document-load')
|
|
|
|
|
|
|
|
|
|
############################################################################
|
|
|
|
|
|
|
|
|
|
# Text documents / text from PDF
|
|
|
|
|
|
|
|
|
|
class TextDocument(Record):
    # Schema record for a text document, or for text extracted from a PDF.

    # Project-defined Metadata record (declared in the sibling metadata
    # module) that accompanies the payload.
    metadata = Metadata()

    # The text is carried as bytes rather than as a String field.
    # NOTE(review): the encoding is not stated here -- presumably UTF-8;
    # confirm with producers.
    text = Bytes()
|
|
|
|
|
|
|
|
|
|
# Queue identifier for text-document ingestion, built by topic() from the
# name 'text-document-load'.
# NOTE(review): presumably carries TextDocument records -- confirm with the
# producers/consumers of this queue.
text_ingest_queue = topic('text-document-load')
|
|
|
|
|
|
|
|
|
|
############################################################################
|
|
|
|
|
|
|
|
|
|
# Chunks of text
|
|
|
|
|
|
|
|
|
|
class Chunk(Record):
    # Schema record for a single chunk of text.

    # Project-defined Metadata record (declared in the sibling metadata
    # module) that accompanies the payload.
    metadata = Metadata()

    # The chunk content, carried as bytes rather than as a String field.
    # NOTE(review): the encoding is not stated here -- confirm with producers.
    chunk = Bytes()
|
|
|
|
|
|
|
|
|
|
# Queue identifier for chunk ingestion, built by topic() from the name
# 'chunk-load'.
# NOTE(review): presumably carries Chunk records -- confirm with the
# producers/consumers of this queue.
chunk_ingest_queue = topic('chunk-load')
|
|
|
|
|
|
|
|
|
|
############################################################################
|
|
|
|
|
|
2025-01-04 21:51:28 +00:00
|
|
|
# Document embeddings: each record pairs one chunk with its embedding vectors
|
|
|
|
|
|
|
|
|
|
class ChunkEmbeddings(Record):
    # Schema record pairing one chunk with the embedding vectors for it.

    # The chunk content as bytes.
    chunk = Bytes()

    # A list of vectors, each vector being a list of doubles.
    vectors = Array(Array(Double()))
|
|
|
|
|
|
|
|
|
|
# Batching record: groups several ChunkEmbeddings under one shared Metadata
|
|
|
|
|
class DocumentEmbeddings(Record):
    # Schema record that carries a list of ChunkEmbeddings together with a
    # single Metadata record.

    # Project-defined Metadata record (declared in the sibling metadata
    # module) shared by every entry in `chunks`.
    metadata = Metadata()

    # The batched per-chunk embeddings.
    chunks = Array(ChunkEmbeddings())
|
|
|
|
|
|
|
|
|
|
# Queue identifier for storing document embeddings, built by topic() from
# the name 'document-embeddings-store'.
# NOTE(review): presumably carries DocumentEmbeddings records -- confirm
# with the producers/consumers of this queue.
document_embeddings_store_queue = topic('document-embeddings-store')
|
|
|
|
|
|
|
|
|
|
############################################################################
|
|
|
|
|
|
2024-08-27 21:55:12 +01:00
|
|
|
# Doc embeddings query
|
|
|
|
|
|
|
|
|
|
class DocumentEmbeddingsRequest(Record):
    # Request record for a document-embeddings query.

    # Query vectors: a list of vectors, each a list of doubles.
    vectors = Array(Array(Double()))

    # Integer limit for the query.
    # NOTE(review): presumably the maximum number of documents to return --
    # confirm against the query service.
    limit = Integer()

    # User and collection names, as strings.
    # NOTE(review): presumably these scope which stored embeddings are
    # searched -- confirm against the query service.
    user = String()
    collection = String()
|
2024-08-27 21:55:12 +01:00
|
|
|
|
|
|
|
|
class DocumentEmbeddingsResponse(Record):
    # Response record for a document-embeddings query.

    # Project-defined Error record (declared in the sibling types module).
    # NOTE(review): presumably unset on success -- confirm against the
    # query service.
    error = Error()

    # The returned documents, each carried as bytes.
    documents = Array(Bytes())
|
|
|
|
|
|
|
|
|
|
# Queue identifier for document-embeddings query requests: the
# 'doc-embeddings' topic, non-persistent, in the 'request' namespace.
document_embeddings_request_queue = topic(
    'doc-embeddings',
    kind='non-persistent',
    namespace='request',
)
|
|
|
|
|
# Queue identifier for document-embeddings query responses: the
# 'doc-embeddings' topic, non-persistent, in the 'response' namespace.
document_embeddings_response_queue = topic(
    'doc-embeddings',
    kind='non-persistent',
    namespace='response',
)
|
2024-12-30 12:53:19 +00:00
|
|
|
|