trustgraph/trustgraph-base/trustgraph/schema/documents.py
cybermaggedon a9197d11ee
Feature/configure flows (#345)
- Keeps processing in different flows separate so that data can go to different stores / collections etc.
- Potentially supports different processing flows
- Tidies the processing API with common base-classes for e.g. LLMs, and automatic configuration of 'clients' to use the right queue names in a flow
2025-04-22 20:21:38 +01:00

56 lines
1.4 KiB
Python

from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double
from . topic import topic
from . types import Error
from . metadata import Metadata
############################################################################
# PDF docs etc.
class Document(Record):
    """Schema record for a source document (PDF docs etc.).

    The document body is carried as raw bytes alongside a ``Metadata``
    record.
    """

    # Project-level Metadata record (declared in the sibling `metadata`
    # module; its fields are not visible from this file).
    metadata = Metadata()

    # Raw document content.
    # NOTE(review): encoding/format of the payload is not specified
    # here -- confirm against producers.
    data = Bytes()
############################################################################
# Text documents / text from PDF
class TextDocument(Record):
    """Schema record for a text document, e.g. text extracted from a PDF.

    Note that the text is declared as ``Bytes`` rather than ``String``.
    """

    # Project-level Metadata record accompanying the text.
    metadata = Metadata()

    # Text content, transported as bytes.
    # NOTE(review): presumably UTF-8 encoded -- confirm with the
    # producers/consumers of this record.
    text = Bytes()
############################################################################
# Chunks of text
class Chunk(Record):
    """Schema record for a single chunk of text."""

    # Project-level Metadata record accompanying the chunk.
    metadata = Metadata()

    # The chunk content, transported as bytes.
    # NOTE(review): text encoding is not stated in this file -- confirm.
    chunk = Bytes()
############################################################################
# Document embeddings are embeddings associated with a chunk
class ChunkEmbeddings(Record):
    """Schema record pairing one chunk with its embeddings.

    ``vectors`` is an array of arrays of doubles, so a single chunk can
    carry more than one vector.
    """

    # The chunk content the embeddings are associated with, as bytes.
    chunk = Bytes()

    # Nested array of doubles holding the embeddings for `chunk`.
    # NOTE(review): presumably one inner array per embedding vector --
    # confirm dimensionality with the embeddings producer.
    vectors = Array(Array(Double()))
# This is a 'batching' mechanism for the above data
class DocumentEmbeddings(Record):
    """Schema record batching several ``ChunkEmbeddings`` entries.

    A single ``Metadata`` record is carried together with an array of
    ``ChunkEmbeddings`` records.
    """

    # Project-level Metadata record for the batch.
    metadata = Metadata()

    # The batched per-chunk embeddings.
    chunks = Array(ChunkEmbeddings())
############################################################################
# Doc embeddings query
class DocumentEmbeddingsRequest(Record):
    """Schema record for a document-embeddings query request."""

    # Query embeddings: an array of arrays of doubles.
    vectors = Array(Array(Double()))

    # NOTE(review): presumably the maximum number of results to
    # return -- confirm against the query service.
    limit = Integer()

    # NOTE(review): user and collection presumably scope the query to
    # a particular store partition -- confirm against the query service.
    user = String()
    collection = String()
class DocumentEmbeddingsResponse(Record):
    """Schema record for the reply to a document-embeddings query."""

    # Project-level Error record (declared in the sibling `types`
    # module).
    # NOTE(review): presumably left unset on success -- confirm with
    # the responder implementation.
    error = Error()

    # Result payloads, each transported as bytes.
    documents = Array(Bytes())