trustgraph/trustgraph_configurator/templates/2.2/flows/load.jsonnet

54 lines
2 KiB
Jsonnet
Raw Normal View History

// Document loading and preprocessing module
// Handles document ingestion, format conversion, and chunking
// Converts PDFs to text and splits documents into processable chunks
local helpers = import "helpers.jsonnet";
local flow = helpers.flow;
local request = helpers.request;
local response = helpers.response;
local request_response = helpers.request_response;
local librarian_request = helpers.librarian_request;
local librarian_response = helpers.librarian_response;
// Import shared services (load requires embeddings for chunk processing)
local embeddings_service = import "embeddings-service.jsonnet";
// Queues that are referenced from more than one place in this module.
local document_queue = flow("document-load:{id}");
local text_queue = flow("text-document-load:{id}");

// Wiring shared by both preprocessing stages: a triples output plus the
// librarian request/response pair (per the stage notes below, used for
// provenance triples and for saving pages/chunks via the librarian).
local provenance_wiring = {
    triples: flow("triples-store:{id}"),
    "librarian-request": librarian_request,
    "librarian-response": librarian_response,
};

// Layer the load-specific definitions on top of the shared embeddings service.
embeddings_service + {

    // Entry points exposed for submitting documents and plain text
    "interfaces" +: {
        "document-load": document_queue,
        "text-load": text_queue,
    },

    // Per-flow preprocessing stages
    "flow" +: {

        // Stage 1: PDF decoder, turns loaded PDF documents into text
        // documents; also emits page provenance triples and saves pages
        // via the librarian.
        "document-decoder:{id}": provenance_wiring + {
            input: document_queue,
            output: text_queue,
        },

        // Stage 2: chunker, splits text documents into smaller pieces;
        // also emits chunk provenance triples and saves chunks via the
        // librarian. Size and overlap are left as "{...}" placeholders.
        "chunker:{id}": provenance_wiring + {
            input: text_queue,
            output: flow("chunk-load:{id}"),
            "chunk-size": "{chunk-size}",
            "chunk-overlap": "{chunk-overlap}",
        },

    },

    // Blueprint-level section; this module adds no processors of its own here
    "blueprint" +: {
    },

}