feat: add document pipeline — PDF decoder, Ollama LLM, storage services

Add end-to-end document processing pipeline:
- PDF decoder service (pdfjs-dist) extracts text per page from librarian docs
- Ollama native LLM service for local model inference
- FalkorDB triples store FlowProcessor consumer
- Qdrant graph embeddings store FlowProcessor consumer
- Fix spec name collisions in chunker/extractor (input→chunk-input, etc.)
- Gateway /load endpoint to trigger document processing
- Align flow manager blueprint and seed config with full pipeline topics
- Add runner scripts and test coverage for document load

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
elpresidank 2026-04-06 23:47:43 -05:00
parent 8f9de7604e
commit 8f7008822a
20 changed files with 894 additions and 37 deletions

View file

@ -0,0 +1,91 @@
/**
* Graph embeddings store service vectorizes entity contexts and writes to Qdrant.
*
* A FlowProcessor that:
* 1. Consumes EntityContexts messages
* 2. Calls the embeddings service to vectorize entity context strings
* 3. Writes entity+vector pairs to Qdrant using QdrantGraphEmbeddingsStore
*
* Python reference: trustgraph-flow/trustgraph/storage/graph_embeddings/qdrant/service.py
*/
import {
FlowProcessor,
ConsumerSpec,
RequestResponseSpec,
type ProcessorConfig,
type FlowContext,
type EntityContexts,
type EmbeddingsRequest,
type EmbeddingsResponse,
} from "@trustgraph/base";
import { QdrantGraphEmbeddingsStore } from "./qdrant-graph.js";
/**
 * Flow processor that vectorizes entity contexts and persists the result.
 *
 * Registers two specifications on construction:
 * - a consumer named "store-graph-embeddings-input" handled by `onMessage`
 * - a request/response client named "embeddings-client" bound to
 *   "embeddings-request" / "embeddings-response"
 */
export class GraphEmbeddingsStoreService extends FlowProcessor {
  /** Store that receives the entity+vector pairs; assigned once in the constructor. */
  private readonly store: QdrantGraphEmbeddingsStore;

  /**
   * @param config - Processor configuration, forwarded unchanged to FlowProcessor.
   */
  constructor(config: ProcessorConfig) {
    super(config);
    this.store = new QdrantGraphEmbeddingsStore();

    this.registerSpecification(
      new ConsumerSpec<EntityContexts>(
        "store-graph-embeddings-input",
        this.onMessage.bind(this),
      ),
    );
    this.registerSpecification(
      new RequestResponseSpec<EmbeddingsRequest, EmbeddingsResponse>(
        "embeddings-client",
        "embeddings-request",
        "embeddings-response",
      ),
    );

    console.log("[GraphEmbeddingsStore] Service initialized");
  }

  /**
   * Handles one EntityContexts message: requests a vector for every entity
   * context, then writes the entity+vector pairs to the store.
   *
   * Messages without entities are ignored. If the embeddings response carries
   * an error, or fewer vectors than entities, the problem is logged and the
   * message is skipped without writing anything.
   *
   * @param msg - Incoming entity contexts; `metadata.user` and
   *   `metadata.collection` fall back to "default" when absent.
   * @param properties - Message properties (not used by this handler).
   * @param flowCtx - Flow context used to look up the "embeddings-client" requestor.
   */
  private async onMessage(
    msg: EntityContexts,
    properties: Record<string, string>,
    flowCtx: FlowContext,
  ): Promise<void> {
    if (!msg.entities || msg.entities.length === 0) return;

    const embeddingsClient =
      flowCtx.flow.requestor<EmbeddingsRequest, EmbeddingsResponse>("embeddings-client");

    const user = msg.metadata?.user ?? "default";
    const collection = msg.metadata?.collection ?? "default";

    // Get text contexts for vectorization
    const texts = msg.entities.map((e) => e.context);

    // Call embeddings service
    const embResponse = await embeddingsClient.request({ text: texts });
    if (embResponse.error) {
      console.error(
        "[GraphEmbeddingsStore] Embeddings error:",
        embResponse.error.message,
      );
      return;
    }

    // Entities are paired with vectors by index below, so a missing or short
    // vectors array would yield entries with an undefined vector. Refuse the
    // batch instead of handing incomplete points to the store.
    const received = embResponse.vectors?.length ?? 0;
    if (received < texts.length) {
      console.error(
        `[GraphEmbeddingsStore] Embeddings count mismatch: expected ${texts.length}, got ${received}`,
      );
      return;
    }

    // Store entity+vector pairs
    const entities = msg.entities.map((e, i) => ({
      entity: e.entity,
      vector: embResponse.vectors[i],
      chunkId: e.chunkId,
    }));
    await this.store.store({ user, collection, entities });

    console.log(
      `[GraphEmbeddingsStore] Stored ${entities.length} embeddings for ${user}/${collection}`,
    );
  }
}
/**
 * Entry point for the graph embeddings store: launches
 * GraphEmbeddingsStoreService under the "graph-embeddings-store" identifier
 * and resolves once `launch` has settled.
 */
export async function run(): Promise<void> {
  const processorId = "graph-embeddings-store";
  await GraphEmbeddingsStoreService.launch(processorId);
}

View file

@ -0,0 +1,54 @@
/**
* Triples store service writes RDF triples to FalkorDB via FlowProcessor.
*
* A FlowProcessor that:
* 1. Consumes Triples messages
* 2. Writes each triple to FalkorDB using FalkorDBTriplesStore
*
* Python reference: trustgraph-flow/trustgraph/storage/triples/falkordb/service.py
*/
import {
FlowProcessor,
ConsumerSpec,
type ProcessorConfig,
type FlowContext,
type Triples,
} from "@trustgraph/base";
import { FalkorDBTriplesStore } from "./falkordb.js";
/**
 * Flow processor that forwards consumed Triples messages to a
 * FalkorDBTriplesStore.
 *
 * Registers a single consumer named "store-triples-input" whose handler is
 * `onMessage`.
 */
export class TriplesStoreService extends FlowProcessor {
  /** Store that receives every non-empty batch of triples. */
  private store: FalkorDBTriplesStore;

  /**
   * @param config - Processor configuration, forwarded unchanged to FlowProcessor.
   */
  constructor(config: ProcessorConfig) {
    super(config);
    this.store = new FalkorDBTriplesStore();

    // Build the consumer spec in separate steps, then register it.
    const handler = this.onMessage.bind(this);
    const inputSpec = new ConsumerSpec<Triples>("store-triples-input", handler);
    this.registerSpecification(inputSpec);

    console.log("[TriplesStore] Service initialized");
  }

  /**
   * Writes the triples of one message to the store.
   *
   * Messages without triples are ignored. `metadata.user` and
   * `metadata.collection` fall back to "default" when absent.
   *
   * @param msg - Incoming triples message.
   * @param properties - Message properties (not used by this handler).
   * @param flowCtx - Flow context (not used by this handler).
   */
  private async onMessage(
    msg: Triples,
    properties: Record<string, string>,
    flowCtx: FlowContext,
  ): Promise<void> {
    const batch = msg.triples;
    if (!batch || batch.length === 0) return;

    const meta = msg.metadata;
    const owner = meta?.user ?? "default";
    const target = meta?.collection ?? "default";

    await this.store.storeTriples(batch, owner, target);

    console.log(
      `[TriplesStore] Stored ${batch.length} triples for ${owner}/${target}`,
    );
  }
}
/**
 * Entry point for the triples store: launches TriplesStoreService under the
 * "triples-store" identifier and resolves once `launch` has settled.
 */
export async function run(): Promise<void> {
  const processorId = "triples-store";
  await TriplesStoreService.launch(processorId);
}