mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-02 02:58:10 +02:00
feat: add document pipeline — PDF decoder, Ollama LLM, storage services
Add end-to-end document processing pipeline: - PDF decoder service (pdfjs-dist) extracts text per page from librarian docs - Ollama native LLM service for local model inference - FalkorDB triples store FlowProcessor consumer - Qdrant graph embeddings store FlowProcessor consumer - Fix spec name collisions in chunker/extractor (input→chunk-input, etc.) - Gateway /load endpoint to trigger document processing - Align flow manager blueprint and seed config with full pipeline topics - Add runner scripts and test coverage for document load Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8f9de7604e
commit
8f7008822a
20 changed files with 894 additions and 37 deletions
|
|
@ -0,0 +1,91 @@
|
|||
/**
|
||||
* Graph embeddings store service — vectorizes entity contexts and writes to Qdrant.
|
||||
*
|
||||
* A FlowProcessor that:
|
||||
* 1. Consumes EntityContexts messages
|
||||
* 2. Calls the embeddings service to vectorize entity context strings
|
||||
* 3. Writes entity+vector pairs to Qdrant using QdrantGraphEmbeddingsStore
|
||||
*
|
||||
* Python reference: trustgraph-flow/trustgraph/storage/graph_embeddings/qdrant/service.py
|
||||
*/
|
||||
|
||||
import {
|
||||
FlowProcessor,
|
||||
ConsumerSpec,
|
||||
RequestResponseSpec,
|
||||
type ProcessorConfig,
|
||||
type FlowContext,
|
||||
type EntityContexts,
|
||||
type EmbeddingsRequest,
|
||||
type EmbeddingsResponse,
|
||||
} from "@trustgraph/base";
|
||||
import { QdrantGraphEmbeddingsStore } from "./qdrant-graph.js";
|
||||
|
||||
/**
 * FlowProcessor that vectorizes entity contexts and persists them via
 * `QdrantGraphEmbeddingsStore`.
 *
 * On construction it registers:
 *  - a consumer named "store-graph-embeddings-input" for `EntityContexts`
 *    messages, handled by `onMessage`;
 *  - a request/response client named "embeddings-client" bound to the
 *    "embeddings-request" / "embeddings-response" names.
 */
export class GraphEmbeddingsStoreService extends FlowProcessor {
  private readonly store: QdrantGraphEmbeddingsStore;

  constructor(config: ProcessorConfig) {
    super(config);
    this.store = new QdrantGraphEmbeddingsStore();

    this.registerSpecification(
      new ConsumerSpec<EntityContexts>(
        "store-graph-embeddings-input",
        this.onMessage.bind(this),
      ),
    );
    this.registerSpecification(
      new RequestResponseSpec<EmbeddingsRequest, EmbeddingsResponse>(
        "embeddings-client",
        "embeddings-request",
        "embeddings-response",
      ),
    );

    console.log("[GraphEmbeddingsStore] Service initialized");
  }

  /**
   * Handles one `EntityContexts` message: requests embeddings for every
   * entity context in a single call, then stores entity/vector pairs.
   *
   * Messages without entities are ignored. If the embeddings response
   * carries an error, or does not contain a vector for every entity, the
   * problem is logged and nothing is stored for this message.
   *
   * @param msg - Incoming message; `metadata.user` and `metadata.collection`
   *   fall back to "default" when absent.
   * @param properties - Message properties (unused here).
   * @param flowCtx - Flow context used to look up the "embeddings-client" requestor.
   */
  private async onMessage(
    msg: EntityContexts,
    properties: Record<string, string>,
    flowCtx: FlowContext,
  ): Promise<void> {
    if (!msg.entities || msg.entities.length === 0) return;

    const embeddingsClient =
      flowCtx.flow.requestor<EmbeddingsRequest, EmbeddingsResponse>("embeddings-client");

    const user = msg.metadata?.user ?? "default";
    const collection = msg.metadata?.collection ?? "default";

    // One context string per entity, in entity order, so that the returned
    // vectors can be matched back by index.
    const texts = msg.entities.map((e) => e.context);

    const embResponse = await embeddingsClient.request({ text: texts });
    if (embResponse.error) {
      console.error(
        "[GraphEmbeddingsStore] Embeddings error:",
        embResponse.error.message,
      );
      return;
    }

    // Guard the positional pairing below: a short or missing `vectors` array
    // would otherwise hand `undefined` vectors to the store.
    const vectors = embResponse.vectors ?? [];
    if (vectors.length < msg.entities.length) {
      console.error(
        `[GraphEmbeddingsStore] Embeddings count mismatch: expected ${msg.entities.length} vectors, got ${vectors.length}; skipping store`,
      );
      return;
    }

    // Pair each entity with the vector at the same index.
    const entities = msg.entities.map((e, i) => ({
      entity: e.entity,
      vector: vectors[i],
      chunkId: e.chunkId,
    }));

    await this.store.store({ user, collection, entities });

    console.log(
      `[GraphEmbeddingsStore] Stored ${entities.length} embeddings for ${user}/${collection}`,
    );
  }
}
|
||||
|
||||
/**
 * Entry point for the graph embeddings store: awaits
 * `GraphEmbeddingsStoreService.launch` with the id "graph-embeddings-store".
 */
export async function run(): Promise<void> {
  const processorId = "graph-embeddings-store";
  await GraphEmbeddingsStoreService.launch(processorId);
}
|
||||
54
ts/packages/flow/src/storage/triples/falkordb-service.ts
Normal file
54
ts/packages/flow/src/storage/triples/falkordb-service.ts
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
/**
|
||||
* Triples store service — writes RDF triples to FalkorDB via FlowProcessor.
|
||||
*
|
||||
* A FlowProcessor that:
|
||||
* 1. Consumes Triples messages
|
||||
* 2. Writes each triple to FalkorDB using FalkorDBTriplesStore
|
||||
*
|
||||
* Python reference: trustgraph-flow/trustgraph/storage/triples/falkordb/service.py
|
||||
*/
|
||||
|
||||
import {
|
||||
FlowProcessor,
|
||||
ConsumerSpec,
|
||||
type ProcessorConfig,
|
||||
type FlowContext,
|
||||
type Triples,
|
||||
} from "@trustgraph/base";
|
||||
import { FalkorDBTriplesStore } from "./falkordb.js";
|
||||
|
||||
/**
 * FlowProcessor that forwards incoming `Triples` messages to a
 * `FalkorDBTriplesStore`.
 *
 * Registers a single consumer named "store-triples-input" whose handler
 * is `onMessage`.
 */
export class TriplesStoreService extends FlowProcessor {
  private store: FalkorDBTriplesStore;

  constructor(config: ProcessorConfig) {
    super(config);

    this.store = new FalkorDBTriplesStore();

    const handler = this.onMessage.bind(this);
    const inputSpec = new ConsumerSpec<Triples>("store-triples-input", handler);
    this.registerSpecification(inputSpec);

    console.log("[TriplesStore] Service initialized");
  }

  /**
   * Writes the triples of one message to the store.
   *
   * Messages with no triples are ignored. `metadata.user` and
   * `metadata.collection` default to "default" when absent.
   *
   * @param msg - Incoming triples message.
   * @param properties - Message properties (unused here).
   * @param flowCtx - Flow context (unused here).
   */
  private async onMessage(
    msg: Triples,
    properties: Record<string, string>,
    flowCtx: FlowContext,
  ): Promise<void> {
    // Nothing to persist for a missing or empty triple list.
    if (!msg.triples?.length) {
      return;
    }

    const meta = msg.metadata;
    const user = meta?.user ?? "default";
    const collection = meta?.collection ?? "default";

    await this.store.storeTriples(msg.triples, user, collection);

    const summary = `[TriplesStore] Stored ${msg.triples.length} triples for ${user}/${collection}`;
    console.log(summary);
  }
}
|
||||
|
||||
/**
 * Entry point for the triples store: awaits `TriplesStoreService.launch`
 * with the id "triples-store".
 */
export async function run(): Promise<void> {
  const processorId = "triples-store";
  await TriplesStoreService.launch(processorId);
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue