feat: add document pipeline — PDF decoder, Ollama LLM, storage services

Add end-to-end document processing pipeline:
- PDF decoder service (pdfjs-dist) extracts text per page from librarian docs
- Ollama native LLM service for local model inference
- FalkorDB triples store FlowProcessor consumer
- Qdrant graph embeddings store FlowProcessor consumer
- Fix spec name collisions in chunker/extractor (input→chunk-input, etc.)
- Gateway /load endpoint to trigger document processing
- Align flow manager blueprint and seed config with full pipeline topics
- Add runner scripts and test coverage for document load

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
elpresidank 2026-04-06 23:47:43 -05:00
parent 8f9de7604e
commit 8f7008822a
20 changed files with 894 additions and 37 deletions

View file

@ -0,0 +1,6 @@
import { run } from "../packages/flow/src/storage/embeddings/graph-embeddings-service.js";
// Launch the graph embeddings store service. If the promise returned by
// run() rejects, log the reason with console.error and exit with status 1.
run().then(undefined, (reason) => {
  console.error("Graph embeddings store service failed:", reason);
  process.exit(1);
});

16
ts/scripts/run-ollama.ts Normal file
View file

@ -0,0 +1,16 @@
/**
* Start the Ollama text-completion service.
*
* Usage: pnpm tsx scripts/run-ollama.ts
*
* Env:
* NATS_URL (default: nats://localhost:4222)
* OLLAMA_URL (default: http://localhost:11434)
* OLLAMA_MODEL (default: qwen2.5:0.5b)
*/
import { run } from "../packages/flow/src/model/text-completion/ollama.js";
// Rejection handler for the service promise: log the cause, then terminate
// the process with a non-zero exit code.
const reportOllamaFailure = (cause: unknown): void => {
  console.error("Ollama LLM service failed:", cause);
  process.exit(1);
};

// Start the service; any rejection from run() is routed to the handler above.
run().catch(reportOllamaFailure);

View file

@ -0,0 +1,14 @@
/**
* Start the PDF decoder service.
*
* Usage: pnpm tsx scripts/run-pdf-decoder.ts
*
* Env:
* NATS_URL (default: nats://localhost:4222)
*/
import { run } from "../packages/flow/src/decoding/pdf-decoder.js";
// Start the PDF decoder service and attach a rejection handler.
run().catch(onPdfDecoderFailure);

/**
 * Logs the rejection reason from run() and exits the process with status 1.
 */
function onPdfDecoderFailure(failure: unknown): void {
  console.error("PDF decoder service failed:", failure);
  process.exit(1);
}

View file

@ -0,0 +1,6 @@
import { run } from "../packages/flow/src/storage/triples/falkordb-service.js";
// Launch the triples store service. A rejected run() promise is logged via
// console.error and the process then exits with status 1.
run().then(undefined, (problem) => {
  console.error("Triples store service failed:", problem);
  process.exit(1);
});

View file

@ -95,6 +95,19 @@ async function main(): Promise<void> {
await pushConfig(["flows"], {
default: {
topics: {
// Document processing pipeline
"decode-input": "tg.flow.document",
"decode-output": "tg.flow.text-document",
"decode-triples": "tg.flow.triples",
"chunk-input": "tg.flow.text-document",
"chunk-output": "tg.flow.chunk",
"chunk-triples": "tg.flow.triples",
"extract-input": "tg.flow.chunk",
"extract-triples": "tg.flow.triples",
"extract-entity-contexts": "tg.flow.entity-contexts",
// Storage consumers
"store-triples-input": "tg.flow.triples",
"store-graph-embeddings-input": "tg.flow.entity-contexts",
// LLM text completion
"text-completion-request": "tg.flow.text-completion-request",
"text-completion-response": "tg.flow.text-completion-response",
@ -107,17 +120,18 @@ async function main(): Promise<void> {
// Document RAG
"document-rag-request": "tg.flow.document-rag-request",
"document-rag-response": "tg.flow.document-rag-response",
// Triples
// Triples query
"triples-request": "tg.flow.triples-request",
"triples-response": "tg.flow.triples-response",
// Agent
"agent-request": "tg.flow.agent-request",
"agent-response": "tg.flow.agent-response",
// Chunking pipeline
"input": "tg.flow.chunk",
"output": "tg.flow.chunk",
"triples": "tg.flow.triples",
"entity-contexts": "tg.flow.entity-contexts",
// Embeddings
"embeddings-request": "tg.flow.embeddings-request",
"embeddings-response": "tg.flow.embeddings-response",
// Librarian RPC (for PDF decoder)
"librarian-request": "tg.flow.librarian-request",
"librarian-response": "tg.flow.librarian-response",
},
},
});

View file

@ -134,22 +134,43 @@ async function testPushFlowConfig(): Promise<boolean> {
values: {
default: {
topics: {
// Document processing pipeline
"decode-input": "tg.flow.document",
"decode-output": "tg.flow.text-document",
"decode-triples": "tg.flow.triples",
"chunk-input": "tg.flow.text-document",
"chunk-output": "tg.flow.chunk",
"chunk-triples": "tg.flow.triples",
"extract-input": "tg.flow.chunk",
"extract-triples": "tg.flow.triples",
"extract-entity-contexts": "tg.flow.entity-contexts",
// Storage consumers
"store-triples-input": "tg.flow.triples",
"store-graph-embeddings-input": "tg.flow.entity-contexts",
// LLM text completion
"text-completion-request": "tg.flow.text-completion-request",
"text-completion-response": "tg.flow.text-completion-response",
// Prompt service
"prompt-request": "tg.flow.prompt-request",
"prompt-response": "tg.flow.prompt-response",
// Graph RAG
"graph-rag-request": "tg.flow.graph-rag-request",
"graph-rag-response": "tg.flow.graph-rag-response",
// Document RAG
"document-rag-request": "tg.flow.document-rag-request",
"document-rag-response": "tg.flow.document-rag-response",
// Triples query
"triples-request": "tg.flow.triples-request",
"triples-response": "tg.flow.triples-response",
// Agent
"agent-request": "tg.flow.agent-request",
"agent-response": "tg.flow.agent-response",
"input": "tg.flow.chunk",
"output": "tg.flow.chunk",
"triples": "tg.flow.triples",
"entity-contexts": "tg.flow.entity-contexts",
// Embeddings
"embeddings-request": "tg.flow.embeddings-request",
"embeddings-response": "tg.flow.embeddings-response",
// Librarian RPC (for PDF decoder)
"librarian-request": "tg.flow.librarian-request",
"librarian-response": "tg.flow.librarian-response",
},
},
},
@ -373,6 +394,69 @@ async function testLibrarianDelete(): Promise<boolean> {
}
}
// ─── Document Load Test ──────────────────────────────────────────────
/**
 * Integration test for the gateway document-load endpoint.
 *
 * Steps:
 *  1. Upload a small test document through the librarian endpoint.
 *  2. POST to the `default` flow's `/load` endpoint with the new document id.
 *  3. Report success when the response body has `status === "processing"`.
 *  4. Remove the uploaded document again, whatever the outcome of steps 2–3.
 *
 * Note: the uploaded bytes are plain text labelled as "application/pdf"; this
 * test only checks that the load request is accepted, not that decoding works.
 *
 * @returns `true` when the load was accepted, `false` on any failure (the
 *          failure is reported through `fail`; this function does not throw).
 */
async function testDocumentLoad(): Promise<boolean> {
  // Set once the upload succeeded; drives the cleanup in `finally`.
  let docId: string | undefined;

  try {
    // First upload a test document via librarian
    const content = Buffer.from("Test document for pipeline processing.").toString("base64");
    const addRes = await post("/api/v1/librarian", {
      operation: "add-document",
      user: "test-user",
      collection: "test-collection",
      content,
      documentMetadata: {
        id: "",
        time: Date.now(),
        kind: "application/pdf",
        title: "Test Pipeline Document",
        comments: "",
        user: "test-user",
        tags: ["test"],
        documentType: "source",
      },
    }) as Record<string, unknown>;

    const meta = addRes.documentMetadata as Record<string, unknown> | undefined;
    const uploadedId = meta?.id;
    // Narrow at runtime instead of casting: a missing, empty, or non-string id
    // means the upload did not produce a usable document.
    if (typeof uploadedId !== "string" || uploadedId === "") {
      fail("Document load", "failed to upload test document");
      return false;
    }
    docId = uploadedId;

    // Trigger document processing via the load endpoint
    const res = await fetch(`${GATEWAY_URL}/api/v1/flow/default/load`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        documentId: docId,
        user: "test-user",
        collection: "test-collection",
      }),
    });

    const data = await res.json() as Record<string, unknown>;
    log("document-load", data);

    if (data.status !== "processing") {
      fail("Document load", "unexpected response");
      return false;
    }

    pass(`Document load triggered for ${docId.slice(0, 8)}...`);
    return true;
  } catch (err) {
    fail("Document load", err);
    return false;
  } finally {
    // Clean up the test document on every path once it exists, so failed runs
    // do not leave documents behind. Cleanup is best-effort: a failure here is
    // logged but does not change the already-reported test outcome.
    if (docId !== undefined) {
      try {
        await post("/api/v1/librarian", {
          operation: "remove-document",
          documentId: docId,
          user: "test-user",
        });
      } catch (cleanupErr) {
        console.warn(`Document load: failed to remove test document ${docId}:`, cleanupErr);
      }
    }
  }
}
// ─── Agent Test ───────────────────────────────────────────────────────
async function testAgentQuery(): Promise<boolean> {
@ -444,6 +528,14 @@ async function main(): Promise<void> {
// Flow config push
await run("Push Flow Config", testPushFlowConfig);
// Document pipeline load test (requires librarian + gateway)
if (process.env.SKIP_PIPELINE !== "1" && process.env.SKIP_LIBRARIAN !== "1") {
console.log("\n (Testing document load — set SKIP_PIPELINE=1 to skip)");
await run("Document Load", testDocumentLoad);
} else {
console.log("\n (Skipping document pipeline load test)");
}
// LLM test (only if a running LLM service is available)
if (process.env.SKIP_LLM !== "1") {
console.log("\n (Testing text-completion — set SKIP_LLM=1 to skip)");