feat: add document pipeline — PDF decoder, Ollama LLM, storage services

Add end-to-end document processing pipeline:

- PDF decoder service (pdfjs-dist) extracts text per page from librarian docs
- Ollama native LLM service for local model inference
- FalkorDB triples store FlowProcessor consumer
- Qdrant graph embeddings store FlowProcessor consumer
- Fix spec name collisions in chunker/extractor (input→chunk-input, etc.)
- Gateway /load endpoint to trigger document processing
- Align flow manager blueprint and seed config with full pipeline topics
- Add runner scripts and test coverage for document load

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-07-01 09:29:38 +02:00 · 2026-04-06 23:47:43 -05:00 · 2026-04-06 23:47:43 -05:00 · 8f7008822a
commit 8f7008822a
parent 8f9de7604e
20 changed files with 894 additions and 37 deletions
--- a/ts/scripts/run-graph-embeddings-store.ts
+++ b/ts/scripts/run-graph-embeddings-store.ts
@ -0,0 +1,6 @@
+/**
+ * Start the graph embeddings store service.
+ *
+ * Usage (presumably, matching the sibling runner scripts):
+ *   pnpm tsx scripts/run-graph-embeddings-store.ts
+ */
+import { run } from "../packages/flow/src/storage/embeddings/graph-embeddings-service.js";
+
+/** Reports a fatal service error and terminates the process with exit code 1. */
+function onFatal(error: unknown): never {
+  console.error("Graph embeddings store service failed:", error);
+  process.exit(1);
+}
+
+run().catch(onFatal);
--- a/ts/scripts/run-ollama.ts
+++ b/ts/scripts/run-ollama.ts
@ -0,0 +1,16 @@
+/**
+ * Entry point that launches the Ollama text-completion service.
+ *
+ * Usage: pnpm tsx scripts/run-ollama.ts
+ *
+ * Environment variables:
+ *   NATS_URL     (default: nats://localhost:4222)
+ *   OLLAMA_URL   (default: http://localhost:11434)
+ *   OLLAMA_MODEL (default: qwen2.5:0.5b)
+ */
+import { run } from "../packages/flow/src/model/text-completion/ollama.js";
+
+/** Reports a fatal service error and terminates the process with exit code 1. */
+function onFatal(error: unknown): never {
+  console.error("Ollama LLM service failed:", error);
+  process.exit(1);
+}
+
+run().catch(onFatal);
--- a/ts/scripts/run-pdf-decoder.ts
+++ b/ts/scripts/run-pdf-decoder.ts
@ -0,0 +1,14 @@
+/**
+ * Entry point that launches the PDF decoder service.
+ *
+ * Usage: pnpm tsx scripts/run-pdf-decoder.ts
+ *
+ * Environment variables:
+ *   NATS_URL (default: nats://localhost:4222)
+ */
+import { run } from "../packages/flow/src/decoding/pdf-decoder.js";
+
+/** Reports a fatal service error and terminates the process with exit code 1. */
+function onFatal(error: unknown): never {
+  console.error("PDF decoder service failed:", error);
+  process.exit(1);
+}
+
+run().catch(onFatal);
--- a/ts/scripts/run-triples-store.ts
+++ b/ts/scripts/run-triples-store.ts
@ -0,0 +1,6 @@
+/**
+ * Start the triples store service.
+ *
+ * Usage (presumably, matching the sibling runner scripts):
+ *   pnpm tsx scripts/run-triples-store.ts
+ */
+import { run } from "../packages/flow/src/storage/triples/falkordb-service.js";
+
+/** Reports a fatal service error and terminates the process with exit code 1. */
+function onFatal(error: unknown): never {
+  console.error("Triples store service failed:", error);
+  process.exit(1);
+}
+
+run().catch(onFatal);
--- a/ts/scripts/seed-config.ts
+++ b/ts/scripts/seed-config.ts
@ -95,6 +95,19 @@ async function main(): Promise<void> {
  await pushConfig(["flows"], {
    default: {
      topics: {
+        // Document processing pipeline
+        "decode-input": "tg.flow.document",
+        "decode-output": "tg.flow.text-document",
+        "decode-triples": "tg.flow.triples",
+        "chunk-input": "tg.flow.text-document",
+        "chunk-output": "tg.flow.chunk",
+        "chunk-triples": "tg.flow.triples",
+        "extract-input": "tg.flow.chunk",
+        "extract-triples": "tg.flow.triples",
+        "extract-entity-contexts": "tg.flow.entity-contexts",
+        // Storage consumers
+        "store-triples-input": "tg.flow.triples",
+        "store-graph-embeddings-input": "tg.flow.entity-contexts",
        // LLM text completion
        "text-completion-request": "tg.flow.text-completion-request",
        "text-completion-response": "tg.flow.text-completion-response",
@ -107,17 +120,18 @@ async function main(): Promise<void> {
        // Document RAG
        "document-rag-request": "tg.flow.document-rag-request",
        "document-rag-response": "tg.flow.document-rag-response",
-        // Triples
+        // Triples query
        "triples-request": "tg.flow.triples-request",
        "triples-response": "tg.flow.triples-response",
        // Agent
        "agent-request": "tg.flow.agent-request",
        "agent-response": "tg.flow.agent-response",
-        // Chunking pipeline
-        "input": "tg.flow.chunk",
-        "output": "tg.flow.chunk",
-        "triples": "tg.flow.triples",
-        "entity-contexts": "tg.flow.entity-contexts",
+        // Embeddings
+        "embeddings-request": "tg.flow.embeddings-request",
+        "embeddings-response": "tg.flow.embeddings-response",
+        // Librarian RPC (for PDF decoder)
+        "librarian-request": "tg.flow.librarian-request",
+        "librarian-response": "tg.flow.librarian-response",
      },
    },
  });
--- a/ts/scripts/test-pipeline.ts
+++ b/ts/scripts/test-pipeline.ts
@ -134,22 +134,43 @@ async function testPushFlowConfig(): Promise<boolean> {
      values: {
        default: {
          topics: {
+            // Document processing pipeline
+            "decode-input": "tg.flow.document",
+            "decode-output": "tg.flow.text-document",
+            "decode-triples": "tg.flow.triples",
+            "chunk-input": "tg.flow.text-document",
+            "chunk-output": "tg.flow.chunk",
+            "chunk-triples": "tg.flow.triples",
+            "extract-input": "tg.flow.chunk",
+            "extract-triples": "tg.flow.triples",
+            "extract-entity-contexts": "tg.flow.entity-contexts",
+            // Storage consumers
+            "store-triples-input": "tg.flow.triples",
+            "store-graph-embeddings-input": "tg.flow.entity-contexts",
+            // LLM text completion
            "text-completion-request": "tg.flow.text-completion-request",
            "text-completion-response": "tg.flow.text-completion-response",
+            // Prompt service
            "prompt-request": "tg.flow.prompt-request",
            "prompt-response": "tg.flow.prompt-response",
+            // Graph RAG
            "graph-rag-request": "tg.flow.graph-rag-request",
            "graph-rag-response": "tg.flow.graph-rag-response",
+            // Document RAG
            "document-rag-request": "tg.flow.document-rag-request",
            "document-rag-response": "tg.flow.document-rag-response",
+            // Triples query
            "triples-request": "tg.flow.triples-request",
            "triples-response": "tg.flow.triples-response",
+            // Agent
            "agent-request": "tg.flow.agent-request",
            "agent-response": "tg.flow.agent-response",
-            "input": "tg.flow.chunk",
-            "output": "tg.flow.chunk",
-            "triples": "tg.flow.triples",
-            "entity-contexts": "tg.flow.entity-contexts",
+            // Embeddings
+            "embeddings-request": "tg.flow.embeddings-request",
+            "embeddings-response": "tg.flow.embeddings-response",
+            // Librarian RPC (for PDF decoder)
+            "librarian-request": "tg.flow.librarian-request",
+            "librarian-response": "tg.flow.librarian-response",
          },
        },
      },
@ -373,6 +394,69 @@ async function testLibrarianDelete(): Promise<boolean> {
  }
 }

+// ─── Document Load Test ──────────────────────────────────────────────
+
+/**
+ * Smoke-tests the document load endpoint.
+ *
+ * Uploads a small test document through the librarian API, asks the gateway
+ * to start processing it on the "default" flow, and reports a pass when the
+ * gateway replies with `status: "processing"`.
+ *
+ * The uploaded document is removed in a `finally` step, so it is cleaned up
+ * on every exit path once its id is known, not only after a successful
+ * trigger. Cleanup is best-effort: a failure there is logged and does not
+ * change the result that was already reported.
+ *
+ * NOTE(review): the upload is labelled "application/pdf" but its content is
+ * plain text, and it is removed right after the trigger. This test therefore
+ * only checks that the load request is accepted, not that downstream
+ * processing succeeds.
+ *
+ * @returns `true` when the load was accepted, `false` on any failure.
+ */
+async function testDocumentLoad(): Promise<boolean> {
+  // Set once the upload has produced an id; drives cleanup in `finally`.
+  let docId: string | undefined;
+
+  try {
+    // First upload a test document via librarian
+    const content = Buffer.from("Test document for pipeline processing.").toString("base64");
+    const addRes = await post("/api/v1/librarian", {
+      operation: "add-document",
+      user: "test-user",
+      collection: "test-collection",
+      content,
+      documentMetadata: {
+        id: "",
+        time: Date.now(),
+        kind: "application/pdf",
+        title: "Test Pipeline Document",
+        comments: "",
+        user: "test-user",
+        tags: ["test"],
+        documentType: "source",
+      },
+    }) as Record<string, unknown>;
+
+    const meta = addRes.documentMetadata as Record<string, unknown> | undefined;
+    const uploadedId = meta?.id;
+    // Require a non-empty string id; anything else means the upload failed.
+    if (typeof uploadedId !== "string" || uploadedId === "") {
+      fail("Document load", "failed to upload test document");
+      return false;
+    }
+    docId = uploadedId;
+
+    // Trigger document processing via the load endpoint
+    const res = await fetch(`${GATEWAY_URL}/api/v1/flow/default/load`, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({
+        documentId: uploadedId,
+        user: "test-user",
+        collection: "test-collection",
+      }),
+    });
+    const data = await res.json() as Record<string, unknown>;
+    log("document-load", data);
+
+    if (data.status === "processing") {
+      pass(`Document load triggered for ${uploadedId.slice(0, 8)}...`);
+      return true;
+    }
+    fail("Document load", "unexpected response");
+    return false;
+  } catch (err) {
+    fail("Document load", err);
+    return false;
+  } finally {
+    // Clean up the test document on every exit path once it exists.
+    if (docId !== undefined) {
+      try {
+        await post("/api/v1/librarian", {
+          operation: "remove-document",
+          documentId: docId,
+          user: "test-user",
+        });
+      } catch (cleanupErr) {
+        // Best-effort: a cleanup failure must not overturn the recorded result.
+        console.error("Document load: failed to remove test document:", cleanupErr);
+      }
+    }
+  }
+}
+
 // ─── Agent Test ───────────────────────────────────────────────────────

 async function testAgentQuery(): Promise<boolean> {
@ -444,6 +528,14 @@ async function main(): Promise<void> {
  // Flow config push
  await run("Push Flow Config", testPushFlowConfig);

+  // Document pipeline load test (requires librarian + gateway)
+  if (process.env.SKIP_PIPELINE !== "1" && process.env.SKIP_LIBRARIAN !== "1") {
+    console.log("\n  (Testing document load — set SKIP_PIPELINE=1 to skip)");
+    await run("Document Load", testDocumentLoad);
+  } else {
+    console.log("\n  (Skipping document pipeline load test)");
+  }
+
  // LLM test (only if a running LLM service is available)
  if (process.env.SKIP_LLM !== "1") {
    console.log("\n  (Testing text-completion — set SKIP_LLM=1 to skip)");