bidirectional streaming for output filter chains

Replace per-chunk HTTP requests to output filters with a single bidirectional streaming connection per filter. This eliminates the 50-200+ HTTP round-trips that a single streaming LLM response previously required. Filters opt in by setting `streaming: true` in their config entry. When all output filters support streaming, brightstaff opens one POST per filter with a streaming request body (`Body::wrap_stream`) and reads the streaming response. Filters that don't opt in fall back to the existing per-chunk behavior. Also updates the PII deanonymizer demo to serve as the reference implementation, adding `request.stream()` + `StreamingResponse` support. Made-with: Cursor
2026-05-18 13:45:15 +02:00 · 2026-03-19 02:27:26 -07:00 · 2026-03-19 02:27:26 -07:00 · 42d3de8906
commit 42d3de8906
parent 1f23c573bf
10 changed files with 613 additions and 133 deletions
--- a/demos/filter_chains/pii_anonymizer/config.yaml
+++ b/demos/filter_chains/pii_anonymizer/config.yaml
@@ -7,6 +7,7 @@ filters:
  - id: pii_deanonymizer
    url: http://localhost:10501/deanonymize
    type: http
+    streaming: true

 model_providers:
  - model: openai/gpt-4o-mini
--- a/demos/filter_chains/pii_anonymizer/pii_anonymizer.py
+++ b/demos/filter_chains/pii_anonymizer/pii_anonymizer.py
@@ -21,10 +21,16 @@ import logging
 from typing import Any, Dict

 from fastapi import FastAPI, Request
-from fastapi.responses import Response
+from fastapi.responses import Response, StreamingResponse

 from pii import anonymize_text, anonymize_message_content
-from store import get_mapping, store_mapping, deanonymize_sse, deanonymize_json
+from store import (
+    get_mapping,
+    store_mapping,
+    deanonymize_sse,
+    deanonymize_sse_stream,
+    deanonymize_json,
+)

 logging.basicConfig(
    level=logging.INFO,
@@ -105,11 +111,36 @@ async def deanonymize(path: str, request: Request) -> Response:
      /deanonymize/v1/chat/completions  — OpenAI chat completions
      /deanonymize/v1/messages          — Anthropic messages
      /deanonymize/v1/responses         — OpenAI responses API
+
+    Supports two modes:
+      - Bidirectional streaming: request body is streamed (Content-Type: application/octet-stream).
+        Reads via request.stream(), processes SSE events incrementally, returns StreamingResponse.
+      - Per-chunk / full body: reads entire body, processes, returns complete Response.
    """
    endpoint = f"/{path}"
    is_anthropic = endpoint == "/v1/messages"
    request_id = request.headers.get("x-request-id", "unknown")
    mapping = get_mapping(request_id)
+
+    content_type = request.headers.get("content-type", "")
+    is_streaming = "application/octet-stream" in content_type
+
+    if is_streaming:
+        if not mapping:
+            logger.info("request_id=%s streaming, no mapping — passthrough", request_id)
+
+            async def passthrough():
+                async for chunk in request.stream():
+                    yield chunk
+
+            return StreamingResponse(passthrough(), media_type="text/event-stream")
+
+        logger.info("request_id=%s streaming deanonymize", request_id)
+        return StreamingResponse(
+            deanonymize_sse_stream(request_id, request.stream(), mapping, is_anthropic),
+            media_type="text/event-stream",
+        )
+
    raw_body = await request.body()

    if not mapping:
--- a/demos/filter_chains/pii_anonymizer/store.py
+++ b/demos/filter_chains/pii_anonymizer/store.py
@@ -4,7 +4,7 @@ import json
 import logging
 import threading
 import time
-from typing import Dict, Optional, Tuple
+from typing import AsyncIterator, Dict, Optional, Tuple

 from fastapi.responses import Response

@@ -59,36 +59,71 @@ def restore_streaming(request_id: str, content: str, mapping: Dict[str, str]) ->
 def deanonymize_sse(
    request_id: str, body_str: str, mapping: Dict[str, str], is_anthropic: bool
 ) -> Response:
-    result_lines = []
-    for line in body_str.split("\n"):
-        stripped = line.strip()
-        if not (stripped.startswith("data: ") and stripped[6:] != "[DONE]"):
-            result_lines.append(line)
-            continue
-        try:
-            chunk = json.loads(stripped[6:])
-            if is_anthropic:
-                # {"type": "content_block_delta", "delta": {"type": "text_delta", "text": "..."}}
-                if chunk.get("type") == "content_block_delta":
-                    delta = chunk.get("delta", {})
-                    if delta.get("type") == "text_delta" and delta.get("text"):
-                        delta["text"] = restore_streaming(
-                            request_id, delta["text"], mapping
-                        )
-            else:
-                # {"choices": [{"delta": {"content": "..."}}]}
-                for choice in chunk.get("choices", []):
-                    delta = choice.get("delta", {})
-                    if delta.get("content"):
-                        delta["content"] = restore_streaming(
-                            request_id, delta["content"], mapping
-                        )
-            result_lines.append("data: " + json.dumps(chunk))
-        except json.JSONDecodeError:
-            result_lines.append(line)
+    result_lines = [
+        _process_sse_line(request_id, line, mapping, is_anthropic)
+        for line in body_str.split("\n")
+    ]
    return Response(content="\n".join(result_lines), media_type="text/plain")


+def _process_sse_line(
+    request_id: str, line: str, mapping: Dict[str, str], is_anthropic: bool
+) -> str:
+    """Process a single SSE line, restoring PII in data payloads."""
+    stripped = line.strip()
+    if not (stripped.startswith("data: ") and stripped[6:] != "[DONE]"):
+        return line
+    try:
+        chunk = json.loads(stripped[6:])
+        if is_anthropic:
+            if chunk.get("type") == "content_block_delta":
+                delta = chunk.get("delta", {})
+                if delta.get("type") == "text_delta" and delta.get("text"):
+                    delta["text"] = restore_streaming(
+                        request_id, delta["text"], mapping
+                    )
+        else:
+            for choice in chunk.get("choices", []):
+                delta = choice.get("delta", {})
+                if delta.get("content"):
+                    delta["content"] = restore_streaming(
+                        request_id, delta["content"], mapping
+                    )
+        return "data: " + json.dumps(chunk)
+    except json.JSONDecodeError:
+        return line
+
+
+async def deanonymize_sse_stream(
+    request_id: str,
+    byte_stream: AsyncIterator[bytes],
+    mapping: Dict[str, str],
+    is_anthropic: bool,
+):
+    """Async generator that reads SSE events from a streaming request body,
+    de-anonymizes them, and yields processed events as they become complete.
+    Buffers partial data and splits on SSE event boundaries (blank lines).
+    """
+    buffer = ""
+    async for raw_chunk in byte_stream:
+        buffer += raw_chunk.decode("utf-8", errors="replace")
+        # Yield each complete SSE event (delimited by double newline)
+        while "\n\n" in buffer:
+            event, buffer = buffer.split("\n\n", 1)
+            processed_lines = [
+                _process_sse_line(request_id, line, mapping, is_anthropic)
+                for line in event.split("\n")
+            ]
+            yield "\n".join(processed_lines) + "\n\n"
+    # Flush any trailing data
+    if buffer.strip():
+        processed_lines = [
+            _process_sse_line(request_id, line, mapping, is_anthropic)
+            for line in buffer.split("\n")
+        ]
+        yield "\n".join(processed_lines)
+
+
 def deanonymize_json(
    request_id: str,
    raw_body: bytes,