bidirectional streaming for output filter chains

Replace per-chunk HTTP requests to output filters with a single
bidirectional streaming connection per filter. This eliminates
the 50-200+ round-trips per streaming LLM response.

Filters opt in by setting `streaming: true` in their config entry. When all output filters
support streaming, brightstaff opens one POST per filter with a streaming
request body (Body::wrap_stream) and reads the streaming response. Filters
that don't opt in fall back to the existing per-chunk behavior.

Updates the PII deanonymizer demo as the reference implementation with
request.stream() + StreamingResponse support.

Made-with: Cursor
This commit is contained in:
Adil Hafeez 2026-03-19 02:27:26 -07:00
parent 1f23c573bf
commit 42d3de8906
10 changed files with 613 additions and 133 deletions

View file

@ -7,6 +7,7 @@ filters:
- id: pii_deanonymizer
url: http://localhost:10501/deanonymize
type: http
streaming: true
model_providers:
- model: openai/gpt-4o-mini

View file

@ -21,10 +21,16 @@ import logging
from typing import Any, Dict
from fastapi import FastAPI, Request
from fastapi.responses import Response
from fastapi.responses import Response, StreamingResponse
from pii import anonymize_text, anonymize_message_content
from store import get_mapping, store_mapping, deanonymize_sse, deanonymize_json
from store import (
get_mapping,
store_mapping,
deanonymize_sse,
deanonymize_sse_stream,
deanonymize_json,
)
logging.basicConfig(
level=logging.INFO,
@ -105,11 +111,36 @@ async def deanonymize(path: str, request: Request) -> Response:
/deanonymize/v1/chat/completions OpenAI chat completions
/deanonymize/v1/messages Anthropic messages
/deanonymize/v1/responses OpenAI responses API
Supports two modes:
- Bidirectional streaming: request body is streamed (Content-Type: application/octet-stream).
Reads via request.stream(), processes SSE events incrementally, returns StreamingResponse.
- Per-chunk / full body: reads entire body, processes, returns complete Response.
"""
endpoint = f"/{path}"
is_anthropic = endpoint == "/v1/messages"
request_id = request.headers.get("x-request-id", "unknown")
mapping = get_mapping(request_id)
content_type = request.headers.get("content-type", "")
is_streaming = "application/octet-stream" in content_type
if is_streaming:
if not mapping:
logger.info("request_id=%s streaming, no mapping — passthrough", request_id)
async def passthrough():
async for chunk in request.stream():
yield chunk
return StreamingResponse(passthrough(), media_type="text/event-stream")
logger.info("request_id=%s streaming deanonymize", request_id)
return StreamingResponse(
deanonymize_sse_stream(request_id, request.stream(), mapping, is_anthropic),
media_type="text/event-stream",
)
raw_body = await request.body()
if not mapping:

View file

@ -4,7 +4,7 @@ import json
import logging
import threading
import time
from typing import Dict, Optional, Tuple
from typing import AsyncIterator, Dict, Optional, Tuple
from fastapi.responses import Response
@ -59,36 +59,71 @@ def restore_streaming(request_id: str, content: str, mapping: Dict[str, str]) ->
def deanonymize_sse(
    request_id: str, body_str: str, mapping: Dict[str, str], is_anthropic: bool
) -> Response:
    """Restore PII in a fully-buffered SSE body and return it as a Response.

    The body is split on newlines and every line is passed through
    ``_process_sse_line`` exactly once; the processed lines are then joined
    back with newlines.

    Args:
        request_id: Identifier forwarded to the per-line restore step.
        body_str: The complete SSE payload as text.
        mapping: Mapping forwarded to the per-line restore step.
        is_anthropic: Selects the Anthropic event shape instead of the
            OpenAI ``choices``/``delta`` shape when processing each line.

    Returns:
        A ``Response`` whose content is the processed body, with media type
        ``text/plain``.
    """
    # Single pass only: a second, inline pass over the same lines would call
    # restore_streaming twice per line for the same request_id and have its
    # result thrown away.
    result_lines = [
        _process_sse_line(request_id, line, mapping, is_anthropic)
        for line in body_str.split("\n")
    ]
    return Response(content="\n".join(result_lines), media_type="text/plain")
def _process_sse_line(
    request_id: str, line: str, mapping: Dict[str, str], is_anthropic: bool
) -> str:
    """Process a single SSE line, restoring PII in data payloads.

    Lines that are not ``data: `` lines, the ``data: [DONE]`` sentinel,
    payloads that are not valid JSON, and payloads that are valid JSON but
    not a JSON object are returned unchanged. Otherwise the text fields of
    the event are passed through ``restore_streaming`` and the event is
    re-serialized.

    Args:
        request_id: Identifier forwarded to ``restore_streaming``.
        line: One line of an SSE stream (without its line terminator).
        mapping: Mapping forwarded to ``restore_streaming``.
        is_anthropic: When true, look for
            ``{"type": "content_block_delta", "delta": {"type": "text_delta",
            "text": ...}}``; otherwise look for
            ``{"choices": [{"delta": {"content": ...}}]}``.

    Returns:
        The original line, or ``"data: "`` followed by the re-serialized
        JSON event when the payload was parsed as an object.
    """
    stripped = line.strip()
    if not stripped.startswith("data: "):
        return line
    payload = stripped[6:]
    if payload == "[DONE]":
        return line

    # Keep the try body to the one call that can raise JSONDecodeError.
    try:
        chunk = json.loads(payload)
    except json.JSONDecodeError:
        return line

    # json.loads also accepts bare numbers, strings, null and arrays; those
    # have no delta to restore and must not crash the filter.
    if not isinstance(chunk, dict):
        return line

    if is_anthropic:
        if chunk.get("type") == "content_block_delta":
            delta = chunk.get("delta")
            # "delta" may be present but null, so check the shape explicitly.
            if (
                isinstance(delta, dict)
                and delta.get("type") == "text_delta"
                and delta.get("text")
            ):
                delta["text"] = restore_streaming(
                    request_id, delta["text"], mapping
                )
    else:
        choices = chunk.get("choices")
        if isinstance(choices, list):
            for choice in choices:
                delta = choice.get("delta") if isinstance(choice, dict) else None
                if isinstance(delta, dict) and delta.get("content"):
                    delta["content"] = restore_streaming(
                        request_id, delta["content"], mapping
                    )
    return "data: " + json.dumps(chunk)
async def deanonymize_sse_stream(
    request_id: str,
    byte_stream: AsyncIterator[bytes],
    mapping: Dict[str, str],
    is_anthropic: bool,
) -> AsyncIterator[str]:
    """De-anonymize an SSE byte stream incrementally.

    Async generator that reads raw bytes from ``byte_stream``, buffers them
    as text, and yields each SSE event as soon as it is complete. An event is
    complete when a blank line (``"\\n\\n"``) follows it. Every line of an
    event is passed through ``_process_sse_line``.

    Args:
        request_id: Identifier forwarded to ``_process_sse_line``.
        byte_stream: Async iterator of raw body chunks; chunk boundaries may
            fall anywhere, including inside a multi-byte character.
        mapping: Mapping forwarded to ``_process_sse_line``.
        is_anthropic: Event-shape selector forwarded to ``_process_sse_line``.

    Yields:
        Each processed event followed by ``"\\n\\n"``. Any non-blank data left
        after the stream ends is processed and yielded without a trailing
        delimiter.
    """
    # Local import so this block does not depend on the file's import header.
    import codecs

    # An incremental decoder holds back the bytes of a character split across
    # two chunks; decoding each chunk on its own would turn both halves into
    # U+FFFD replacement characters.
    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
    buffer = ""

    async for raw_chunk in byte_stream:
        buffer += decoder.decode(raw_chunk)

        # Yield each complete SSE event (delimited by double newline).
        while "\n\n" in buffer:
            event, buffer = buffer.split("\n\n", 1)
            processed_lines = [
                _process_sse_line(request_id, line, mapping, is_anthropic)
                for line in event.split("\n")
            ]
            yield "\n".join(processed_lines) + "\n\n"

    # End of input: let the decoder emit whatever it was still holding (an
    # incomplete trailing character becomes a replacement character).
    buffer += decoder.decode(b"", final=True)

    # Flush any trailing data that was not terminated by a blank line.
    if buffer.strip():
        processed_lines = [
            _process_sse_line(request_id, line, mapping, is_anthropic)
            for line in buffer.split("\n")
        ]
        yield "\n".join(processed_lines)
def deanonymize_json(
request_id: str,
raw_body: bytes,