mirror of
https://github.com/katanemo/plano.git
synced 2026-04-25 00:36:34 +02:00
Add support for v1/responses API (#622)
* making first commit. still need to work on streaming responses * making first commit. still need to work on streaming responses * stream buffer implementation with tests * adding grok API keys to workflow * fixed changes based on code review * adding support for bedrock models * fixed issues with translation to claude code --------- Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-342.local>
This commit is contained in:
parent
b01a81927d
commit
a448c6e9cb
38 changed files with 7015 additions and 2955 deletions
|
|
@ -22,11 +22,13 @@ use common::ratelimit::Header;
|
|||
use common::stats::{IncrementingMetric, RecordingMetric};
|
||||
use common::tracing::{Event, Span, TraceData, Traceparent};
|
||||
use common::{ratelimit, routing, tokenizer};
|
||||
use hermesllm::apis::amazon_bedrock_binary_frame::BedrockBinaryFrameDecoder;
|
||||
use hermesllm::apis::anthropic::{MessagesContentBlock, MessagesStreamEvent};
|
||||
use hermesllm::apis::sse::{SseEvent, SseStreamIter};
|
||||
use hermesllm::clients::endpoints::SupportedAPIs;
|
||||
use hermesllm::apis::streaming_shapes::amazon_bedrock_binary_frame::BedrockBinaryFrameDecoder;
|
||||
use hermesllm::apis::streaming_shapes::sse::{
|
||||
SseEvent, SseStreamBuffer, SseStreamBufferTrait, SseStreamIter,
|
||||
};
|
||||
use hermesllm::clients::endpoints::SupportedAPIsFromClient;
|
||||
use hermesllm::providers::response::ProviderResponse;
|
||||
use hermesllm::providers::streaming_response::ProviderStreamResponse;
|
||||
use hermesllm::{
|
||||
DecodedFrame, ProviderId, ProviderRequest, ProviderRequestType, ProviderResponseType,
|
||||
ProviderStreamResponseType,
|
||||
|
|
@ -38,7 +40,7 @@ pub struct StreamContext {
|
|||
streaming_response: bool,
|
||||
response_tokens: usize,
|
||||
/// The API that is requested by the client (before compatibility mapping)
|
||||
client_api: Option<SupportedAPIs>,
|
||||
client_api: Option<SupportedAPIsFromClient>,
|
||||
/// The API that should be used for the upstream provider (after compatibility mapping)
|
||||
resolved_api: Option<SupportedUpstreamAPIs>,
|
||||
llm_providers: Rc<LlmProviders>,
|
||||
|
|
@ -56,6 +58,7 @@ pub struct StreamContext {
|
|||
binary_frame_decoder: Option<BedrockBinaryFrameDecoder<bytes::BytesMut>>,
|
||||
http_method: Option<String>,
|
||||
http_protocol: Option<String>,
|
||||
sse_buffer: Option<SseStreamBuffer>,
|
||||
}
|
||||
|
||||
impl StreamContext {
|
||||
|
|
@ -87,6 +90,7 @@ impl StreamContext {
|
|||
binary_frame_decoder: None,
|
||||
http_method: None,
|
||||
http_protocol: None,
|
||||
sse_buffer: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -172,7 +176,8 @@ impl StreamContext {
|
|||
Some(
|
||||
SupportedUpstreamAPIs::OpenAIChatCompletions(_)
|
||||
| SupportedUpstreamAPIs::AmazonBedrockConverse(_)
|
||||
| SupportedUpstreamAPIs::AmazonBedrockConverseStream(_),
|
||||
| SupportedUpstreamAPIs::AmazonBedrockConverseStream(_)
|
||||
| SupportedUpstreamAPIs::OpenAIResponsesAPI(_),
|
||||
)
|
||||
| None => {
|
||||
// OpenAI and default: use Authorization Bearer token
|
||||
|
|
@ -476,7 +481,17 @@ impl StreamContext {
|
|||
}
|
||||
};
|
||||
|
||||
let mut response_buffer = Vec::new();
|
||||
// Initialize SSE buffer if not present
|
||||
if self.sse_buffer.is_none() {
|
||||
self.sse_buffer = match SseStreamBuffer::try_from((&client_api, &upstream_api))
|
||||
{
|
||||
Ok(buffer) => Some(buffer),
|
||||
Err(e) => {
|
||||
warn!("Failed to create SSE buffer: {}", e);
|
||||
return Err(Action::Continue);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Process each SSE event
|
||||
for sse_event in sse_iter {
|
||||
|
|
@ -527,12 +542,32 @@ impl StreamContext {
|
|||
}
|
||||
}
|
||||
|
||||
// Add transformed event to response buffer
|
||||
let bytes: Vec<u8> = transformed_event.into();
|
||||
response_buffer.extend_from_slice(&bytes);
|
||||
// Add transformed event to buffer (buffer may inject lifecycle events)
|
||||
if let Some(buffer) = self.sse_buffer.as_mut() {
|
||||
buffer.add_transformed_event(transformed_event);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(response_buffer)
|
||||
// Get accumulated bytes from buffer and return
|
||||
match self.sse_buffer.as_mut() {
|
||||
Some(buffer) => {
|
||||
let bytes = buffer.into_bytes();
|
||||
if !bytes.is_empty() {
|
||||
let content = String::from_utf8_lossy(&bytes);
|
||||
debug!(
|
||||
"[ARCHGW_REQ_ID:{}] UPSTREAM_TRANSFORMED_CLIENT_RESPONSE: size={} content={}",
|
||||
self.request_identifier(),
|
||||
bytes.len(),
|
||||
content
|
||||
);
|
||||
}
|
||||
Ok(bytes)
|
||||
}
|
||||
None => {
|
||||
warn!("SSE buffer unexpectedly missing after initialization");
|
||||
Err(Action::Continue)
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
warn!("Missing client_api for non-streaming response");
|
||||
|
|
@ -544,7 +579,7 @@ impl StreamContext {
|
|||
fn handle_bedrock_binary_stream(
|
||||
&mut self,
|
||||
body: &[u8],
|
||||
client_api: &SupportedAPIs,
|
||||
client_api: &SupportedAPIsFromClient,
|
||||
upstream_api: &SupportedUpstreamAPIs,
|
||||
) -> Result<Vec<u8>, Action> {
|
||||
// Initialize decoder if not present
|
||||
|
|
@ -552,83 +587,57 @@ impl StreamContext {
|
|||
self.binary_frame_decoder = Some(BedrockBinaryFrameDecoder::from_bytes(&[]));
|
||||
}
|
||||
|
||||
// Add incoming bytes to buffer
|
||||
// Initialize SSE buffer if not present
|
||||
if self.sse_buffer.is_none() {
|
||||
self.sse_buffer = match SseStreamBuffer::try_from((client_api, upstream_api)) {
|
||||
Ok(buffer) => Some(buffer),
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"[ARCHGW_REQ_ID:{}] BEDROCK_BUFFER_INIT_ERROR: {}",
|
||||
self.request_identifier(),
|
||||
e
|
||||
);
|
||||
return Err(Action::Continue);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Add incoming bytes to decoder buffer
|
||||
let decoder = self.binary_frame_decoder.as_mut().unwrap();
|
||||
decoder.buffer_mut().extend_from_slice(body);
|
||||
|
||||
let mut response_buffer = Vec::new();
|
||||
// Process all complete frames
|
||||
loop {
|
||||
let decoded_frame = self.binary_frame_decoder.as_mut().unwrap().decode_frame();
|
||||
match decoded_frame {
|
||||
Some(DecodedFrame::Complete(ref frame_ref)) => {
|
||||
let frame = DecodedFrame::Complete(frame_ref.clone());
|
||||
|
||||
// Convert frame to provider response type
|
||||
match ProviderStreamResponseType::try_from((&frame, client_api, upstream_api)) {
|
||||
Ok(provider_response) => {
|
||||
self.record_ttft_if_needed();
|
||||
|
||||
// Handle ContentBlockStart and ContentBlockDelta events
|
||||
match &provider_response {
|
||||
ProviderStreamResponseType::MessagesStreamEvent(evt) => {
|
||||
match evt {
|
||||
MessagesStreamEvent::ContentBlockStart {
|
||||
index, ..
|
||||
} => {
|
||||
// Mark that we've seen ContentBlockStart for this index
|
||||
self.binary_frame_decoder
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.set_content_block_start_sent(*index as i32);
|
||||
debug!(
|
||||
"[ARCHGW_REQ_ID:{}] BEDROCK_CONTENT_BLOCK_START_TRACKED: index={}",
|
||||
self.request_identifier(),
|
||||
*index
|
||||
);
|
||||
}
|
||||
MessagesStreamEvent::ContentBlockDelta {
|
||||
index, ..
|
||||
} => {
|
||||
// Check if ContentBlockStart was sent for this index
|
||||
let needs_start = !self
|
||||
.binary_frame_decoder
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.has_content_block_start_been_sent(*index as i32);
|
||||
|
||||
if needs_start {
|
||||
// Emit empty ContentBlockStart before delta
|
||||
let content_block_start =
|
||||
MessagesStreamEvent::ContentBlockStart {
|
||||
index: *index,
|
||||
content_block: MessagesContentBlock::Text {
|
||||
text: String::new(),
|
||||
cache_control: None,
|
||||
},
|
||||
};
|
||||
let start_sse: String = content_block_start.into();
|
||||
response_buffer
|
||||
.extend_from_slice(start_sse.as_bytes());
|
||||
|
||||
// Mark that we've now sent it
|
||||
self.binary_frame_decoder
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.set_content_block_start_sent(*index as i32);
|
||||
|
||||
debug!(
|
||||
"[ARCHGW_REQ_ID:{}] BEDROCK_INJECTED_CONTENT_BLOCK_START: index={}",
|
||||
self.request_identifier(),
|
||||
*index
|
||||
);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
// Track token usage
|
||||
if let Some(content) = provider_response.content_delta() {
|
||||
let estimated_tokens = content.len() / 4;
|
||||
self.response_tokens += estimated_tokens.max(1);
|
||||
debug!(
|
||||
"[ARCHGW_REQ_ID:{}] BEDROCK_TOKEN_UPDATE: delta_chars={} estimated_tokens={} total_tokens={}",
|
||||
self.request_identifier(),
|
||||
content.len(),
|
||||
estimated_tokens.max(1),
|
||||
self.response_tokens
|
||||
);
|
||||
}
|
||||
|
||||
let sse_string: String = provider_response.into();
|
||||
response_buffer.extend_from_slice(sse_string.as_bytes());
|
||||
// Create SseEvent from provider response
|
||||
let event = SseEvent::from_provider_response(provider_response);
|
||||
|
||||
// Add to buffer (buffer handles all shim logic including ContentBlockStart injection)
|
||||
if let Some(buffer) = self.sse_buffer.as_mut() {
|
||||
buffer.add_transformed_event(event);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
|
|
@ -658,8 +667,29 @@ impl StreamContext {
|
|||
}
|
||||
}
|
||||
|
||||
// Return accumulated complete frames (may be empty if all frames incomplete)
|
||||
Ok(response_buffer)
|
||||
// Get accumulated bytes from buffer and return
|
||||
match self.sse_buffer.as_mut() {
|
||||
Some(buffer) => {
|
||||
let bytes = buffer.into_bytes();
|
||||
if !bytes.is_empty() {
|
||||
let content = String::from_utf8_lossy(&bytes);
|
||||
debug!(
|
||||
"[ARCHGW_REQ_ID:{}] UPSTREAM_TRANSFORMED_CLIENT_RESPONSE: size={} content={}",
|
||||
self.request_identifier(),
|
||||
bytes.len(),
|
||||
content
|
||||
);
|
||||
}
|
||||
Ok(bytes)
|
||||
}
|
||||
None => {
|
||||
warn!(
|
||||
"[ARCHGW_REQ_ID:{}] BEDROCK_BUFFER_MISSING",
|
||||
self.request_identifier()
|
||||
);
|
||||
Err(Action::Continue)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_non_streaming_response(
|
||||
|
|
@ -782,13 +812,14 @@ impl HttpContext for StreamContext {
|
|||
self.select_llm_provider();
|
||||
|
||||
// Check if this is a supported API endpoint
|
||||
if SupportedAPIs::from_endpoint(&request_path).is_none() {
|
||||
if SupportedAPIsFromClient::from_endpoint(&request_path).is_none() {
|
||||
self.send_http_response(404, vec![], Some(b"Unsupported endpoint"));
|
||||
return Action::Continue;
|
||||
}
|
||||
|
||||
// Get the SupportedApi for routing decisions
|
||||
let supported_api: Option<SupportedAPIs> = SupportedAPIs::from_endpoint(&request_path);
|
||||
let supported_api: Option<SupportedAPIsFromClient> =
|
||||
SupportedAPIsFromClient::from_endpoint(&request_path);
|
||||
self.client_api = supported_api;
|
||||
|
||||
// Debug: log provider, client API, resolved API, and request path
|
||||
|
|
@ -1131,8 +1162,9 @@ impl HttpContext for StreamContext {
|
|||
}
|
||||
|
||||
match self.client_api {
|
||||
Some(SupportedAPIs::OpenAIChatCompletions(_)) => {}
|
||||
Some(SupportedAPIs::AnthropicMessagesAPI(_)) => {}
|
||||
Some(SupportedAPIsFromClient::OpenAIChatCompletions(_)) => {}
|
||||
Some(SupportedAPIsFromClient::AnthropicMessagesAPI(_)) => {}
|
||||
Some(SupportedAPIsFromClient::OpenAIResponsesAPI(_)) => {}
|
||||
_ => {
|
||||
let api_info = match &self.client_api {
|
||||
Some(api) => format!("{}", api),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue