Add support for v1/responses API (#622)

* making first commit. still need to work on streaming responses

* making first commit. still need to work on streaming responses

* stream buffer implementation with tests

* adding grok API keys to workflow

* fixed changes based on code review

* adding support for bedrock models

* fixed issues with translation to claude code

---------

Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-342.local>
This commit is contained in:
Salman Paracha 2025-12-03 14:58:26 -08:00 committed by GitHub
parent b01a81927d
commit a448c6e9cb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
38 changed files with 7015 additions and 2955 deletions

View file

@ -22,11 +22,13 @@ use common::ratelimit::Header;
use common::stats::{IncrementingMetric, RecordingMetric};
use common::tracing::{Event, Span, TraceData, Traceparent};
use common::{ratelimit, routing, tokenizer};
use hermesllm::apis::amazon_bedrock_binary_frame::BedrockBinaryFrameDecoder;
use hermesllm::apis::anthropic::{MessagesContentBlock, MessagesStreamEvent};
use hermesllm::apis::sse::{SseEvent, SseStreamIter};
use hermesllm::clients::endpoints::SupportedAPIs;
use hermesllm::apis::streaming_shapes::amazon_bedrock_binary_frame::BedrockBinaryFrameDecoder;
use hermesllm::apis::streaming_shapes::sse::{
SseEvent, SseStreamBuffer, SseStreamBufferTrait, SseStreamIter,
};
use hermesllm::clients::endpoints::SupportedAPIsFromClient;
use hermesllm::providers::response::ProviderResponse;
use hermesllm::providers::streaming_response::ProviderStreamResponse;
use hermesllm::{
DecodedFrame, ProviderId, ProviderRequest, ProviderRequestType, ProviderResponseType,
ProviderStreamResponseType,
@ -38,7 +40,7 @@ pub struct StreamContext {
streaming_response: bool,
response_tokens: usize,
/// The API that is requested by the client (before compatibility mapping)
client_api: Option<SupportedAPIs>,
client_api: Option<SupportedAPIsFromClient>,
/// The API that should be used for the upstream provider (after compatibility mapping)
resolved_api: Option<SupportedUpstreamAPIs>,
llm_providers: Rc<LlmProviders>,
@ -56,6 +58,7 @@ pub struct StreamContext {
binary_frame_decoder: Option<BedrockBinaryFrameDecoder<bytes::BytesMut>>,
http_method: Option<String>,
http_protocol: Option<String>,
sse_buffer: Option<SseStreamBuffer>,
}
impl StreamContext {
@ -87,6 +90,7 @@ impl StreamContext {
binary_frame_decoder: None,
http_method: None,
http_protocol: None,
sse_buffer: None,
}
}
@ -172,7 +176,8 @@ impl StreamContext {
Some(
SupportedUpstreamAPIs::OpenAIChatCompletions(_)
| SupportedUpstreamAPIs::AmazonBedrockConverse(_)
| SupportedUpstreamAPIs::AmazonBedrockConverseStream(_),
| SupportedUpstreamAPIs::AmazonBedrockConverseStream(_)
| SupportedUpstreamAPIs::OpenAIResponsesAPI(_),
)
| None => {
// OpenAI and default: use Authorization Bearer token
@ -476,7 +481,17 @@ impl StreamContext {
}
};
let mut response_buffer = Vec::new();
// Initialize SSE buffer if not present
if self.sse_buffer.is_none() {
self.sse_buffer = match SseStreamBuffer::try_from((&client_api, &upstream_api))
{
Ok(buffer) => Some(buffer),
Err(e) => {
warn!("Failed to create SSE buffer: {}", e);
return Err(Action::Continue);
}
};
}
// Process each SSE event
for sse_event in sse_iter {
@ -527,12 +542,32 @@ impl StreamContext {
}
}
// Add transformed event to response buffer
let bytes: Vec<u8> = transformed_event.into();
response_buffer.extend_from_slice(&bytes);
// Add transformed event to buffer (buffer may inject lifecycle events)
if let Some(buffer) = self.sse_buffer.as_mut() {
buffer.add_transformed_event(transformed_event);
}
}
Ok(response_buffer)
// Get accumulated bytes from buffer and return
match self.sse_buffer.as_mut() {
Some(buffer) => {
let bytes = buffer.into_bytes();
if !bytes.is_empty() {
let content = String::from_utf8_lossy(&bytes);
debug!(
"[ARCHGW_REQ_ID:{}] UPSTREAM_TRANSFORMED_CLIENT_RESPONSE: size={} content={}",
self.request_identifier(),
bytes.len(),
content
);
}
Ok(bytes)
}
None => {
warn!("SSE buffer unexpectedly missing after initialization");
Err(Action::Continue)
}
}
}
None => {
warn!("Missing client_api for non-streaming response");
@ -544,7 +579,7 @@ impl StreamContext {
fn handle_bedrock_binary_stream(
&mut self,
body: &[u8],
client_api: &SupportedAPIs,
client_api: &SupportedAPIsFromClient,
upstream_api: &SupportedUpstreamAPIs,
) -> Result<Vec<u8>, Action> {
// Initialize decoder if not present
@ -552,83 +587,57 @@ impl StreamContext {
self.binary_frame_decoder = Some(BedrockBinaryFrameDecoder::from_bytes(&[]));
}
// Add incoming bytes to buffer
// Initialize SSE buffer if not present
if self.sse_buffer.is_none() {
self.sse_buffer = match SseStreamBuffer::try_from((client_api, upstream_api)) {
Ok(buffer) => Some(buffer),
Err(e) => {
warn!(
"[ARCHGW_REQ_ID:{}] BEDROCK_BUFFER_INIT_ERROR: {}",
self.request_identifier(),
e
);
return Err(Action::Continue);
}
};
}
// Add incoming bytes to decoder buffer
let decoder = self.binary_frame_decoder.as_mut().unwrap();
decoder.buffer_mut().extend_from_slice(body);
let mut response_buffer = Vec::new();
// Process all complete frames
loop {
let decoded_frame = self.binary_frame_decoder.as_mut().unwrap().decode_frame();
match decoded_frame {
Some(DecodedFrame::Complete(ref frame_ref)) => {
let frame = DecodedFrame::Complete(frame_ref.clone());
// Convert frame to provider response type
match ProviderStreamResponseType::try_from((&frame, client_api, upstream_api)) {
Ok(provider_response) => {
self.record_ttft_if_needed();
// Handle ContentBlockStart and ContentBlockDelta events
match &provider_response {
ProviderStreamResponseType::MessagesStreamEvent(evt) => {
match evt {
MessagesStreamEvent::ContentBlockStart {
index, ..
} => {
// Mark that we've seen ContentBlockStart for this index
self.binary_frame_decoder
.as_mut()
.unwrap()
.set_content_block_start_sent(*index as i32);
debug!(
"[ARCHGW_REQ_ID:{}] BEDROCK_CONTENT_BLOCK_START_TRACKED: index={}",
self.request_identifier(),
*index
);
}
MessagesStreamEvent::ContentBlockDelta {
index, ..
} => {
// Check if ContentBlockStart was sent for this index
let needs_start = !self
.binary_frame_decoder
.as_ref()
.unwrap()
.has_content_block_start_been_sent(*index as i32);
if needs_start {
// Emit empty ContentBlockStart before delta
let content_block_start =
MessagesStreamEvent::ContentBlockStart {
index: *index,
content_block: MessagesContentBlock::Text {
text: String::new(),
cache_control: None,
},
};
let start_sse: String = content_block_start.into();
response_buffer
.extend_from_slice(start_sse.as_bytes());
// Mark that we've now sent it
self.binary_frame_decoder
.as_mut()
.unwrap()
.set_content_block_start_sent(*index as i32);
debug!(
"[ARCHGW_REQ_ID:{}] BEDROCK_INJECTED_CONTENT_BLOCK_START: index={}",
self.request_identifier(),
*index
);
}
}
_ => {}
}
}
_ => {}
// Track token usage
if let Some(content) = provider_response.content_delta() {
let estimated_tokens = content.len() / 4;
self.response_tokens += estimated_tokens.max(1);
debug!(
"[ARCHGW_REQ_ID:{}] BEDROCK_TOKEN_UPDATE: delta_chars={} estimated_tokens={} total_tokens={}",
self.request_identifier(),
content.len(),
estimated_tokens.max(1),
self.response_tokens
);
}
let sse_string: String = provider_response.into();
response_buffer.extend_from_slice(sse_string.as_bytes());
// Create SseEvent from provider response
let event = SseEvent::from_provider_response(provider_response);
// Add to buffer (buffer handles all shim logic including ContentBlockStart injection)
if let Some(buffer) = self.sse_buffer.as_mut() {
buffer.add_transformed_event(event);
}
}
Err(e) => {
warn!(
@ -658,8 +667,29 @@ impl StreamContext {
}
}
// Return accumulated complete frames (may be empty if all frames incomplete)
Ok(response_buffer)
// Get accumulated bytes from buffer and return
match self.sse_buffer.as_mut() {
Some(buffer) => {
let bytes = buffer.into_bytes();
if !bytes.is_empty() {
let content = String::from_utf8_lossy(&bytes);
debug!(
"[ARCHGW_REQ_ID:{}] UPSTREAM_TRANSFORMED_CLIENT_RESPONSE: size={} content={}",
self.request_identifier(),
bytes.len(),
content
);
}
Ok(bytes)
}
None => {
warn!(
"[ARCHGW_REQ_ID:{}] BEDROCK_BUFFER_MISSING",
self.request_identifier()
);
Err(Action::Continue)
}
}
}
fn handle_non_streaming_response(
@ -782,13 +812,14 @@ impl HttpContext for StreamContext {
self.select_llm_provider();
// Check if this is a supported API endpoint
if SupportedAPIs::from_endpoint(&request_path).is_none() {
if SupportedAPIsFromClient::from_endpoint(&request_path).is_none() {
self.send_http_response(404, vec![], Some(b"Unsupported endpoint"));
return Action::Continue;
}
// Get the SupportedApi for routing decisions
let supported_api: Option<SupportedAPIs> = SupportedAPIs::from_endpoint(&request_path);
let supported_api: Option<SupportedAPIsFromClient> =
SupportedAPIsFromClient::from_endpoint(&request_path);
self.client_api = supported_api;
// Debug: log provider, client API, resolved API, and request path
@ -1131,8 +1162,9 @@ impl HttpContext for StreamContext {
}
match self.client_api {
Some(SupportedAPIs::OpenAIChatCompletions(_)) => {}
Some(SupportedAPIs::AnthropicMessagesAPI(_)) => {}
Some(SupportedAPIsFromClient::OpenAIChatCompletions(_)) => {}
Some(SupportedAPIsFromClient::AnthropicMessagesAPI(_)) => {}
Some(SupportedAPIsFromClient::OpenAIResponsesAPI(_)) => {}
_ => {
let api_info = match &self.client_api {
Some(api) => format!("{}", api),