diff --git a/arch/arch_config_schema.yaml b/arch/arch_config_schema.yaml index feaaa90f..e5d8b88a 100644 --- a/arch/arch_config_schema.yaml +++ b/arch/arch_config_schema.yaml @@ -170,8 +170,6 @@ properties: type: object additionalProperties: type: string - pass_context: - type: boolean additionalProperties: false required: - name diff --git a/crates/common/src/configuration.rs b/crates/common/src/configuration.rs index 616536a4..a956e71c 100644 --- a/crates/common/src/configuration.rs +++ b/crates/common/src/configuration.rs @@ -219,7 +219,6 @@ pub struct EndpointDetails { #[serde(rename = "http_method")] pub method: Option, pub http_headers: Option>, - pub pass_context: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/crates/prompt_gateway/src/stream_context.rs b/crates/prompt_gateway/src/stream_context.rs index e0a319ef..5ad375af 100644 --- a/crates/prompt_gateway/src/stream_context.rs +++ b/crates/prompt_gateway/src/stream_context.rs @@ -379,7 +379,7 @@ impl StreamContext { let http_method = endpoint_details.method.clone().unwrap_or_default(); let prompt_target_params = prompt_target.parameters.clone().unwrap_or_default(); - let (path, body) = match compute_request_path_body( + let (path, api_call_body) = match compute_request_path_body( &endpoint_path, tool_params, &prompt_target_params, @@ -396,6 +396,8 @@ impl StreamContext { } }; + debug!("api call body {:?}", api_call_body); + let timeout_str = API_REQUEST_TIMEOUT_MS.to_string(); let http_method_str = http_method.to_string(); @@ -411,25 +413,6 @@ impl StreamContext { .into_iter() .collect(); - let api_call_body = match endpoint_details.pass_context.unwrap_or_default() { - true => { - let messages = self.construct_llm_messages(&callout_context); - - let chat_completion_request = ChatCompletionsRequest { - model: callout_context.request_body.model.clone(), - messages, - tools: None, - stream: callout_context.request_body.stream, - stream_options: callout_context.request_body.stream_options.clone(), - metadata: None, - }; - - let body_str = serde_json::to_string(&chat_completion_request).unwrap(); - Some(body_str) - } - false => body, - }; - if self.request_id.is_some() { headers.insert(REQUEST_ID_HEADER, self.request_id.as_ref().unwrap()); } @@ -444,7 +427,6 @@ impl StreamContext { headers.insert(key.as_str(), value.as_str()); } - debug!("api call body string: {}", api_call_body.as_ref().unwrap()); let call_args = CallArgs::new( ARCH_INTERNAL_CLUSTER_NAME, @@ -519,7 +501,7 @@ impl StreamContext { if !prompt_target .auto_llm_dispatch_on_response - .unwrap_or_default() + .unwrap_or(true) { let tool_call_response = self.tool_call_response.as_ref().unwrap().clone(); @@ -675,7 +657,7 @@ impl StreamContext { // check if the default target should be dispatched to the LLM provider if !prompt_target .auto_llm_dispatch_on_response - .unwrap_or_default() + .unwrap_or(true) { let default_target_response_str = if self.streaming_response { let chat_completion_response = diff --git a/demos/use_cases/orchestrating_agents/hurl_tests/simple.hurl b/demos/use_cases/orchestrating_agents/hurl_tests/simple.hurl new file mode 100644 index 00000000..4db2c67c --- /dev/null +++ b/demos/use_cases/orchestrating_agents/hurl_tests/simple.hurl @@ -0,0 +1,19 @@ +POST http://localhost:10000/v1/chat/completions +Content-Type: application/json + +{ + "messages": [ + { + "role": "user", + "content": "I want to sell red shoes" + } + ] +} +HTTP 200 +[Asserts] +header "content-type" == "application/json" +jsonpath "$.model" matches /^gpt-4o-mini/ +jsonpath "$.metadata.x-arch-state" != null +jsonpath "$.usage" != null +jsonpath "$.choices[0].message.content" != null +jsonpath "$.choices[0].message.role" == "assistant" diff --git a/demos/use_cases/orchestrating_agents/hurl_tests/simple_stream.hurl b/demos/use_cases/orchestrating_agents/hurl_tests/simple_stream.hurl new file mode 100644 index 00000000..f060fed0 --- /dev/null +++ b/demos/use_cases/orchestrating_agents/hurl_tests/simple_stream.hurl @@ -0,0 +1,16 @@ +POST http://localhost:10000/v1/chat/completions +Content-Type: application/json + +{ + "messages": [ + { + "role": "user", + "content": "I want to sell red shoes" + } + ], + "stream": true +} +HTTP 200 +[Asserts] +header "content-type" matches /text\/event-stream/ +body matches /^data: .*?sales_agent.*?\n/ diff --git a/demos/use_cases/orchestrating_agents/main.py b/demos/use_cases/orchestrating_agents/main.py index db178db0..b51e4ad3 100644 --- a/demos/use_cases/orchestrating_agents/main.py +++ b/demos/use_cases/orchestrating_agents/main.py @@ -54,6 +54,7 @@ class ChatCompletionsRequest(BaseModel): messages: List[Message] model: str metadata: Dict[str, Any] = None + stream: bool = False class Choice(BaseModel): @@ -115,48 +116,35 @@ agent_map = { @app.post("/v1/chat/completions") async def completion_api(req: ChatCompletionsRequest): + logger.info(f"request: {req}") if req.metadata is None: req.metadata = {} agent_name = req.metadata.get("Agent-Name", "unknown agent") logger.info(f"agent: {agent_name}") - def stream(): - agent_role = agent_map.get(agent_name)["role"] - agent_instructions = agent_map.get(agent_name)["instructions"] - system_prompt = "You are a " + agent_role + ". " + agent_instructions - messages = [{"role": "system", "content": system_prompt}] - for message in req.messages: - messages.append({"role": message.role, "content": message.content}) - completion = client.chat.completions.create( - model="--", - messages=messages, - stream=True, - ) - for line in completion: - if line.choices and len(line.choices) > 0 and line.choices[0].delta: - chunk_response_str = json.dumps(line.model_dump()) - yield "data: " + chunk_response_str + "\n\n" - yield "data: [DONE]" + "\n\n" + agent_role = agent_map.get(agent_name)["role"] + agent_instructions = agent_map.get(agent_name)["instructions"] + system_prompt = "You are a " + agent_role + ". " + agent_instructions + messages = [{"role": "system", "content": system_prompt}] + for message in req.messages: + messages.append({"role": message.role, "content": message.content}) + logger.info("messages: " + str(messages)) + completion = client.chat.completions.create( + model="--", + messages=messages, + stream=req.stream, + ) - # content = agent_map.get(agent_name) + if req.stream: - # for c in content: - # resp = ChatCompletionStreamResponse( - # model="--", - # choices=[ - # ChunkChoice( - # delta=Message( - # role="assistant", - # content=c, - # ) - # ) - # ], - # ) - # # random sleep between 10m and 50ms - # time.sleep(random.randint(10, 50) / 1000) + def stream(): + for line in completion: + if line.choices and len(line.choices) > 0 and line.choices[0].delta: + chunk_response_str = json.dumps(line.model_dump()) + yield "data: " + chunk_response_str + "\n\n" + yield "data: [DONE]" + "\n\n" - # yield "data: " + json.dumps(resp.model_dump()) + "\n\n" + return StreamingResponse(stream(), media_type="text/event-stream") - # yield "data: [DONE]" + "\n\n" - - return StreamingResponse(stream(), media_type="text/event-stream") + else: + return completion diff --git a/model_server/src/core/function_calling.py b/model_server/src/core/function_calling.py index 71108dcd..0e33cd90 100644 --- a/model_server/src/core/function_calling.py +++ b/model_server/src/core/function_calling.py @@ -547,7 +547,7 @@ class ArchFunctionHandler(ArchBaseHandler): messages=messages, model=self.model_name, stream=True, - extra_body={"temperature": 0.01, "logprobs": True}, + extra_body=self.generation_params, ) use_agent_orchestrator = req.metadata.get("use_agent_orchestrator", False)