diff --git a/crates/prompt_gateway/src/stream_context.rs b/crates/prompt_gateway/src/stream_context.rs index e02f96c5..6851b3c0 100644 --- a/crates/prompt_gateway/src/stream_context.rs +++ b/crates/prompt_gateway/src/stream_context.rs @@ -124,7 +124,8 @@ impl StreamContext { let arch_fc_response: ChatCompletionsResponse = match serde_json::from_str(&body_str) { Ok(arch_fc_response) => arch_fc_response, Err(e) => { - warn!("error deserializing archfc response: {}", e); + warn!("error deserializing archfc response: {}, body: {}", e, body_str + ); return self.send_server_error(ServerError::Deserialization(e), None); } }; diff --git a/model_server/pyproject.toml b/model_server/pyproject.toml index d4a35682..9fa447f0 100644 --- a/model_server/pyproject.toml +++ b/model_server/pyproject.toml @@ -37,7 +37,7 @@ opentelemetry-instrumentation-fastapi = "^0.49b0" overrides = "^7.7.0" [tool.poetry.scripts] -archgw_modelserver = "src.cli:start_server" +archgw_modelserver = "src.cli:run_server" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/model_server/src/core/model_utils.py b/model_server/src/core/model_utils.py index c411d835..3f6b36b0 100644 --- a/model_server/src/core/model_utils.py +++ b/model_server/src/core/model_utils.py @@ -30,6 +30,7 @@ class ChatCompletionResponse(BaseModel): created: Optional[str] = "" choices: List[Choice] model: str + metadata: Optional[Dict[str, str]] = {} class GuardRequest(BaseModel): diff --git a/model_server/src/main.py b/model_server/src/main.py index ef15ff78..c1675412 100644 --- a/model_server/src/main.py +++ b/model_server/src/main.py @@ -67,11 +67,12 @@ async def function_calling(req: ChatMessage, res: Response): "Arch-Function" ].chat_completion(req) function_latency = time.perf_counter() - function_start_time - return { - "response": function_calling_response, - "intent_latency": round(intent_latency * 1000, 3), - "function_latency": round(function_latency * 1000, 3), + function_calling_response.metadata = { + "intent_latency": str(round(intent_latency * 1000, 3)), + "function_latency": str(round(function_latency * 1000, 3)), } + + return function_calling_response except Exception as e: # [TODO] Review: update how to collect debugging outputs # logger.error(f"Error in chat_completion from `Arch-Function`: {e}")