Send latency numbers from model_server as response metadata

This commit is contained in:
Adil Hafeez 2024-12-10 16:01:17 -08:00
parent 14625e2a1d
commit 60dfb18018
4 changed files with 9 additions and 6 deletions

View file

@ -124,7 +124,8 @@ impl StreamContext {
let arch_fc_response: ChatCompletionsResponse = match serde_json::from_str(&body_str) {
Ok(arch_fc_response) => arch_fc_response,
Err(e) => {
warn!("error deserializing archfc response: {}", e);
warn!("error deserializing archfc response: {}, body: {}", e, body_str
);
return self.send_server_error(ServerError::Deserialization(e), None);
}
};

View file

@ -37,7 +37,7 @@ opentelemetry-instrumentation-fastapi = "^0.49b0"
overrides = "^7.7.0"
[tool.poetry.scripts]
archgw_modelserver = "src.cli:start_server"
archgw_modelserver = "src.cli:run_server"
[build-system]
requires = ["poetry-core>=1.0.0"]

View file

@ -30,6 +30,7 @@ class ChatCompletionResponse(BaseModel):
created: Optional[str] = ""
choices: List[Choice]
model: str
metadata: Optional[Dict[str, str]] = {}
class GuardRequest(BaseModel):

View file

@ -67,11 +67,12 @@ async def function_calling(req: ChatMessage, res: Response):
"Arch-Function"
].chat_completion(req)
function_latency = time.perf_counter() - function_start_time
return {
"response": function_calling_response,
"intent_latency": round(intent_latency * 1000, 3),
"function_latency": round(function_latency * 1000, 3),
function_calling_response.metadata = {
"intent_latency": str(round(intent_latency * 1000, 3)),
"function_latency": str(round(function_latency * 1000, 3)),
}
return function_calling_response
except Exception as e:
# [TODO] Review: update how to collect debugging outputs
# logger.error(f"Error in chat_completion from `Arch-Function`: {e}")