Collect debugging log

2026-06-23 15:38:07 +02:00 · 2024-12-09 14:15:10 -08:00 · 2024-12-09 14:15:10 -08:00 · 8871d3f751
commit 8871d3f751
parent f13947732c
4 changed files with 92 additions and 14 deletions
--- a/e2e_tests/api_model_server.rest
+++ b/e2e_tests/api_model_server.rest
@ -10,22 +10,66 @@ Content-Type: application/json
  "messages": [
    {
      "role": "user",
-      "content": "how is the weather in seattle for next 10 days"
+      "content": "what is the weather forcast for seattle in the next 10 days?"
    }
  ],
  "tools": [
    {
-        "id": "weather-112",
+      "type": "function",
-        "tool_type": "function",
+      "function": {
-        "function": {
+        "name": "weather_forecast",
-          "name": "weather_forecast",
+        "parameters": {
-          "arguments": {"city": "str", "days": "int"}
+          "type": "object",
          "properties": {
            "city": {
              "type": "str"
            },
            "days": {
              "type": "int"
            }
          },
          "required": ["city", "days"]
        }
      }
    }
  ]
 }
 ### talk to function calling endpoint
 POST {{model_server_endpoint}}/function_calling HTTP/1.1
 Content-Type: application/json
 {
  "messages": [
    {
      "role": "user",
      "content": "book a hotel for me"
    }
  ],
  "tools": [
    {
      "type": "function",
      "function": {
        "name": "weather_forecast",
        "parameters": {
          "type": "object",
          "properties": {
            "city": {
              "type": "str"
            },
            "days": {
              "type": "int"
            }
          },
          "required": ["city", "days"]
        }
      }
    }
  ]
 }
 ### talk to Arch-Intent directly for completion
 POST https://api.fc.archgw.com/v1/chat/completions HTTP/1.1
 Content-Type: application/json
--- a/model_server/src/core/function_calling.py
+++ b/model_server/src/core/function_calling.py
@ -174,7 +174,7 @@ class ArchFunctionConfig:
    ).strip()
    GENERATION_PARAMS = {
-        "temperature": 0.2,
+        "temperature": 0.6,
        "top_p": 1.0,
        "top_k": 50,
        "max_tokens": 512,
@ -482,7 +482,7 @@ class ArchFunctionHandler(ArchBaseHandler):
        for _ in self.hallu_handler:
            # check if the first token is <tool_call>
-            if len(self.hallu_handler.tokens) > 0 and has_tool_call == None:
+            if len(self.hallu_handler.tokens) > 0 and has_tool_call is None:
                if self.hallu_handler.tokens[0] == "<tool_call>":
                    has_tool_call = True
                else:
@ -490,29 +490,42 @@ class ArchFunctionHandler(ArchBaseHandler):
                    break
            # if the model is hallucinating, start parameter gathering
-            if self.hallu_handler.hallucination == True:
+            if self.hallu_handler.hallucination is True:
                # [TODO] - Review: remove the following code
                print(
                    f"Hallucination detected for the following response, start parameter gathering: \n{''.join(self.hallu_handler.tokens)}"
                )
                prefill_response = self._engage_parameter_gathering(messages)
                model_response = prefill_response.choices[0].message.content
                break
-        if has_tool_call and self.hallu_handler.hallucination == False:
+        if has_tool_call and self.hallu_handler.hallucination is False:
            # [TODO] - Review: remove the following code
            print("Tool call found, no hallucination detected!")
            model_response = "".join(self.hallu_handler.tokens)
        # start parameter gathering if the model is not generating tool calls
        if has_tool_call is False:
            # [TODO] - Review: remove the following code
            print("No tool call found, start parameter gathering")
            prefill_response = self._engage_parameter_gathering(messages)
            model_response = prefill_response.choices[0].message.content
        # Extract tool calls from model response
        extracted = self._extract_tool_calls(model_response)
        # [TODO] - Review: remove the following code
        print(f"[Extracted] - {extracted}")
-        if extracted["result"]:
+        if len(extracted["result"]) and extracted["status"]:
            # [TODO] Review: define the behavior in the case that tool call extraction fails
            # if not extracted["status"]:
            verified = self._verify_tool_calls(
                tools=req.tools, tool_calls=extracted["result"]
            )
            # [TODO] - Review: remove the following code
            print(f"[Verified] - {verified}")
            # [TODO] Review: In the case that tool calls are invalid, define the protocol to collect debugging output and the behavior to handle it appropriately
            if verified["status"]:
--- a/model_server/src/core/hallucination.py
+++ b/model_server/src/core/hallucination.py
@ -278,6 +278,9 @@ class HallucinationStateHandler:
                f"Hallucination: token '{self.tokens[-1]}' is uncertain."
            )
            # [TODO] - Review: remove the following code
            print(f"[Hallucination] - Hallucination detected: {self.error_message}")
    def _count_consecutive_token(self, token=MaskToken.PARAMETER_VALUE) -> int:
        """
        Counts the number of consecutive occurrences of a given token in the mask.
--- a/model_server/src/main.py
+++ b/model_server/src/main.py
@ -1,4 +1,5 @@
 import os
 import time
 from src.commons.globals import handler_map
 from src.core.model_utils import ChatMessage, GuardRequest
@ -54,22 +55,34 @@ async def models():
@app.post("/function_calling")
 async def function_calling(req: ChatMessage, res: Response):
    try:
        intent_start_time = time.perf_counter()
        intent_response = await handler_map["Arch-Intent"].chat_completion(req)
        intent_latency = time.perf_counter() - intent_start_time
        if handler_map["Arch-Intent"].detect_intent(intent_response):
            # [TODO] measure agreement between intent detection and function calling
            try:
                function_start_time = time.perf_counter()
                function_calling_response = await handler_map[
                    "Arch-Function"
                ].chat_completion(req)
-                return function_calling_response
+                function_latency = time.perf_counter() - function_start_time
                return {
                    "response": function_calling_response,
                    "intent_latency": round(intent_latency * 1000, 3),
                    "function_latency": round(function_latency * 1000, 3),
                }
            except Exception as e:
                # [TODO] Review: update how to collect debugging outputs
                # logger.error(f"Error in chat_completion from `Arch-Function`: {e}")
                res.status_code = 500
                return {"error": f"[Arch-Function] - {e}"}
        # [TODO] Review: define the behavior if `Arch-Intent` doesn't detect an intent
-        # else:
+        else:
            return {
                "result": "No intent matched",
                "intent_latency": round(intent_latency * 1000, 3),
            }
    except Exception as e:
        # [TODO] Review: update how to collect debugging outputs
@ -81,8 +94,13 @@ async def function_calling(req: ChatMessage, res: Response):
@app.post("/guardrails")
 async def guardrails(req: GuardRequest, res: Response, max_num_words=300):
    try:
        guard_start_time = time.perf_counter()
        guard_result = handler_map["Arch-Guard"].predict(req)
-        return guard_result
+        guard_latency = time.perf_counter() - guard_start_time
        return {
            "response": guard_result,
            "guard_latency": round(guard_latency * 1000, 3),
        }
    except Exception as e:
        # [TODO] Review: update how to collect debugging outputs
        res.status_code = 500