move custom tracer to llm filter (#267)

2026-07-23 16:51:04 +02:00 · 2024-11-15 10:44:01 -08:00 · 2024-11-15 10:44:01 -08:00 · d3c17c7abd
commit d3c17c7abd
parent 1d229cba8f
22 changed files with 335 additions and 133 deletions
--- a/demos/llm_routing/README.md
+++ b/demos/llm_routing/README.md
@ -0,0 +1,19 @@
+# LLM Routing
+This demo shows how you can use the Arch gateway to manage keys and route to the appropriate LLM.
+
+# Starting the demo
+1. Please make sure the [pre-requisites](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites) are installed correctly
+1. Start Arch
+   ```sh
+   sh run_demo.sh
+   ```
+1. Navigate to http://localhost:18080/
+
+# Observability
+Arch gateway publishes a stats endpoint at http://localhost:19901/stats. In this demo we use Prometheus to pull stats from Arch and Grafana to visualize the stats in a dashboard. To see the Grafana dashboard, follow the instructions below:
+
+1. Navigate to http://localhost:3000/ to open grafana UI (use admin/grafana as credentials)
+1. From the Grafana left nav, click on Dashboards and select "Intelligent Gateway Overview" to view Arch gateway stats
+
+# Selecting different LLM
+You can pick a different LLM by setting the `x-arch-llm-provider-hint` header, which overrides the default LLM.
--- a/demos/llm_routing/docker-compose.yaml
+++ b/demos/llm_routing/docker-compose.yaml
@ -2,7 +2,7 @@ services:

  chatbot_ui:
    build:
-      context: ../../chatbot_ui
+      context: ../shared/chatbot_ui
      dockerfile: Dockerfile
    ports:
      - "18080:8080"
@ -12,3 +12,21 @@ services:
      - "host.docker.internal:host-gateway"
    volumes:
      - ./arch_config.yaml:/app/arch_config.yaml
+
+  jaeger:
+    build:
+      context: ../shared/jaeger
+    ports:
+      - "16686:16686"
+      - "4317:4317"
+      - "4318:4318"
+
+  prometheus:
+    build:
+      context: ../shared/prometheus
+
+  grafana:
+    build:
+      context: ../shared/grafana
+    ports:
+      - "3000:3000"
--- a/demos/shared/chatbot_ui/.vscode/launch.json
+++ b/demos/shared/chatbot_ui/.vscode/launch.json
@ -15,19 +15,21 @@
        "LLM": "1",
        "CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1",
        "STREAMING": "True",
-        "ARCH_CONFIG": "../demos/weather_forecast/arch_config.yaml"
+        "ARCH_CONFIG": "../../weather_forecast/arch_config.yaml"
      }
    },
    {
+      "python": "${workspaceFolder}/venv/bin/python",
      "name": "chatbot-ui llm",
-      "cwd": "${workspaceFolder}/app",
      "type": "debugpy",
      "request": "launch",
-      "program": "run.py",
+      "program": "run_stream.py",
      "console": "integratedTerminal",
      "env": {
        "LLM": "1",
-        "CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1"
+        "CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1",
+        "STREAMING": "True",
+        "ARCH_CONFIG": "../../llm_routing/arch_config.yaml"
      }
    },
  ]
--- a/demos/shared/chatbot_ui/common.py
+++ b/demos/shared/chatbot_ui/common.py
@ -1,3 +1,4 @@
+from datetime import datetime
 import json
 import logging
 import os
@ -159,13 +160,44 @@ def get_prompt_targets():
            config = yaml.safe_load(file)

            available_tools = []
-            for target in config["prompt_targets"]:
-                if not target.get("default", False):
-                    available_tools.append(
-                        convert_prompt_target_to_openai_format(target)
-                    )
+            if "prompt_targets" in config:
+                for target in config["prompt_targets"]:
+                    if not target.get("default", False):
+                        available_tools.append(
+                            convert_prompt_target_to_openai_format(target)
+                        )
+
+                return {tool["name"]: tool["info"] for tool in available_tools}
+            elif "llm_providers" in config:
+                return config["llm_providers"]

-            return {tool["name"]: tool["info"] for tool in available_tools}
    except Exception as e:
        log.info(e)
        return None
+
+
+def get_llm_models():
+    try:
+        with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as file:
+            config = yaml.safe_load(file)
+
+            available_models = [""]
+            default_llm = None
+            for llm_providers in config["llm_providers"]:
+                if llm_providers.get("default", False):
+                    default_llm = llm_providers["name"]
+                else:
+                    available_models.append(llm_providers["name"])
+
+            # place default model at the beginning of the list
+            if default_llm:
+                available_models.insert(0, default_llm)
+            return available_models
+    except Exception as e:
+        log.info(e)
+        return []
+
+
+def format_log(message):
+    time_now = datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
+    return f"{time_now} - {message}"
--- a/demos/shared/chatbot_ui/run_stream.py
+++ b/demos/shared/chatbot_ui/run_stream.py
@ -8,7 +8,7 @@ from typing import List, Optional, Tuple
 from openai import OpenAI
 from dotenv import load_dotenv

-from common import get_prompt_targets, process_stream_chunk
+from common import format_log, get_llm_models, get_prompt_targets, process_stream_chunk

 load_dotenv()

@ -36,20 +36,28 @@ CSS_STYLE = """
 footer {visibility: hidden}
 """

-client = OpenAI(
-    api_key="--",
-    base_url=CHAT_COMPLETION_ENDPOINT,
-)
-

 def chat(
    query: Optional[str],
    conversation: Optional[List[Tuple[str, str]]],
    history: List[dict],
+    debug_output: str,
+    model_selector: str,
 ):
    history.append({"role": "user", "content": query})

+    if debug_output is None:
+        debug_output = ""
+
    try:
+        headers = {}
+        if model_selector and model_selector != "":
+            headers["x-arch-llm-provider-hint"] = model_selector
+        client = OpenAI(
+            api_key="--",
+            base_url=CHAT_COMPLETION_ENDPOINT,
+            default_headers=headers,
+        )
        response = client.chat.completions.create(
            # we select model from arch_config file
            model="--",
@ -65,15 +73,20 @@ def chat(

    conversation.append((query, ""))

+    model_is_set = False
    for chunk in response:
        tokens = process_stream_chunk(chunk, history)
+        if tokens and not model_is_set:
+            model_is_set = True
+            model = history[-1]["model"]
+            debug_output = debug_output + "\n" + format_log(f"model: {model}")
        if tokens:
            conversation[-1] = (
                conversation[-1][0],
                conversation[-1][1] + tokens,
            )

-            yield "", conversation, history
+            yield "", conversation, history, debug_output, model_selector


 def main():
@ -94,8 +107,17 @@ def main():
                            value=get_prompt_targets(),
                            show_indices=False,
                            elem_classes="json-container",
-                            min_height="95vh",
+                            min_height="50vh",
                        )
+                    model_selector_textbox = gr.Dropdown(
+                        get_llm_models(),
+                        label="override model",
+                        elem_classes="dropdown",
+                    )
+                    debug_output = gr.TextArea(
+                        label="debug output",
+                        elem_classes="debug_output",
+                    )

            with gr.Column(scale=2):
                chatbot = gr.Chatbot(
@ -110,7 +132,9 @@ def main():
                )

            textbox.submit(
-                chat, [textbox, chatbot, history], [textbox, chatbot, history]
+                chat,
+                [textbox, chatbot, history, debug_output, model_selector_textbox],
+                [textbox, chatbot, history, debug_output, model_selector_textbox],
            )

    demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True)
--- a/demos/shared/grafana/dashboards/envoy_overview.json
+++ b/demos/shared/grafana/dashboards/envoy_overview.json
@ -190,8 +190,8 @@
      "targets": [
        {
          "disableTextWrap": false,
-          "editorMode": "builder",
-          "expr": "histogram_quantile(0.5, sum by(le) (rate(input_sequence_length_bucket[1h])))",
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(input_sequence_length_bucket[5m])))",
          "fullMetaSearch": false,
          "includeNullMetadata": false,
          "legendFormat": "__auto",
@ -200,7 +200,7 @@
          "useBackend": false
        }
      ],
-      "title": "input sequence length (p50)",
+      "title": "input sequence length (p90)",
      "type": "timeseries"
    },
    {
@ -305,7 +305,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "code",
-          "expr": "histogram_quantile(0.5, sum(rate(output_sequence_length_bucket[1h])) by(le))",
+          "expr": "histogram_quantile(0.9, sum(rate(output_sequence_length_bucket[5m])) by(le))",
          "fullMetaSearch": false,
          "includeNullMetadata": false,
          "instant": false,
@ -315,7 +315,7 @@
          "useBackend": false
        }
      ],
-      "title": "output sequence length (p50)",
+      "title": "output sequence length (p90)",
      "type": "timeseries"
    },
    {
@ -415,7 +415,11 @@
        {
          "disableTextWrap": false,
          "editorMode": "code",
+<<<<<<< HEAD
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(time_to_first_token_bucket[5m])))",
+=======
          "expr": "histogram_quantile(0.5, sum by(le) (rate(time_to_first_token_bucket[1h])))",
+>>>>>>> main
          "fullMetaSearch": false,
          "includeNullMetadata": false,
          "legendFormat": "__auto",
@ -424,7 +428,7 @@
          "useBackend": false
        }
      ],
-      "title": "time to first token (p50)",
+      "title": "time to first token (p90)",
      "type": "timeseries"
    },
    {
@ -539,20 +543,29 @@
            "type": "prometheus",
            "uid": "PBFA97CFB590B2093"
          },
+<<<<<<< HEAD
+          "editorMode": "code",
+          "expr": "histogram_quantile(0.9, sum(rate(request_latency_bucket[60m])) by (le))",
+=======
          "disableTextWrap": false,
          "editorMode": "builder",
          "expr": "histogram_quantile(0.5, sum by(le) (rate(request_latency_bucket[1h])))",
          "fullMetaSearch": false,
+>>>>>>> main
          "hide": false,
          "includeNullMetadata": false,
          "instant": false,
          "legendFormat": "__auto",
          "range": true,
+<<<<<<< HEAD
+          "refId": "B"
+=======
          "refId": "A",
          "useBackend": false
+>>>>>>> main
        }
      ],
-      "title": "request latency (p50)",
+      "title": "request latency (p90)",
      "type": "timeseries"
    },
    {
--- a/demos/shared/trace_streamer/Dockerfile
+++ b/demos/shared/trace_streamer/Dockerfile
@ -1,11 +0,0 @@
-FROM python:3.12-slim as arch
-
-WORKDIR /app
-
-RUN pip install requests
-COPY stream_traces.py .
-
-RUN mkdir -p /var/log
-RUN touch /var/log/envoy.log
-
-CMD ["python", "stream_traces.py"]
--- a/demos/shared/trace_streamer/stream_traces.py
+++ b/demos/shared/trace_streamer/stream_traces.py
@ -1,42 +0,0 @@
-import os
-import time
-import requests
-import logging
-
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
-
-
-otel_tracing_endpoint = os.getenv(
-    "OTEL_TRACING_HTTP_ENDPOINT", "http://localhost:4318/v1/traces"
-)
-envoy_log_path = os.getenv("ENVOY_LOG_PATH", "/var/log/envoy.log")
-
-logging.info(f"Using otel-tracing host: {otel_tracing_endpoint}")
-logging.info(f"Using envoy log path: {envoy_log_path}")
-
-
-def process_log_line(line):
-    try:
-        response = requests.post(
-            url=otel_tracing_endpoint,
-            data=line,
-            headers={"Content-Type": "application/json"},
-        )
-        logging.info(f"Sent trace to otel-tracing: {response.status_code}")
-    except Exception as e:
-        logging.error(f"Failed to send trace to otel-tracing: {e}")
-
-
-with open(envoy_log_path, "r") as f:
-    # Seek to the end of the file so we only read new lines
-    f.seek(0, os.SEEK_END)
-    while True:
-        line = f.readline()
-        if not line:
-            time.sleep(1)
-            continue
-        tokens = line.split("prompt_gateway: upstream_llm trace details: ")
-        if len(tokens) > 1:
-            process_log_line(tokens[1])
--- a/demos/weather_forecast/docker-compose.yaml
+++ b/demos/weather_forecast/docker-compose.yaml
@ -30,14 +30,6 @@ services:
      - "4317:4317"
      - "4318:4318"

-  trace_streamer:
-    build:
-      context: ../shared/trace_streamer
-    environment:
-      - OTEL_TRACING_HTTP_ENDPOINT=http://jaeger:4318/v1/traces
-    volumes:
-      - ~/archgw_logs:/var/log/
-
  prometheus:
    build:
      context: ../shared/prometheus
--- a/demos/weather_forecast_signoz/docker-compose.yaml
+++ b/demos/weather_forecast_signoz/docker-compose.yaml
@ -25,14 +25,6 @@ services:
    volumes:
      - ./arch_config.yaml:/app/arch_config.yaml

-  trace_streamer:
-    build:
-      context: ../shared/trace_streamer
-    environment:
-      - OTEL_TRACING_HTTP_ENDPOINT=http://otel-collector:4318/v1/traces
-    volumes:
-      - ~/archgw_logs:/var/log/
-
  prometheus:
    build:
      context: ../shared/prometheus