move custom tracer to llm filter (#267)

This commit is contained in:
Adil Hafeez 2024-11-15 10:44:01 -08:00 committed by GitHub
parent 1d229cba8f
commit d3c17c7abd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 335 additions and 133 deletions

View file

@ -0,0 +1,19 @@
# LLM Routing
This demo shows how you can use the Arch gateway to manage keys and route requests to the appropriate LLM.
# Starting the demo
1. Please make sure the [pre-requisites](https://github.com/katanemo/arch/?tab=readme-ov-file#prerequisites) are installed correctly
1. Start Arch
```sh
sh run_demo.sh
```
1. Navigate to http://localhost:18080/
# Observability
Arch gateway publishes a stats endpoint at http://localhost:19901/stats. In this demo we use Prometheus to pull stats from Arch and Grafana to visualize them in a dashboard. To see the Grafana dashboard, follow the instructions below:
1. Navigate to http://localhost:3000/ to open grafana UI (use admin/grafana as credentials)
1. From grafana left nav click on dashboards and select "Intelligent Gateway Overview" to view arch gateway stats
# Selecting different LLM
You can pick a different LLM by setting the `x-arch-llm-provider-hint` header, which overrides the default LLM.

View file

@ -2,7 +2,7 @@ services:
chatbot_ui:
build:
context: ../../chatbot_ui
context: ../shared/chatbot_ui
dockerfile: Dockerfile
ports:
- "18080:8080"
@ -12,3 +12,21 @@ services:
- "host.docker.internal:host-gateway"
volumes:
- ./arch_config.yaml:/app/arch_config.yaml
jaeger:
build:
context: ../shared/jaeger
ports:
- "16686:16686"
- "4317:4317"
- "4318:4318"
prometheus:
build:
context: ../shared/prometheus
grafana:
build:
context: ../shared/grafana
ports:
- "3000:3000"

View file

@ -15,19 +15,21 @@
"LLM": "1",
"CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1",
"STREAMING": "True",
"ARCH_CONFIG": "../demos/weather_forecast/arch_config.yaml"
"ARCH_CONFIG": "../../weather_forecast/arch_config.yaml"
}
},
{
"python": "${workspaceFolder}/venv/bin/python",
"name": "chatbot-ui llm",
"cwd": "${workspaceFolder}/app",
"type": "debugpy",
"request": "launch",
"program": "run.py",
"program": "run_stream.py",
"console": "integratedTerminal",
"env": {
"LLM": "1",
"CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1"
"CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1",
"STREAMING": "True",
"ARCH_CONFIG": "../../llm_routing/arch_config.yaml"
}
},
]

View file

@ -1,3 +1,4 @@
from datetime import datetime
import json
import logging
import os
@ -159,13 +160,44 @@ def get_prompt_targets():
config = yaml.safe_load(file)
available_tools = []
for target in config["prompt_targets"]:
if not target.get("default", False):
available_tools.append(
convert_prompt_target_to_openai_format(target)
)
if "prompt_targets" in config:
for target in config["prompt_targets"]:
if not target.get("default", False):
available_tools.append(
convert_prompt_target_to_openai_format(target)
)
return {tool["name"]: tool["info"] for tool in available_tools}
elif "llm_providers" in config:
return config["llm_providers"]
return {tool["name"]: tool["info"] for tool in available_tools}
except Exception as e:
log.info(e)
return None
def get_llm_models():
    """Return the LLM provider names to offer in the model-override dropdown.

    Reads the YAML file named by the ``ARCH_CONFIG`` environment variable
    (falling back to ``arch_config.yaml``) and collects the ``name`` of each
    entry under ``llm_providers``. The list always contains one empty-string
    entry (presumably the "no override" choice -- verify against the UI).
    The provider flagged ``default`` is placed first; if several are flagged,
    only the last one is kept.

    Returns:
        A list of model names, or an empty list when the config cannot be
        read or parsed (the exception is logged at info level).
    """
    try:
        with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as config_file:
            arch_config = yaml.safe_load(config_file)

        models = [""]
        default_model = None
        for provider in arch_config["llm_providers"]:
            if not provider.get("default", False):
                models.append(provider["name"])
                continue
            default_model = provider["name"]

        # The default model goes to the front of the list.
        if default_model:
            models.insert(0, default_model)
        return models
    except Exception as e:
        log.info(e)
        return []
def format_log(message):
    """Prefix *message* with the current time at millisecond precision.

    The result has the shape ``YYYY-MM-DD HH:MM:SS,mmm - message``, where the
    timestamp comes from ``datetime.now()``.
    """
    now = datetime.now()
    # Truncating (not rounding) microseconds to three digits yields milliseconds.
    millis = now.microsecond // 1000
    return f"{now:%Y-%m-%d %H:%M:%S},{millis:03d} - {message}"

View file

@ -8,7 +8,7 @@ from typing import List, Optional, Tuple
from openai import OpenAI
from dotenv import load_dotenv
from common import get_prompt_targets, process_stream_chunk
from common import format_log, get_llm_models, get_prompt_targets, process_stream_chunk
load_dotenv()
@ -36,20 +36,28 @@ CSS_STYLE = """
footer {visibility: hidden}
"""
client = OpenAI(
api_key="--",
base_url=CHAT_COMPLETION_ENDPOINT,
)
def chat(
query: Optional[str],
conversation: Optional[List[Tuple[str, str]]],
history: List[dict],
debug_output: str,
model_selector: str,
):
history.append({"role": "user", "content": query})
if debug_output is None:
debug_output = ""
try:
headers = {}
if model_selector and model_selector != "":
headers["x-arch-llm-provider-hint"] = model_selector
client = OpenAI(
api_key="--",
base_url=CHAT_COMPLETION_ENDPOINT,
default_headers=headers,
)
response = client.chat.completions.create(
# we select model from arch_config file
model="--",
@ -65,15 +73,20 @@ def chat(
conversation.append((query, ""))
model_is_set = False
for chunk in response:
tokens = process_stream_chunk(chunk, history)
if tokens and not model_is_set:
model_is_set = True
model = history[-1]["model"]
debug_output = debug_output + "\n" + format_log(f"model: {model}")
if tokens:
conversation[-1] = (
conversation[-1][0],
conversation[-1][1] + tokens,
)
yield "", conversation, history
yield "", conversation, history, debug_output, model_selector
def main():
@ -94,8 +107,17 @@ def main():
value=get_prompt_targets(),
show_indices=False,
elem_classes="json-container",
min_height="95vh",
min_height="50vh",
)
model_selector_textbox = gr.Dropdown(
get_llm_models(),
label="override model",
elem_classes="dropdown",
)
debug_output = gr.TextArea(
label="debug output",
elem_classes="debug_output",
)
with gr.Column(scale=2):
chatbot = gr.Chatbot(
@ -110,7 +132,9 @@ def main():
)
textbox.submit(
chat, [textbox, chatbot, history], [textbox, chatbot, history]
chat,
[textbox, chatbot, history, debug_output, model_selector_textbox],
[textbox, chatbot, history, debug_output, model_selector_textbox],
)
demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True)

View file

@ -190,8 +190,8 @@
"targets": [
{
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(input_sequence_length_bucket[1h])))",
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by(le) (rate(input_sequence_length_bucket[5m])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"legendFormat": "__auto",
@ -200,7 +200,7 @@
"useBackend": false
}
],
"title": "input sequence length (p50)",
"title": "input sequence length (p90)",
"type": "timeseries"
},
{
@ -305,7 +305,7 @@
},
"disableTextWrap": false,
"editorMode": "code",
"expr": "histogram_quantile(0.5, sum(rate(output_sequence_length_bucket[1h])) by(le))",
"expr": "histogram_quantile(0.9, sum(rate(output_sequence_length_bucket[5m])) by(le))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"instant": false,
@ -315,7 +315,7 @@
"useBackend": false
}
],
"title": "output sequence length (p50)",
"title": "output sequence length (p90)",
"type": "timeseries"
},
{
@ -415,7 +415,11 @@
{
"disableTextWrap": false,
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum by(le) (rate(time_to_first_token_bucket[5m])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"legendFormat": "__auto",
@ -424,7 +428,7 @@
"useBackend": false
}
],
"title": "time to first token (p50)",
"title": "time to first token (p90)",
"type": "timeseries"
},
{
@ -539,20 +543,29 @@
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum(rate(request_latency_bucket[60m])) by (le))",
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "B"
}
],
"title": "request latency (p50)",
"title": "request latency (p90)",
"type": "timeseries"
},
{

View file

@ -1,11 +0,0 @@
FROM python:3.12-slim as arch
WORKDIR /app
RUN pip install requests
COPY stream_traces.py .
RUN mkdir -p /var/log
RUN touch /var/log/envoy.log
CMD ["python", "stream_traces.py"]

View file

@ -1,42 +0,0 @@
import os
import time
import requests
import logging
# Timestamped, level-tagged output for everything this script logs.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Both settings can be overridden through the environment; the defaults are
# a collector on localhost and the envoy log under /var/log.
otel_tracing_endpoint = os.environ.get(
    "OTEL_TRACING_HTTP_ENDPOINT", "http://localhost:4318/v1/traces"
)
envoy_log_path = os.environ.get("ENVOY_LOG_PATH", "/var/log/envoy.log")

logging.info(f"Using otel-tracing host: {otel_tracing_endpoint}")
logging.info(f"Using envoy log path: {envoy_log_path}")
def process_log_line(line):
    """Post one extracted trace payload to the OTel tracing HTTP endpoint.

    Args:
        line: Trace payload text, sent verbatim as the request body with a
            ``application/json`` content type.

    Any failure (connection error, timeout, ...) is logged and swallowed so
    that a single bad delivery does not stop the caller.
    """
    try:
        response = requests.post(
            url=otel_tracing_endpoint,
            data=line,
            headers={"Content-Type": "application/json"},
            # requests waits indefinitely by default; bound the wait so an
            # unresponsive collector cannot stall log processing forever.
            timeout=10,
        )
        logging.info(f"Sent trace to otel-tracing: {response.status_code}")
    except Exception as e:
        logging.error(f"Failed to send trace to otel-tracing: {e}")
with open(envoy_log_path, "r") as log_file:
    # Start at the current end of the file so only newly appended lines are read.
    log_file.seek(0, os.SEEK_END)
    while True:
        line = log_file.readline()
        if line:
            # Lines carrying a trace have the payload right after this marker text.
            parts = line.split("prompt_gateway: upstream_llm trace details: ")
            if len(parts) > 1:
                process_log_line(parts[1])
        else:
            # Nothing new yet; poll again in a second.
            time.sleep(1)

View file

@ -30,14 +30,6 @@ services:
- "4317:4317"
- "4318:4318"
trace_streamer:
build:
context: ../shared/trace_streamer
environment:
- OTEL_TRACING_HTTP_ENDPOINT=http://jaeger:4318/v1/traces
volumes:
- ~/archgw_logs:/var/log/
prometheus:
build:
context: ../shared/prometheus

View file

@ -25,14 +25,6 @@ services:
volumes:
- ./arch_config.yaml:/app/arch_config.yaml
trace_streamer:
build:
context: ../shared/trace_streamer
environment:
- OTEL_TRACING_HTTP_ENDPOINT=http://otel-collector:4318/v1/traces
volumes:
- ~/archgw_logs:/var/log/
prometheus:
build:
context: ../shared/prometheus