mirror of
https://github.com/katanemo/plano.git
synced 2026-05-27 14:17:15 +02:00
move custom tracer to llm filter (#267)
This commit is contained in:
parent
1d229cba8f
commit
d3c17c7abd
22 changed files with 335 additions and 133 deletions
10
demos/shared/chatbot_ui/.vscode/launch.json
vendored
10
demos/shared/chatbot_ui/.vscode/launch.json
vendored
|
|
@ -15,19 +15,21 @@
|
|||
"LLM": "1",
|
||||
"CHAT_COMPLETION_ENDPOINT": "http://localhost:10000/v1",
|
||||
"STREAMING": "True",
|
||||
"ARCH_CONFIG": "../demos/weather_forecast/arch_config.yaml"
|
||||
"ARCH_CONFIG": "../../weather_forecast/arch_config.yaml"
|
||||
}
|
||||
},
|
||||
{
|
||||
"python": "${workspaceFolder}/venv/bin/python",
|
||||
"name": "chatbot-ui llm",
|
||||
"cwd": "${workspaceFolder}/app",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "run.py",
|
||||
"program": "run_stream.py",
|
||||
"console": "integratedTerminal",
|
||||
"env": {
|
||||
"LLM": "1",
|
||||
"CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1"
|
||||
"CHAT_COMPLETION_ENDPOINT": "http://localhost:12000/v1",
|
||||
"STREAMING": "True",
|
||||
"ARCH_CONFIG": "../../llm_routing/arch_config.yaml"
|
||||
}
|
||||
},
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
from datetime import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
|
@ -159,13 +160,44 @@ def get_prompt_targets():
|
|||
config = yaml.safe_load(file)
|
||||
|
||||
available_tools = []
|
||||
for target in config["prompt_targets"]:
|
||||
if not target.get("default", False):
|
||||
available_tools.append(
|
||||
convert_prompt_target_to_openai_format(target)
|
||||
)
|
||||
if "prompt_targets" in config:
|
||||
for target in config["prompt_targets"]:
|
||||
if not target.get("default", False):
|
||||
available_tools.append(
|
||||
convert_prompt_target_to_openai_format(target)
|
||||
)
|
||||
|
||||
return {tool["name"]: tool["info"] for tool in available_tools}
|
||||
elif "llm_providers" in config:
|
||||
return config["llm_providers"]
|
||||
|
||||
return {tool["name"]: tool["info"] for tool in available_tools}
|
||||
except Exception as e:
|
||||
log.info(e)
|
||||
return None
|
||||
|
||||
|
||||
def get_llm_models():
    """Return the LLM provider names listed in the arch config file.

    The config path comes from the ARCH_CONFIG environment variable and
    falls back to "arch_config.yaml". The provider flagged as default (the
    last one, if several are flagged) is placed first, followed by an empty
    string entry and then the remaining provider names in file order.

    Any failure while reading or interpreting the config is logged and an
    empty list is returned instead.
    """
    try:
        with open(os.getenv("ARCH_CONFIG", "arch_config.yaml"), "r") as config_file:
            providers = yaml.safe_load(config_file)["llm_providers"]

        default_name = None
        # the leading "" entry is the blank choice shown when no provider is picked
        model_names = [""]
        for provider in providers:
            if not provider.get("default", False):
                model_names.append(provider["name"])
                continue
            default_name = provider["name"]

        # the default provider, when present, goes to the front of the list
        return [default_name, *model_names] if default_name else model_names
    except Exception as err:
        log.info(err)
        return []
|
||||
|
||||
|
||||
def format_log(message):
    """Return *message* prefixed with the current local time (millisecond precision)."""
    # %f renders six microsecond digits; trimming the last three leaves milliseconds
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")
    return f"{stamp[:-3]} - {message}"
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ from typing import List, Optional, Tuple
|
|||
from openai import OpenAI
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from common import get_prompt_targets, process_stream_chunk
|
||||
from common import format_log, get_llm_models, get_prompt_targets, process_stream_chunk
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
|
@ -36,20 +36,28 @@ CSS_STYLE = """
|
|||
footer {visibility: hidden}
|
||||
"""
|
||||
|
||||
client = OpenAI(
|
||||
api_key="--",
|
||||
base_url=CHAT_COMPLETION_ENDPOINT,
|
||||
)
|
||||
|
||||
|
||||
def chat(
|
||||
query: Optional[str],
|
||||
conversation: Optional[List[Tuple[str, str]]],
|
||||
history: List[dict],
|
||||
debug_output: str,
|
||||
model_selector: str,
|
||||
):
|
||||
history.append({"role": "user", "content": query})
|
||||
|
||||
if debug_output is None:
|
||||
debug_output = ""
|
||||
|
||||
try:
|
||||
headers = {}
|
||||
if model_selector and model_selector != "":
|
||||
headers["x-arch-llm-provider-hint"] = model_selector
|
||||
client = OpenAI(
|
||||
api_key="--",
|
||||
base_url=CHAT_COMPLETION_ENDPOINT,
|
||||
default_headers=headers,
|
||||
)
|
||||
response = client.chat.completions.create(
|
||||
# we select model from arch_config file
|
||||
model="--",
|
||||
|
|
@ -65,15 +73,20 @@ def chat(
|
|||
|
||||
conversation.append((query, ""))
|
||||
|
||||
model_is_set = False
|
||||
for chunk in response:
|
||||
tokens = process_stream_chunk(chunk, history)
|
||||
if tokens and not model_is_set:
|
||||
model_is_set = True
|
||||
model = history[-1]["model"]
|
||||
debug_output = debug_output + "\n" + format_log(f"model: {model}")
|
||||
if tokens:
|
||||
conversation[-1] = (
|
||||
conversation[-1][0],
|
||||
conversation[-1][1] + tokens,
|
||||
)
|
||||
|
||||
yield "", conversation, history
|
||||
yield "", conversation, history, debug_output, model_selector
|
||||
|
||||
|
||||
def main():
|
||||
|
|
@ -94,8 +107,17 @@ def main():
|
|||
value=get_prompt_targets(),
|
||||
show_indices=False,
|
||||
elem_classes="json-container",
|
||||
min_height="95vh",
|
||||
min_height="50vh",
|
||||
)
|
||||
model_selector_textbox = gr.Dropdown(
|
||||
get_llm_models(),
|
||||
label="override model",
|
||||
elem_classes="dropdown",
|
||||
)
|
||||
debug_output = gr.TextArea(
|
||||
label="debug output",
|
||||
elem_classes="debug_output",
|
||||
)
|
||||
|
||||
with gr.Column(scale=2):
|
||||
chatbot = gr.Chatbot(
|
||||
|
|
@ -110,7 +132,9 @@ def main():
|
|||
)
|
||||
|
||||
textbox.submit(
|
||||
chat, [textbox, chatbot, history], [textbox, chatbot, history]
|
||||
chat,
|
||||
[textbox, chatbot, history, debug_output, model_selector_textbox],
|
||||
[textbox, chatbot, history, debug_output, model_selector_textbox],
|
||||
)
|
||||
|
||||
demo.launch(server_name="0.0.0.0", server_port=8080, show_error=True, debug=True)
|
||||
|
|
|
|||
|
|
@ -190,8 +190,8 @@
|
|||
"targets": [
|
||||
{
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(input_sequence_length_bucket[1h])))",
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(input_sequence_length_bucket[5m])))",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": false,
|
||||
"legendFormat": "__auto",
|
||||
|
|
@ -200,7 +200,7 @@
|
|||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "input sequence length (p50)",
|
||||
"title": "input sequence length (p90)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
|
|
@ -305,7 +305,7 @@
|
|||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.5, sum(rate(output_sequence_length_bucket[1h])) by(le))",
|
||||
"expr": "histogram_quantile(0.9, sum(rate(output_sequence_length_bucket[5m])) by(le))",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": false,
|
||||
"instant": false,
|
||||
|
|
@ -315,7 +315,7 @@
|
|||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "output sequence length (p50)",
|
||||
"title": "output sequence length (p90)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
|
|
@ -415,7 +415,11 @@
|
|||
{
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(time_to_first_token_bucket[5m])))",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": false,
|
||||
"legendFormat": "__auto",
|
||||
|
|
@ -424,7 +428,7 @@
|
|||
"useBackend": false
|
||||
}
|
||||
],
|
||||
"title": "time to first token (p50)",
|
||||
"title": "time to first token (p90)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
|
|
@ -539,20 +543,29 @@
|
|||
"type": "prometheus",
|
||||
"uid": "PBFA97CFB590B2093"
|
||||
},
|
||||
"editorMode": "code",
"expr": "histogram_quantile(0.9, sum(rate(request_latency_bucket[60m])) by (le))",
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "B"
|
||||
}
|
||||
],
|
||||
"title": "request latency (p50)",
|
||||
"title": "request latency (p90)",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,11 +0,0 @@
|
|||
FROM python:3.12-slim as arch
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN pip install requests
|
||||
COPY stream_traces.py .
|
||||
|
||||
RUN mkdir -p /var/log
|
||||
RUN touch /var/log/envoy.log
|
||||
|
||||
CMD ["python", "stream_traces.py"]
|
||||
|
|
@ -1,42 +0,0 @@
|
|||
import os
|
||||
import time
|
||||
import requests
|
||||
import logging
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
|
||||
|
||||
otel_tracing_endpoint = os.getenv(
|
||||
"OTEL_TRACING_HTTP_ENDPOINT", "http://localhost:4318/v1/traces"
|
||||
)
|
||||
envoy_log_path = os.getenv("ENVOY_LOG_PATH", "/var/log/envoy.log")
|
||||
|
||||
logging.info(f"Using otel-tracing host: {otel_tracing_endpoint}")
|
||||
logging.info(f"Using envoy log path: {envoy_log_path}")
|
||||
|
||||
|
||||
def process_log_line(line):
    """POST one trace payload to the OTEL HTTP tracing endpoint.

    Args:
        line: The trace payload, sent as the request body with a JSON
            content type. A str is encoded to UTF-8 first; bytes are sent
            unchanged.

    Delivery is best effort: the response status is logged on completion
    and any failure is logged rather than raised, so the caller keeps
    running.
    """
    # Encode explicitly so the body is UTF-8 no matter how the HTTP stack would
    # encode a str on its own (http.client defaults to ISO-8859-1).
    payload = line.encode("utf-8") if isinstance(line, str) else line
    try:
        response = requests.post(
            url=otel_tracing_endpoint,
            data=payload,
            headers={"Content-Type": "application/json"},
            # requests waits indefinitely by default; bound the wait so a
            # stalled collector cannot block the caller forever
            timeout=10,
        )
        logging.info(f"Sent trace to otel-tracing: {response.status_code}")
    except Exception as e:
        logging.error(f"Failed to send trace to otel-tracing: {e}")
|
||||
|
||||
|
||||
with open(envoy_log_path, "r") as log_file:
    # Start at the current end of the file so only newly appended lines are handled
    log_file.seek(0, os.SEEK_END)
    while True:
        entry = log_file.readline()
        if entry:
            # everything after the marker text is forwarded as the trace payload
            parts = entry.split("prompt_gateway: upstream_llm trace details: ")
            if len(parts) > 1:
                process_log_line(parts[1])
        else:
            # nothing new yet; wait a second before polling again
            time.sleep(1)
|
||||
Loading…
Add table
Add a link
Reference in a new issue