Update LLM text-completion duration metric (#40)

* Added LLM duration metric, better buckets

* Added heatmap to dashboard to replace 95/97/99 chart

* Bump version
This commit is contained in:
cybermaggedon 2024-08-26 11:46:36 +01:00 committed by GitHub
parent d0e3fcf019
commit 0159e938a2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
26 changed files with 502 additions and 429 deletions

View file

@ -6,6 +6,7 @@ serverless endpoint service. Input is prompt, output is response.
import requests
import json
from prometheus_client import Histogram
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
from .... schema import text_completion_request_queue
@ -46,6 +47,19 @@ class Processor(ConsumerProducer):
}
)
if not hasattr(__class__, "text_completion_metric"):
__class__.text_completion_metric = Histogram(
'text_completion_duration',
'Text completion duration (seconds)',
buckets=[
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
120.0
]
)
self.endpoint = endpoint
self.token = token
self.temperature = temperature
@ -108,14 +122,15 @@ class Processor(ConsumerProducer):
print(f"Handling prompt {id}...", flush=True)
prompt = self.build_prompt(
"You are a helpful chatbot",
v.prompt
)
try:
response = self.call_llm(prompt)
prompt = self.build_prompt(
"You are a helpful chatbot",
v.prompt
)
with __class__.text_completion_metric.time():
response = self.call_llm(prompt)
print("Send response...", flush=True)

View file

@ -6,6 +6,7 @@ Input is prompt, output is response. Mistral is default.
import boto3
import json
from prometheus_client import Histogram
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
from .... schema import text_completion_request_queue
@ -52,6 +53,19 @@ class Processor(ConsumerProducer):
}
)
if not hasattr(__class__, "text_completion_metric"):
__class__.text_completion_metric = Histogram(
'text_completion_duration',
'Text completion duration (seconds)',
buckets=[
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
120.0
]
)
self.model = model
self.temperature = temperature
self.max_output = max_output
@ -78,85 +92,90 @@ class Processor(ConsumerProducer):
prompt = v.prompt
# Mistral Input Format
if self.model.startswith("mistral"):
promptbody = json.dumps({
"prompt": prompt,
"max_tokens": self.max_output,
"temperature": self.temperature,
"top_p": 0.99,
"top_k": 40
})
# Llama 3.1 Input Format
elif self.model.startswith("meta"):
promptbody = json.dumps({
"prompt": prompt,
"max_gen_len": self.max_output,
"temperature": self.temperature,
"top_p": 0.95,
})
# Anthropic Input Format
elif self.model.startswith("anthropic"):
promptbody = json.dumps({
"anthropic_version": "bedrock-2023-05-31",
"max_tokens": self.max_output,
"temperature": self.temperature,
"top_p": 0.999,
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
}
]
}
]
})
# Jamba Input Format
elif self.model.startswith("ai21"):
promptbody = json.dumps({
"max_tokens": self.max_output,
"temperature": self.temperature,
"top_p": 0.9,
"messages": [
{
"role": "user",
"content": prompt
}
]
})
# Cohere Input Format
elif self.model.startswith("cohere"):
promptbody = json.dumps({
"max_tokens": self.max_output,
"temperature": self.temperature,
"message": prompt
})
# Use Mistral format as default
else:
promptbody = json.dumps({
"prompt": prompt,
"max_tokens": self.max_output,
"temperature": self.temperature,
"top_p": 0.99,
"top_k": 40
})
accept = 'application/json'
contentType = 'application/json'
try:
# Mistral Input Format
if self.model.startswith("mistral"):
promptbody = json.dumps({
"prompt": prompt,
"max_tokens": self.max_output,
"temperature": self.temperature,
"top_p": 0.99,
"top_k": 40
})
# Llama 3.1 Input Format
elif self.model.startswith("meta"):
promptbody = json.dumps({
"prompt": prompt,
"max_gen_len": self.max_output,
"temperature": self.temperature,
"top_p": 0.95,
})
# Anthropic Input Format
elif self.model.startswith("anthropic"):
promptbody = json.dumps({
"anthropic_version": "bedrock-2023-05-31",
"max_tokens": self.max_output,
"temperature": self.temperature,
"top_p": 0.999,
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
}
]
}
]
})
# Jamba Input Format
elif self.model.startswith("ai21"):
promptbody = json.dumps({
"max_tokens": self.max_output,
"temperature": self.temperature,
"top_p": 0.9,
"messages": [
{
"role": "user",
"content": prompt
}
]
})
# Cohere Input Format
elif self.model.startswith("cohere"):
promptbody = json.dumps({
"max_tokens": self.max_output,
"temperature": self.temperature,
"message": prompt
})
# Use Mistral format as default
else:
promptbody = json.dumps({
"prompt": prompt,
"max_tokens": self.max_output,
"temperature": self.temperature,
"top_p": 0.99,
"top_k": 40
})
accept = 'application/json'
contentType = 'application/json'
# FIXME: Consider catching request-limit errors and raising TooManyRequests
# See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html
response = self.bedrock.invoke_model(body=promptbody, modelId=self.model, accept=accept, contentType=contentType)
with __class__.text_completion_metric.time():
response = self.bedrock.invoke_model(
body=promptbody, modelId=self.model, accept=accept,
contentType=contentType
)
# Mistral Response Structure
if self.model.startswith("mistral"):

View file

@ -5,6 +5,7 @@ Input is prompt, output is response.
"""
import anthropic
from prometheus_client import Histogram
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
from .... schema import text_completion_request_queue
@ -47,6 +48,19 @@ class Processor(ConsumerProducer):
}
)
if not hasattr(__class__, "text_completion_metric"):
__class__.text_completion_metric = Histogram(
'text_completion_duration',
'Text completion duration (seconds)',
buckets=[
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
120.0
]
)
self.model = model
self.claude = anthropic.Anthropic(api_key=api_key)
self.temperature = temperature
@ -69,23 +83,26 @@ class Processor(ConsumerProducer):
try:
# FIXME: Rate limits?
response = message = self.claude.messages.create(
model=self.model,
max_tokens=self.max_output,
temperature=self.temperature,
system = "You are a helpful chatbot.",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
}
]
}
]
)
with __class__.text_completion_metric.time():
response = message = self.claude.messages.create(
model=self.model,
max_tokens=self.max_output,
temperature=self.temperature,
system = "You are a helpful chatbot.",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
}
]
}
]
)
resp = response.content[0].text
print(resp, flush=True)

View file

@ -5,6 +5,7 @@ Input is prompt, output is response.
"""
import cohere
from prometheus_client import Histogram
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
from .... schema import text_completion_request_queue
@ -44,6 +45,19 @@ class Processor(ConsumerProducer):
}
)
if not hasattr(__class__, "text_completion_metric"):
__class__.text_completion_metric = Histogram(
'text_completion_duration',
'Text completion duration (seconds)',
buckets=[
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
120.0
]
)
self.model = model
self.temperature = temperature
self.cohere = cohere.Client(api_key=api_key)
@ -64,15 +78,17 @@ class Processor(ConsumerProducer):
try:
output = self.cohere.chat(
model=self.model,
message=prompt,
preamble = "You are a helpful AI-assistant.",
temperature=self.temperature,
chat_history=[],
prompt_truncation='auto',
connectors=[]
)
with __class__.text_completion_metric.time():
output = self.cohere.chat(
model=self.model,
message=prompt,
preamble = "You are a helpful AI-assistant.",
temperature=self.temperature,
chat_history=[],
prompt_truncation='auto',
connectors=[]
)
resp = output.text
print(resp, flush=True)

View file

@ -5,7 +5,7 @@ Input is prompt, output is response.
"""
from langchain_community.llms import Ollama
from prometheus_client import Histogram, Info, Counter
from prometheus_client import Histogram, Info
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
from .... schema import text_completion_request_queue
@ -44,6 +44,19 @@ class Processor(ConsumerProducer):
}
)
if not hasattr(__class__, "text_completion_metric"):
__class__.text_completion_metric = Histogram(
'text_completion_duration',
'Text completion duration (seconds)',
buckets=[
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
120.0
]
)
if not hasattr(__class__, "model_metric"):
__class__.model_metric = Info(
'model', 'Model information'
@ -69,7 +82,8 @@ class Processor(ConsumerProducer):
try:
response = self.llm.invoke(prompt)
with __class__.text_completion_metric.time():
response = self.llm.invoke(prompt)
print("Send response...", flush=True)

View file

@ -5,6 +5,7 @@ Input is prompt, output is response.
"""
from openai import OpenAI
from prometheus_client import Histogram
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
from .... schema import text_completion_request_queue
@ -47,6 +48,19 @@ class Processor(ConsumerProducer):
}
)
if not hasattr(__class__, "text_completion_metric"):
__class__.text_completion_metric = Histogram(
'text_completion_duration',
'Text completion duration (seconds)',
buckets=[
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
120.0
]
)
self.model = model
self.temperature = temperature
self.max_output = max_output
@ -69,28 +83,31 @@ class Processor(ConsumerProducer):
try:
# FIXME: Rate limits
resp = self.openai.chat.completions.create(
model=self.model,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
}
]
with __class__.text_completion_metric.time():
resp = self.openai.chat.completions.create(
model=self.model,
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": prompt
}
]
}
],
temperature=self.temperature,
max_tokens=self.max_output,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
response_format={
"type": "text"
}
],
temperature=self.temperature,
max_tokens=self.max_output,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
response_format={
"type": "text"
}
)
)
print(resp.choices[0].message.content, flush=True)

View file

@ -6,6 +6,7 @@ Google Cloud. Input is prompt, output is response.
import vertexai
import time
from prometheus_client import Histogram
from google.oauth2 import service_account
import google
@ -61,6 +62,19 @@ class Processor(ConsumerProducer):
}
)
if not hasattr(__class__, "text_completion_metric"):
__class__.text_completion_metric = Histogram(
'text_completion_duration',
'Text completion duration (seconds)',
buckets=[
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
120.0
]
)
self.parameters = {
"temperature": temperature,
"top_p": 1.0,
@ -125,10 +139,12 @@ class Processor(ConsumerProducer):
prompt = v.prompt
resp = self.llm.generate_content(
prompt, generation_config=self.generation_config,
safety_settings=self.safety_settings
)
with __class__.text_completion_metric.time():
resp = self.llm.generate_content(
prompt, generation_config=self.generation_config,
safety_settings=self.safety_settings
)
resp = resp.text