mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-03 03:45:13 +02:00
Update LLM text-completion duration metric (#40)
* Added LLM duration metric, better buckets * Added heatmap to dashboard to replace 95/97/99 chart * Bump version
This commit is contained in:
parent
d0e3fcf019
commit
0159e938a2
26 changed files with 502 additions and 429 deletions
|
|
@ -6,6 +6,7 @@ serverless endpoint service. Input is prompt, output is response.
|
|||
|
||||
import requests
|
||||
import json
|
||||
from prometheus_client import Histogram
|
||||
|
||||
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
|
||||
from .... schema import text_completion_request_queue
|
||||
|
|
@ -46,6 +47,19 @@ class Processor(ConsumerProducer):
|
|||
}
|
||||
)
|
||||
|
||||
if not hasattr(__class__, "text_completion_metric"):
|
||||
__class__.text_completion_metric = Histogram(
|
||||
'text_completion_duration',
|
||||
'Text completion duration (seconds)',
|
||||
buckets=[
|
||||
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
|
||||
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
|
||||
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
|
||||
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
|
||||
120.0
|
||||
]
|
||||
)
|
||||
|
||||
self.endpoint = endpoint
|
||||
self.token = token
|
||||
self.temperature = temperature
|
||||
|
|
@ -108,14 +122,15 @@ class Processor(ConsumerProducer):
|
|||
|
||||
print(f"Handling prompt {id}...", flush=True)
|
||||
|
||||
prompt = self.build_prompt(
|
||||
"You are a helpful chatbot",
|
||||
v.prompt
|
||||
)
|
||||
|
||||
try:
|
||||
|
||||
response = self.call_llm(prompt)
|
||||
prompt = self.build_prompt(
|
||||
"You are a helpful chatbot",
|
||||
v.prompt
|
||||
)
|
||||
|
||||
with __class__.text_completion_metric.time():
|
||||
response = self.call_llm(prompt)
|
||||
|
||||
print("Send response...", flush=True)
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ Input is prompt, output is response. Mistral is default.
|
|||
|
||||
import boto3
|
||||
import json
|
||||
from prometheus_client import Histogram
|
||||
|
||||
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
|
||||
from .... schema import text_completion_request_queue
|
||||
|
|
@ -52,6 +53,19 @@ class Processor(ConsumerProducer):
|
|||
}
|
||||
)
|
||||
|
||||
if not hasattr(__class__, "text_completion_metric"):
|
||||
__class__.text_completion_metric = Histogram(
|
||||
'text_completion_duration',
|
||||
'Text completion duration (seconds)',
|
||||
buckets=[
|
||||
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
|
||||
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
|
||||
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
|
||||
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
|
||||
120.0
|
||||
]
|
||||
)
|
||||
|
||||
self.model = model
|
||||
self.temperature = temperature
|
||||
self.max_output = max_output
|
||||
|
|
@ -78,85 +92,90 @@ class Processor(ConsumerProducer):
|
|||
|
||||
prompt = v.prompt
|
||||
|
||||
# Mistral Input Format
|
||||
if self.model.startswith("mistral"):
|
||||
promptbody = json.dumps({
|
||||
"prompt": prompt,
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"top_p": 0.99,
|
||||
"top_k": 40
|
||||
})
|
||||
|
||||
# Llama 3.1 Input Format
|
||||
elif self.model.startswith("meta"):
|
||||
promptbody = json.dumps({
|
||||
"prompt": prompt,
|
||||
"max_gen_len": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"top_p": 0.95,
|
||||
})
|
||||
|
||||
# Anthropic Input Format
|
||||
elif self.model.startswith("anthropic"):
|
||||
promptbody = json.dumps({
|
||||
"anthropic_version": "bedrock-2023-05-31",
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"top_p": 0.999,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
})
|
||||
|
||||
# Jamba Input Format
|
||||
elif self.model.startswith("ai21"):
|
||||
promptbody = json.dumps({
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"top_p": 0.9,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}
|
||||
]
|
||||
})
|
||||
|
||||
# Cohere Input Format
|
||||
elif self.model.startswith("cohere"):
|
||||
promptbody = json.dumps({
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"message": prompt
|
||||
})
|
||||
|
||||
# Use Mistral format as default
|
||||
else:
|
||||
promptbody = json.dumps({
|
||||
"prompt": prompt,
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"top_p": 0.99,
|
||||
"top_k": 40
|
||||
})
|
||||
|
||||
accept = 'application/json'
|
||||
contentType = 'application/json'
|
||||
|
||||
try:
|
||||
|
||||
# Mistral Input Format
|
||||
if self.model.startswith("mistral"):
|
||||
promptbody = json.dumps({
|
||||
"prompt": prompt,
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"top_p": 0.99,
|
||||
"top_k": 40
|
||||
})
|
||||
|
||||
# Llama 3.1 Input Format
|
||||
elif self.model.startswith("meta"):
|
||||
promptbody = json.dumps({
|
||||
"prompt": prompt,
|
||||
"max_gen_len": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"top_p": 0.95,
|
||||
})
|
||||
|
||||
# Anthropic Input Format
|
||||
elif self.model.startswith("anthropic"):
|
||||
promptbody = json.dumps({
|
||||
"anthropic_version": "bedrock-2023-05-31",
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"top_p": 0.999,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
})
|
||||
|
||||
# Jamba Input Format
|
||||
elif self.model.startswith("ai21"):
|
||||
promptbody = json.dumps({
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"top_p": 0.9,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}
|
||||
]
|
||||
})
|
||||
|
||||
# Cohere Input Format
|
||||
elif self.model.startswith("cohere"):
|
||||
promptbody = json.dumps({
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"message": prompt
|
||||
})
|
||||
|
||||
# Use Mistral format as default
|
||||
else:
|
||||
promptbody = json.dumps({
|
||||
"prompt": prompt,
|
||||
"max_tokens": self.max_output,
|
||||
"temperature": self.temperature,
|
||||
"top_p": 0.99,
|
||||
"top_k": 40
|
||||
})
|
||||
|
||||
accept = 'application/json'
|
||||
contentType = 'application/json'
|
||||
|
||||
# FIXME: Consider catching request limits and raising TooManyRequests
|
||||
# See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html
|
||||
response = self.bedrock.invoke_model(body=promptbody, modelId=self.model, accept=accept, contentType=contentType)
|
||||
|
||||
with __class__.text_completion_metric.time():
|
||||
response = self.bedrock.invoke_model(
|
||||
body=promptbody, modelId=self.model, accept=accept,
|
||||
contentType=contentType
|
||||
)
|
||||
|
||||
# Mistral Response Structure
|
||||
if self.model.startswith("mistral"):
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ Input is prompt, output is response.
|
|||
"""
|
||||
|
||||
import anthropic
|
||||
from prometheus_client import Histogram
|
||||
|
||||
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
|
||||
from .... schema import text_completion_request_queue
|
||||
|
|
@ -47,6 +48,19 @@ class Processor(ConsumerProducer):
|
|||
}
|
||||
)
|
||||
|
||||
if not hasattr(__class__, "text_completion_metric"):
|
||||
__class__.text_completion_metric = Histogram(
|
||||
'text_completion_duration',
|
||||
'Text completion duration (seconds)',
|
||||
buckets=[
|
||||
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
|
||||
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
|
||||
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
|
||||
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
|
||||
120.0
|
||||
]
|
||||
)
|
||||
|
||||
self.model = model
|
||||
self.claude = anthropic.Anthropic(api_key=api_key)
|
||||
self.temperature = temperature
|
||||
|
|
@ -69,23 +83,26 @@ class Processor(ConsumerProducer):
|
|||
try:
|
||||
|
||||
# FIXME: Rate limits?
|
||||
response = message = self.claude.messages.create(
|
||||
model=self.model,
|
||||
max_tokens=self.max_output,
|
||||
temperature=self.temperature,
|
||||
system = "You are a helpful chatbot.",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
with __class__.text_completion_metric.time():
|
||||
|
||||
response = message = self.claude.messages.create(
|
||||
model=self.model,
|
||||
max_tokens=self.max_output,
|
||||
temperature=self.temperature,
|
||||
system = "You are a helpful chatbot.",
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
resp = response.content[0].text
|
||||
print(resp, flush=True)
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ Input is prompt, output is response.
|
|||
"""
|
||||
|
||||
import cohere
|
||||
from prometheus_client import Histogram
|
||||
|
||||
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
|
||||
from .... schema import text_completion_request_queue
|
||||
|
|
@ -44,6 +45,19 @@ class Processor(ConsumerProducer):
|
|||
}
|
||||
)
|
||||
|
||||
if not hasattr(__class__, "text_completion_metric"):
|
||||
__class__.text_completion_metric = Histogram(
|
||||
'text_completion_duration',
|
||||
'Text completion duration (seconds)',
|
||||
buckets=[
|
||||
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
|
||||
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
|
||||
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
|
||||
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
|
||||
120.0
|
||||
]
|
||||
)
|
||||
|
||||
self.model = model
|
||||
self.temperature = temperature
|
||||
self.cohere = cohere.Client(api_key=api_key)
|
||||
|
|
@ -64,15 +78,17 @@ class Processor(ConsumerProducer):
|
|||
|
||||
try:
|
||||
|
||||
output = self.cohere.chat(
|
||||
model=self.model,
|
||||
message=prompt,
|
||||
preamble = "You are a helpful AI-assistant.",
|
||||
temperature=self.temperature,
|
||||
chat_history=[],
|
||||
prompt_truncation='auto',
|
||||
connectors=[]
|
||||
)
|
||||
with __class__.text_completion_metric.time():
|
||||
|
||||
output = self.cohere.chat(
|
||||
model=self.model,
|
||||
message=prompt,
|
||||
preamble = "You are a helpful AI-assistant.",
|
||||
temperature=self.temperature,
|
||||
chat_history=[],
|
||||
prompt_truncation='auto',
|
||||
connectors=[]
|
||||
)
|
||||
|
||||
resp = output.text
|
||||
print(resp, flush=True)
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ Input is prompt, output is response.
|
|||
"""
|
||||
|
||||
from langchain_community.llms import Ollama
|
||||
from prometheus_client import Histogram, Info, Counter
|
||||
from prometheus_client import Histogram, Info
|
||||
|
||||
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
|
||||
from .... schema import text_completion_request_queue
|
||||
|
|
@ -44,6 +44,19 @@ class Processor(ConsumerProducer):
|
|||
}
|
||||
)
|
||||
|
||||
if not hasattr(__class__, "text_completion_metric"):
|
||||
__class__.text_completion_metric = Histogram(
|
||||
'text_completion_duration',
|
||||
'Text completion duration (seconds)',
|
||||
buckets=[
|
||||
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
|
||||
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
|
||||
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
|
||||
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
|
||||
120.0
|
||||
]
|
||||
)
|
||||
|
||||
if not hasattr(__class__, "model_metric"):
|
||||
__class__.model_metric = Info(
|
||||
'model', 'Model information'
|
||||
|
|
@ -69,7 +82,8 @@ class Processor(ConsumerProducer):
|
|||
|
||||
try:
|
||||
|
||||
response = self.llm.invoke(prompt)
|
||||
with __class__.text_completion_metric.time():
|
||||
response = self.llm.invoke(prompt)
|
||||
|
||||
print("Send response...", flush=True)
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ Input is prompt, output is response.
|
|||
"""
|
||||
|
||||
from openai import OpenAI
|
||||
from prometheus_client import Histogram
|
||||
|
||||
from .... schema import TextCompletionRequest, TextCompletionResponse, Error
|
||||
from .... schema import text_completion_request_queue
|
||||
|
|
@ -47,6 +48,19 @@ class Processor(ConsumerProducer):
|
|||
}
|
||||
)
|
||||
|
||||
if not hasattr(__class__, "text_completion_metric"):
|
||||
__class__.text_completion_metric = Histogram(
|
||||
'text_completion_duration',
|
||||
'Text completion duration (seconds)',
|
||||
buckets=[
|
||||
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
|
||||
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
|
||||
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
|
||||
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
|
||||
120.0
|
||||
]
|
||||
)
|
||||
|
||||
self.model = model
|
||||
self.temperature = temperature
|
||||
self.max_output = max_output
|
||||
|
|
@ -69,28 +83,31 @@ class Processor(ConsumerProducer):
|
|||
try:
|
||||
|
||||
# FIXME: Rate limits
|
||||
resp = self.openai.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
|
||||
with __class__.text_completion_metric.time():
|
||||
|
||||
resp = self.openai.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": prompt
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
temperature=self.temperature,
|
||||
max_tokens=self.max_output,
|
||||
top_p=1,
|
||||
frequency_penalty=0,
|
||||
presence_penalty=0,
|
||||
response_format={
|
||||
"type": "text"
|
||||
}
|
||||
],
|
||||
temperature=self.temperature,
|
||||
max_tokens=self.max_output,
|
||||
top_p=1,
|
||||
frequency_penalty=0,
|
||||
presence_penalty=0,
|
||||
response_format={
|
||||
"type": "text"
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
print(resp.choices[0].message.content, flush=True)
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ Google Cloud. Input is prompt, output is response.
|
|||
|
||||
import vertexai
|
||||
import time
|
||||
from prometheus_client import Histogram
|
||||
|
||||
from google.oauth2 import service_account
|
||||
import google
|
||||
|
|
@ -61,6 +62,19 @@ class Processor(ConsumerProducer):
|
|||
}
|
||||
)
|
||||
|
||||
if not hasattr(__class__, "text_completion_metric"):
|
||||
__class__.text_completion_metric = Histogram(
|
||||
'text_completion_duration',
|
||||
'Text completion duration (seconds)',
|
||||
buckets=[
|
||||
0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
|
||||
8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
|
||||
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
|
||||
30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
|
||||
120.0
|
||||
]
|
||||
)
|
||||
|
||||
self.parameters = {
|
||||
"temperature": temperature,
|
||||
"top_p": 1.0,
|
||||
|
|
@ -125,10 +139,12 @@ class Processor(ConsumerProducer):
|
|||
|
||||
prompt = v.prompt
|
||||
|
||||
resp = self.llm.generate_content(
|
||||
prompt, generation_config=self.generation_config,
|
||||
safety_settings=self.safety_settings
|
||||
)
|
||||
with __class__.text_completion_metric.time():
|
||||
|
||||
resp = self.llm.generate_content(
|
||||
prompt, generation_config=self.generation_config,
|
||||
safety_settings=self.safety_settings
|
||||
)
|
||||
|
||||
resp = resp.text
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue