Update LLM text-completion duration metric (#40)

* Added LLM duration metric, better buckets
* Added heatmap to dashboard to replace 95/97/99 chart
* Bumped version
2026-07-20 18:51:03 +02:00 · 2024-08-26 11:46:36 +01:00 · 2024-08-26 11:46:36 +01:00 · 0159e938a2
commit 0159e938a2
parent d0e3fcf019
26 changed files with 502 additions and 429 deletions
--- a/trustgraph/model/text_completion/azure/llm.py
+++ b/trustgraph/model/text_completion/azure/llm.py
@ -6,6 +6,7 @@ serverless endpoint service.  Input is prompt, output is response.

 import requests
 import json
+from prometheus_client import Histogram

 from .... schema import TextCompletionRequest, TextCompletionResponse, Error
 from .... schema import text_completion_request_queue
@ -46,6 +47,19 @@ class Processor(ConsumerProducer):
            }
        )

+        if not hasattr(__class__, "text_completion_metric"):
+            __class__.text_completion_metric = Histogram(
+                'text_completion_duration',
+                'Text completion duration (seconds)',
+                buckets=[
+                    0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
+                    8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
+                    30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
+                    120.0
+                ]
+            )
+
        self.endpoint = endpoint
        self.token = token
        self.temperature = temperature
@ -108,14 +122,15 @@ class Processor(ConsumerProducer):

        print(f"Handling prompt {id}...", flush=True)

-        prompt = self.build_prompt(
-            "You are a helpful chatbot",
-            v.prompt
-        )
-
        try:

-            response = self.call_llm(prompt)
+            prompt = self.build_prompt(
+                "You are a helpful chatbot",
+                v.prompt
+            )
+
+            with __class__.text_completion_metric.time():
+                response = self.call_llm(prompt)

            print("Send response...", flush=True)

--- a/trustgraph/model/text_completion/bedrock/llm.py
+++ b/trustgraph/model/text_completion/bedrock/llm.py
@ -6,6 +6,7 @@ Input is prompt, output is response. Mistral is default.

 import boto3
 import json
+from prometheus_client import Histogram

 from .... schema import TextCompletionRequest, TextCompletionResponse, Error
 from .... schema import text_completion_request_queue
@ -52,6 +53,19 @@ class Processor(ConsumerProducer):
            }
        )

+        if not hasattr(__class__, "text_completion_metric"):
+            __class__.text_completion_metric = Histogram(
+                'text_completion_duration',
+                'Text completion duration (seconds)',
+                buckets=[
+                    0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
+                    8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
+                    30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
+                    120.0
+                ]
+            )
+
        self.model = model
        self.temperature = temperature
        self.max_output = max_output
@ -78,85 +92,90 @@ class Processor(ConsumerProducer):

        prompt = v.prompt

-       # Mistral Input Format
-        if self.model.startswith("mistral"):
-            promptbody = json.dumps({
-                "prompt": prompt,
-                "max_tokens": self.max_output,
-                "temperature": self.temperature,
-                "top_p": 0.99,
-                "top_k": 40
-            })
-
-        # Llama 3.1 Input Format
-        elif self.model.startswith("meta"):
-            promptbody = json.dumps({
-                "prompt": prompt,
-                "max_gen_len": self.max_output,
-                "temperature": self.temperature,
-                "top_p": 0.95,
-            })
-
-        # Anthropic Input Format
-        elif self.model.startswith("anthropic"):
-            promptbody = json.dumps({
-                "anthropic_version": "bedrock-2023-05-31",
-                "max_tokens": self.max_output,
-                "temperature": self.temperature,
-                "top_p": 0.999,
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "text",
-                                "text": prompt
-                            }
-                        ]
-                    }
-                ]
-            })
-
-        # Jamba Input Format
-        elif self.model.startswith("ai21"):
-            promptbody = json.dumps({
-                "max_tokens": self.max_output,
-                "temperature": self.temperature,
-                "top_p": 0.9,
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": prompt
-                    }
-                ]
-            })
-
-        # Cohere Input Format
-        elif self.model.startswith("cohere"):
-            promptbody = json.dumps({
-                "max_tokens": self.max_output,
-                "temperature": self.temperature,
-                "message": prompt
-            })
-
-        # Use Mistral format as defualt
-        else:
-            promptbody = json.dumps({
-                "prompt": prompt,
-                "max_tokens": self.max_output,
-                "temperature": self.temperature,
-                "top_p": 0.99,
-                "top_k": 40
-            })
-
-        accept = 'application/json'
-        contentType = 'application/json'
-
        try:

+            # Mistral Input Format
+            if self.model.startswith("mistral"):
+                promptbody = json.dumps({
+                    "prompt": prompt,
+                    "max_tokens": self.max_output,
+                    "temperature": self.temperature,
+                    "top_p": 0.99,
+                    "top_k": 40
+                })
+
+            # Llama 3.1 Input Format
+            elif self.model.startswith("meta"):
+                promptbody = json.dumps({
+                    "prompt": prompt,
+                    "max_gen_len": self.max_output,
+                    "temperature": self.temperature,
+                    "top_p": 0.95,
+                })
+
+            # Anthropic Input Format
+            elif self.model.startswith("anthropic"):
+                promptbody = json.dumps({
+                    "anthropic_version": "bedrock-2023-05-31",
+                    "max_tokens": self.max_output,
+                    "temperature": self.temperature,
+                    "top_p": 0.999,
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": prompt
+                                }
+                            ]
+                        }
+                    ]
+                })
+
+            # Jamba Input Format
+            elif self.model.startswith("ai21"):
+                promptbody = json.dumps({
+                    "max_tokens": self.max_output,
+                    "temperature": self.temperature,
+                    "top_p": 0.9,
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": prompt
+                        }
+                    ]
+                })
+
+            # Cohere Input Format
+            elif self.model.startswith("cohere"):
+                promptbody = json.dumps({
+                    "max_tokens": self.max_output,
+                    "temperature": self.temperature,
+                    "message": prompt
+                })
+
+            # Use Mistral format as default
+            else:
+                promptbody = json.dumps({
+                    "prompt": prompt,
+                    "max_tokens": self.max_output,
+                    "temperature": self.temperature,
+                    "top_p": 0.99,
+                    "top_k": 40
+                })
+
+            accept = 'application/json'
+            contentType = 'application/json'
+
            # FIXME: Consider catching request limits and raise TooManyRequests
            # See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html
-            response = self.bedrock.invoke_model(body=promptbody, modelId=self.model, accept=accept, contentType=contentType)
+
+            with __class__.text_completion_metric.time():
+                response = self.bedrock.invoke_model(
+                    body=promptbody, modelId=self.model, accept=accept,
+                    contentType=contentType
+                )

            # Mistral Response Structure
            if self.model.startswith("mistral"):
--- a/trustgraph/model/text_completion/claude/llm.py
+++ b/trustgraph/model/text_completion/claude/llm.py
@ -5,6 +5,7 @@ Input is prompt, output is response.
 """

 import anthropic
+from prometheus_client import Histogram

 from .... schema import TextCompletionRequest, TextCompletionResponse, Error
 from .... schema import text_completion_request_queue
@ -47,6 +48,19 @@ class Processor(ConsumerProducer):
            }
        )

+        if not hasattr(__class__, "text_completion_metric"):
+            __class__.text_completion_metric = Histogram(
+                'text_completion_duration',
+                'Text completion duration (seconds)',
+                buckets=[
+                    0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
+                    8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
+                    30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
+                    120.0
+                ]
+            )
+
        self.model = model
        self.claude = anthropic.Anthropic(api_key=api_key)
        self.temperature = temperature
@ -69,23 +83,26 @@ class Processor(ConsumerProducer):
        try:

            # FIXME: Rate limits?
-            response = message = self.claude.messages.create(
-                model=self.model,
-                max_tokens=self.max_output,
-                temperature=self.temperature,
-                system = "You are a helpful chatbot.",
-                messages=[
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "text",
-                                "text": prompt
-                            }
-                        ]
-                    }
-                ]
-            )
+
+            with __class__.text_completion_metric.time():
+
+                response = message = self.claude.messages.create(
+                    model=self.model,
+                    max_tokens=self.max_output,
+                    temperature=self.temperature,
+                    system = "You are a helpful chatbot.",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": prompt
+                                }
+                            ]
+                        }
+                    ]
+                )

            resp = response.content[0].text
            print(resp, flush=True)
--- a/trustgraph/model/text_completion/cohere/llm.py
+++ b/trustgraph/model/text_completion/cohere/llm.py
@ -5,6 +5,7 @@ Input is prompt, output is response.
 """

 import cohere
+from prometheus_client import Histogram

 from .... schema import TextCompletionRequest, TextCompletionResponse, Error
 from .... schema import text_completion_request_queue
@ -44,6 +45,19 @@ class Processor(ConsumerProducer):
            }
        )

+        if not hasattr(__class__, "text_completion_metric"):
+            __class__.text_completion_metric = Histogram(
+                'text_completion_duration',
+                'Text completion duration (seconds)',
+                buckets=[
+                    0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
+                    8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
+                    30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
+                    120.0
+                ]
+            )
+
        self.model = model
        self.temperature = temperature
        self.cohere = cohere.Client(api_key=api_key)
@ -64,15 +78,17 @@ class Processor(ConsumerProducer):

        try:

-            output = self.cohere.chat( 
-                model=self.model,
-                message=prompt,
-                preamble = "You are a helpful AI-assistant.",
-                temperature=self.temperature,
-                chat_history=[],
-                prompt_truncation='auto',
-                connectors=[]
-            )
+            with __class__.text_completion_metric.time():
+
+                output = self.cohere.chat( 
+                    model=self.model,
+                    message=prompt,
+                    preamble = "You are a helpful AI-assistant.",
+                    temperature=self.temperature,
+                    chat_history=[],
+                    prompt_truncation='auto',
+                    connectors=[]
+                )

            resp = output.text
            print(resp, flush=True)
--- a/trustgraph/model/text_completion/ollama/llm.py
+++ b/trustgraph/model/text_completion/ollama/llm.py
@ -5,7 +5,7 @@ Input is prompt, output is response.
 """

 from langchain_community.llms import Ollama
-from prometheus_client import Histogram, Info, Counter
+from prometheus_client import Histogram, Info

 from .... schema import TextCompletionRequest, TextCompletionResponse, Error
 from .... schema import text_completion_request_queue
@ -44,6 +44,19 @@ class Processor(ConsumerProducer):
            }
        )

+        if not hasattr(__class__, "text_completion_metric"):
+            __class__.text_completion_metric = Histogram(
+                'text_completion_duration',
+                'Text completion duration (seconds)',
+                buckets=[
+                    0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
+                    8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
+                    30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
+                    120.0
+                ]
+            )
+
        if not hasattr(__class__, "model_metric"):
            __class__.model_metric = Info(
                'model', 'Model information'
@ -69,7 +82,8 @@ class Processor(ConsumerProducer):

        try:

-            response = self.llm.invoke(prompt)
+            with __class__.text_completion_metric.time():
+                response = self.llm.invoke(prompt)

            print("Send response...", flush=True)

--- a/trustgraph/model/text_completion/openai/llm.py
+++ b/trustgraph/model/text_completion/openai/llm.py
@ -5,6 +5,7 @@ Input is prompt, output is response.
 """

 from openai import OpenAI
+from prometheus_client import Histogram

 from .... schema import TextCompletionRequest, TextCompletionResponse, Error
 from .... schema import text_completion_request_queue
@ -47,6 +48,19 @@ class Processor(ConsumerProducer):
            }
        )

+        if not hasattr(__class__, "text_completion_metric"):
+            __class__.text_completion_metric = Histogram(
+                'text_completion_duration',
+                'Text completion duration (seconds)',
+                buckets=[
+                    0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
+                    8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
+                    30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
+                    120.0
+                ]
+            )
+
        self.model = model
        self.temperature = temperature
        self.max_output = max_output
@ -69,28 +83,31 @@ class Processor(ConsumerProducer):
        try:

            # FIXME: Rate limits
-            resp = self.openai.chat.completions.create(
-                model=self.model,
-                messages=[
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "text",
-                                "text": prompt
-                            }
-                        ]
+
+            with __class__.text_completion_metric.time():
+
+                resp = self.openai.chat.completions.create(
+                    model=self.model,
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": prompt
+                                }
+                            ]
+                        }
+                    ],
+                    temperature=self.temperature,
+                    max_tokens=self.max_output,
+                    top_p=1,
+                    frequency_penalty=0,
+                    presence_penalty=0,
+                    response_format={
+                        "type": "text"
                    }
-                ],
-                temperature=self.temperature,
-                max_tokens=self.max_output,
-                top_p=1,
-                frequency_penalty=0,
-                presence_penalty=0,
-                response_format={
-                    "type": "text"
-                }
-            )
+                )

            print(resp.choices[0].message.content, flush=True)

--- a/trustgraph/model/text_completion/vertexai/llm.py
+++ b/trustgraph/model/text_completion/vertexai/llm.py
@ -6,6 +6,7 @@ Google Cloud.   Input is prompt, output is response.

 import vertexai
 import time
+from prometheus_client import Histogram

 from google.oauth2 import service_account
 import google
@ -61,6 +62,19 @@ class Processor(ConsumerProducer):
            }
        )

+        if not hasattr(__class__, "text_completion_metric"):
+            __class__.text_completion_metric = Histogram(
+                'text_completion_duration',
+                'Text completion duration (seconds)',
+                buckets=[
+                    0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
+                    8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
+                    30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
+                    120.0
+                ]
+            )
+
        self.parameters = {
            "temperature": temperature,
            "top_p": 1.0,
@ -125,10 +139,12 @@ class Processor(ConsumerProducer):

            prompt = v.prompt

-            resp = self.llm.generate_content(
-                prompt, generation_config=self.generation_config,
-                safety_settings=self.safety_settings
-            )
+            with __class__.text_completion_metric.time():
+
+                resp = self.llm.generate_content(
+                    prompt, generation_config=self.generation_config,
+                    safety_settings=self.safety_settings
+                )

            resp = resp.text