trustgraph/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py


"""
Simple LLM service, performs text prompt completion using OpenAI.
Input is prompt, output is response.
"""

from openai import OpenAI
from prometheus_client import Histogram
import os

from .... schema import TextCompletionRequest, TextCompletionResponse, Error
from .... schema import text_completion_request_queue
from .... schema import text_completion_response_queue
from .... log_level import LogLevel
from .... base import ConsumerProducer
from .... exceptions import TooManyRequests

# Dotted package path derived from __name__ with its first and last
# components dropped; used as the default subscriber name below and passed
# to Processor.start() by run().
module = ".".join(__name__.split(".")[1:-1])

# Defaults applied when the matching constructor parameter / CLI option is
# not supplied.
default_input_queue = text_completion_request_queue
default_output_queue = text_completion_response_queue
default_subscriber = module
default_model = 'gpt-3.5-turbo'
default_temperature = 0.0
default_max_output = 4096
# Read once, at import time; None when OPENAI_KEY is not set.
default_api_key = os.getenv("OPENAI_KEY")

class Processor(ConsumerProducer):
    """
    Text-completion processor backed by the OpenAI chat completions API.

    Each incoming TextCompletionRequest prompt is sent to OpenAI as a single
    user message.  The outcome is published as a TextCompletionResponse
    carrying either the completion text with token counts, or an Error.
    """

    def __init__(self, **params):
        """
        Configure queues, model settings and the OpenAI client.

        Keys read from ``params`` (each falls back to the module-level
        default when absent): ``input_queue``, ``output_queue``,
        ``subscriber``, ``model``, ``api_key``, ``temperature`` and
        ``max_output``.  Everything in ``params`` is also forwarded to the
        base-class constructor.

        Raises:
            RuntimeError: if no API key was supplied and OPENAI_KEY was not
                set in the environment when this module was imported.
        """

        input_queue = params.get("input_queue", default_input_queue)
        output_queue = params.get("output_queue", default_output_queue)
        subscriber = params.get("subscriber", default_subscriber)
        model = params.get("model", default_model)
        temperature = params.get("temperature", default_temperature)
        max_output = params.get("max_output", default_max_output)

        # An "api_key" entry can be present but None (argparse produces that
        # for an option given no value and no default), so a plain
        # params.get(key, default) would skip the environment fallback.
        api_key = params.get("api_key")
        if api_key is None:
            api_key = default_api_key

        if api_key is None:
            raise RuntimeError("OpenAI API key not specified")

        super(Processor, self).__init__(
            **(params | {
                "input_queue": input_queue,
                "output_queue": output_queue,
                "subscriber": subscriber,
                "input_schema": TextCompletionRequest,
                "output_schema": TextCompletionResponse,
                "model": model,
                "temperature": temperature,
                "max_output": max_output,
            })
        )

        # Create the histogram only once per class, however many Processor
        # instances are constructed (presumably to avoid registering the
        # same metric name twice -- confirm against prometheus_client).
        if not hasattr(__class__, "text_completion_metric"):
            __class__.text_completion_metric = Histogram(
                'text_completion_duration',
                'Text completion duration (seconds)',
                buckets=[
                    0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                    8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
                    30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
                    120.0
                ]
            )

        self.model = model
        self.temperature = temperature
        self.max_output = max_output
        self.openai = OpenAI(api_key=api_key)

        print("Initialised", flush=True)

    def handle(self, msg):
        """
        Run one prompt through OpenAI and publish the result.

        ``msg`` must expose ``value()`` (an object with a ``prompt``
        attribute) and ``properties()`` (a mapping containing the sender's
        ``"id"``).  The id is echoed back in the properties of whatever
        response is sent.

        On success a TextCompletionResponse with the completion text, token
        counts and model name is sent.  A rate-limit failure produces an
        Error of type "rate-limit"; any other exception produces an Error of
        type "llm-error".  In both failure cases the message is acknowledged
        here after the error response is sent.  Nothing is acknowledged here
        on success (presumably the caller does that -- confirm against
        ConsumerProducer).
        """

        # The openai package is already a dependency of this module (the
        # client class comes from it); the error type is imported at its
        # point of use.
        from openai import RateLimitError

        v = msg.value()

        # Sender-produced ID
        request_id = msg.properties()["id"]

        print(f"Handling prompt {request_id}...", flush=True)

        prompt = v.prompt

        try:

            # Only the API call itself is timed.
            with __class__.text_completion_metric.time():

                resp = self.openai.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": prompt
                                }
                            ]
                        }
                    ],
                    temperature=self.temperature,
                    max_tokens=self.max_output,
                    top_p=1,
                    frequency_penalty=0,
                    presence_penalty=0,
                    response_format={
                        "type": "text"
                    }
                )

            inputtokens = resp.usage.prompt_tokens
            outputtokens = resp.usage.completion_tokens
            print(resp.choices[0].message.content, flush=True)
            print(f"Input Tokens: {inputtokens}", flush=True)
            print(f"Output Tokens: {outputtokens}", flush=True)

            print("Send response...", flush=True)
            r = TextCompletionResponse(
                response=resp.choices[0].message.content,
                error=None,
                in_token=inputtokens,
                out_token=outputtokens,
                model=self.model
            )
            self.send(r, properties={"id": request_id})

            print("Done.", flush=True)

        # RateLimitError is what the OpenAI client raises for HTTP 429;
        # TooManyRequests is still caught in case it is raised from within
        # the try body by project code.
        except (TooManyRequests, RateLimitError) as e:

            print("Send rate limit response...", flush=True)

            self._send_error_response(msg, request_id, "rate-limit", e)

        except Exception as e:

            print(f"Exception: {e}")

            print("Send error response...", flush=True)

            self._send_error_response(msg, request_id, "llm-error", e)

    def _send_error_response(self, msg, request_id, error_type, exc):
        """
        Publish an error-only TextCompletionResponse, then acknowledge msg.

        ``error_type`` becomes Error.type and ``str(exc)`` becomes
        Error.message; all other response fields are None.
        """

        r = TextCompletionResponse(
            error=Error(
                type=error_type,
                message=str(exc),
            ),
            response=None,
            in_token=None,
            out_token=None,
            model=None,
        )

        self.producer.send(r, properties={"id": request_id})

        self.consumer.acknowledge(msg)

    @staticmethod
    def add_args(parser):
        """
        Register this processor's command-line options on ``parser``.

        Adds the base ConsumerProducer options (with this module's default
        queues and subscriber) followed by --model, --api-key, --temperature
        and --max-output, each defaulting to the matching module-level
        default.
        """

        ConsumerProducer.add_args(
            parser, default_input_queue, default_subscriber,
            default_output_queue,
        )

        parser.add_argument(
            '-m', '--model',
            default=default_model,
            help=f'LLM model (default: {default_model})'
        )

        # Defaulting to the environment-derived key means omitting -k does
        # not yield an explicit None for api_key.
        parser.add_argument(
            '-k', '--api-key',
            default=default_api_key,
            help='OpenAI API key'
        )

        parser.add_argument(
            '-t', '--temperature',
            type=float,
            default=default_temperature,
            help=f'LLM temperature parameter (default: {default_temperature})'
        )

        parser.add_argument(
            '-x', '--max-output',
            type=int,
            default=default_max_output,
            help=f'LLM max output tokens (default: {default_max_output})'
        )

def run():
    """
    Entry point: hand the module name and the module docstring to
    Processor.start().
    """

    # __doc__ here resolves to the module-level docstring, not this
    # function's own.
    description = __doc__

    Processor.start(module, description)
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00
			`"""`
			`Simple LLM service, performs text prompt completion using OpenAI.`
			`Input is prompt, output is response.`
			`"""`

			`from openai import OpenAI`
Update LLM text-completion duration metric (#40) * Added LLM duration metric, better buckets * Added heatmap to dashboard to replace 95/97/99 chart * Bump version 2024-08-26 11:46:36 +01:00			`from prometheus_client import Histogram`
Feature/environment var creds (#116) - Change templates to interpolate environment variables in docker compose - Change templates to invoke secrets for environment variable credentials in K8s configuration - Update LLMs to pull in credentials from environment variables if not specified 2024-10-15 00:34:52 +01:00			`import os`
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00			`from .... schema import TextCompletionRequest, TextCompletionResponse, Error`
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00			`from .... schema import text_completion_request_queue`
			`from .... schema import text_completion_response_queue`
			`from .... log_level import LogLevel`
			`from .... base import ConsumerProducer`
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00			`from .... exceptions import TooManyRequests`
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00
			`module = ".".join(__name__.split(".")[1:-1])`

			`default_input_queue = text_completion_request_queue`
			`default_output_queue = text_completion_response_queue`
			`default_subscriber = module`
			`default_model = 'gpt-3.5-turbo'`
Parameters, Parsing, renaming YAMLs and Neo4j YAMLS (#15) * Added some params * Parameter updates * Fixed Neo4j issue 2024-08-21 16:03:56 -07:00			`default_temperature = 0.0`
			`default_max_output = 4096`
Feature/environment var creds (#116) - Change templates to interpolate environment variables in docker compose - Change templates to invoke secrets for environment variable credentials in K8s configuration - Update LLMs to pull in credentials from environment variables if not specified 2024-10-15 00:34:52 +01:00			`default_api_key = os.getenv("OPENAI_KEY")`
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00
			`class Processor(ConsumerProducer):`

			`def __init__(self, **params):`

			`input_queue = params.get("input_queue", default_input_queue)`
			`output_queue = params.get("output_queue", default_output_queue)`
			`subscriber = params.get("subscriber", default_subscriber)`
			`model = params.get("model", default_model)`
Feature/environment var creds (#116) - Change templates to interpolate environment variables in docker compose - Change templates to invoke secrets for environment variable credentials in K8s configuration - Update LLMs to pull in credentials from environment variables if not specified 2024-10-15 00:34:52 +01:00			`api_key = params.get("api_key", default_api_key)`
Parameters, Parsing, renaming YAMLs and Neo4j YAMLS (#15) * Added some params * Parameter updates * Fixed Neo4j issue 2024-08-21 16:03:56 -07:00			`temperature = params.get("temperature", default_temperature)`
			`max_output = params.get("max_output", default_max_output)`
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00
Feature/environment var creds (#116) - Change templates to interpolate environment variables in docker compose - Change templates to invoke secrets for environment variable credentials in K8s configuration - Update LLMs to pull in credentials from environment variables if not specified 2024-10-15 00:34:52 +01:00			`if api_key is None:`
			`raise RuntimeError("OpenAI API key not specified")`

OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00			`super(Processor, self).__init__(`
			`**params \| {`
			`"input_queue": input_queue,`
			`"output_queue": output_queue,`
			`"subscriber": subscriber,`
			`"input_schema": TextCompletionRequest,`
			`"output_schema": TextCompletionResponse,`
			`"model": model,`
Parameters, Parsing, renaming YAMLs and Neo4j YAMLS (#15) * Added some params * Parameter updates * Fixed Neo4j issue 2024-08-21 16:03:56 -07:00			`"temperature": temperature,`
			`"max_output": max_output,`
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00			`}`
			`)`

Update LLM text-completion duration metric (#40) * Added LLM duration metric, better buckets * Added heatmap to dashboard to replace 95/97/99 chart * Bump version 2024-08-26 11:46:36 +01:00			`if not hasattr(__class__, "text_completion_metric"):`
			`__class__.text_completion_metric = Histogram(`
			`'text_completion_duration',`
			`'Text completion duration (seconds)',`
			`buckets=[`
			`0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,`
			`8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,`
			`17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,`
			`30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,`
			`120.0`
			`]`
			`)`

OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00			`self.model = model`
Parameters, Parsing, renaming YAMLs and Neo4j YAMLS (#15) * Added some params * Parameter updates * Fixed Neo4j issue 2024-08-21 16:03:56 -07:00			`self.temperature = temperature`
			`self.max_output = max_output`
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00			`self.openai = OpenAI(api_key=api_key)`

			`print("Initialised", flush=True)`

			`def handle(self, msg):`

			`v = msg.value()`

			`# Sender-produced ID`

			`id = msg.properties()["id"]`

			`print(f"Handling prompt {id}...", flush=True)`

			`prompt = v.prompt`
Rate limit handling (#11) * Added a rate limit exception * Reduce request/response timeouts because looks like there are major issues * Add rate limit exception catch to all consumers * Version to 0.6.3 2024-08-19 22:15:32 +01:00
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00			`try:`

			`# FIXME: Rate limits`
Update LLM text-completion duration metric (#40) * Added LLM duration metric, better buckets * Added heatmap to dashboard to replace 95/97/99 chart * Bump version 2024-08-26 11:46:36 +01:00
			`with __class__.text_completion_metric.time():`

			`resp = self.openai.chat.completions.create(`
			`model=self.model,`
			`messages=[`
			`{`
			`"role": "user",`
			`"content": [`
			`{`
			`"type": "text",`
			`"text": prompt`
			`}`
			`]`
			`}`
			`],`
			`temperature=self.temperature,`
			`max_tokens=self.max_output,`
			`top_p=1,`
			`frequency_penalty=0,`
			`presence_penalty=0,`
			`response_format={`
			`"type": "text"`
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00			`}`
Update LLM text-completion duration metric (#40) * Added LLM duration metric, better buckets * Added heatmap to dashboard to replace 95/97/99 chart * Bump version 2024-08-26 11:46:36 +01:00			`)`
Features/metering all llms (#70) * Added Anthropic support and None logic * Added Cohere API support * Added support for Llamafile and OpenAI * Added support for VertexAI * Added AzureAI support 2024-09-29 10:11:48 -07:00
			`inputtokens = resp.usage.prompt_tokens`
			`outputtokens = resp.usage.completion_tokens`
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00			`print(resp.choices[0].message.content, flush=True)`
Features/metering all llms (#70) * Added Anthropic support and None logic * Added Cohere API support * Added support for Llamafile and OpenAI * Added support for VertexAI * Added AzureAI support 2024-09-29 10:11:48 -07:00			`print(f"Input Tokens: {inputtokens}", flush=True)`
			`print(f"Output Tokens: {outputtokens}", flush=True)`
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00
			`print("Send response...", flush=True)`
Fix OpenAI reporting (#32) * Fix OpenAI reporting * bump version 2024-08-23 14:02:06 +01:00			`r = TextCompletionResponse(`
			`response=resp.choices[0].message.content,`
			`error=None,`
Features/metering all llms (#70) * Added Anthropic support and None logic * Added Cohere API support * Added support for Llamafile and OpenAI * Added support for VertexAI * Added AzureAI support 2024-09-29 10:11:48 -07:00			`in_token=inputtokens,`
			`out_token=outputtokens,`
			`model=self.model`
Fix OpenAI reporting (#32) * Fix OpenAI reporting * bump version 2024-08-23 14:02:06 +01:00			`)`
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00			`self.send(r, properties={"id": id})`

			`print("Done.", flush=True)`

			`# FIXME: Wrong exception, don't know what this LLM throws`
			`# for a rate limit`
			`except TooManyRequests:`

			`print("Send rate limit response...", flush=True)`

			`r = TextCompletionResponse(`
			`error=Error(`
			`type = "rate-limit",`
			`message = str(e),`
			`),`
			`response=None,`
Features/metering all llms (#70) * Added Anthropic support and None logic * Added Cohere API support * Added support for Llamafile and OpenAI * Added support for VertexAI * Added AzureAI support 2024-09-29 10:11:48 -07:00			`in_token=None,`
			`out_token=None,`
			`model=None,`
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00			`)`

			`self.producer.send(r, properties={"id": id})`

			`self.consumer.acknowledge(msg)`

			`except Exception as e:`

			`print(f"Exception: {e}")`

			`print("Send error response...", flush=True)`
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00			`r = TextCompletionResponse(`
			`error=Error(`
			`type = "llm-error",`
			`message = str(e),`
			`),`
			`response=None,`
Features/metering all llms (#70) * Added Anthropic support and None logic * Added Cohere API support * Added support for Llamafile and OpenAI * Added support for VertexAI * Added AzureAI support 2024-09-29 10:11:48 -07:00			`in_token=None,`
			`out_token=None,`
			`model=None,`
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00			`)`
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00			`self.producer.send(r, properties={"id": id})`
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00
Improve request/response handling (#18) * Request/response error handling with common client * Fixup error handling change 2024-08-22 17:02:18 +01:00			`self.consumer.acknowledge(msg)`
OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00
			`@staticmethod`
			`def add_args(parser):`

			`ConsumerProducer.add_args(`
			`parser, default_input_queue, default_subscriber,`
			`default_output_queue,`
			`)`

			`parser.add_argument(`
			`'-m', '--model',`
			`default="gpt-3.5-turbo",`
			`help=f'LLM model (default: GPT-3.5-Turbo)'`
			`)`

			`parser.add_argument(`
			`'-k', '--api-key',`
			`help=f'OpenAI API key'`
			`)`

Parameters, Parsing, renaming YAMLs and Neo4j YAMLS (#15) * Added some params * Parameter updates * Fixed Neo4j issue 2024-08-21 16:03:56 -07:00			`parser.add_argument(`
			`'-t', '--temperature',`
			`type=float,`
			`default=default_temperature,`
			`help=f'LLM temperature parameter (default: {default_temperature})'`
			`)`

			`parser.add_argument(`
			`'-x', '--max-output',`
			`type=int,`
			`default=default_max_output,`
			`help=f'LLM max output tokens (default: {default_max_output})'`
			`)`

OpenAI integration (#7) * Preliminary OpenAI support * Version to 0.5.9 --------- Co-authored-by: JackColquitt <daniel@kalntera.ai> 2024-08-12 15:37:04 +01:00			`def run():`

			`Processor.start(module, __doc__)`