Add LM Studio text-completion templates and an OpenAI base-URL option.

Summary of what this patch changes:
  * templates/components.jsonnet: registers "lmstudio" and "lmstudio-rag".
  * templates/components/lmstudio{,-rag}.jsonnet: new components that run the
    "text-completion-lmstudio" command (the -rag variant also passes the
    text-completion-rag request/response queues with -i / -o).
  * templates/components/{mistral-ocr,ocr}.jsonnet: drop the trailing
    "+ prompts"; ocr.jsonnet also drops envSecrets from engine.resources.
  * mistral/llm.py: stop importing RateLimitError and comment out its handler.
  * openai/llm.py: accept a base URL (OPENAI_BASE_URL env var or -u/--url) and
    pass it to the OpenAI client.

Review notes:
  * The -u/--url option now sets dest='base_url'.  Without it argparse stores
    the value under "url", while the constructor reads params["base_url"], so
    the command-line flag would have no effect.  The last openai/llm.py hunk
    header is adjusted (+183,12 -> +183,14) for the two extra lines.
  * NOTE(review): leading whitespace inside the hunks was reconstructed from a
    copy of this patch whose line breaks and indentation had been collapsed.
    Confirm context-line indentation against the target files before applying
    (or apply with whitespace-tolerant options).

diff --git a/templates/components.jsonnet b/templates/components.jsonnet
index d0df569f..e94374ac 100644
--- a/templates/components.jsonnet
+++ b/templates/components.jsonnet
@@ -11,6 +11,7 @@
     "claude": import "components/claude.jsonnet",
     "cohere": import "components/cohere.jsonnet",
     "googleaistudio": import "components/googleaistudio.jsonnet",
+    "lmstudio": import "components/lmstudio.jsonnet",
     "mistral": import "components/mistral.jsonnet",
     "ollama": import "components/ollama.jsonnet",
     "openai": import "components/openai.jsonnet",
@@ -23,6 +24,7 @@
     "claude-rag": import "components/claude-rag.jsonnet",
     "cohere-rag": import "components/cohere-rag.jsonnet",
     "googleaistudio-rag": import "components/googleaistudio-rag.jsonnet",
+    "lmstudio-rag": import "components/lmstudio-rag.jsonnet",
     "mistral-rag": import "components/mistral-rag.jsonnet",
     "ollama-rag": import "components/ollama-rag.jsonnet",
     "openai-rag": import "components/openai-rag.jsonnet",
diff --git a/templates/components/lmstudio-rag.jsonnet b/templates/components/lmstudio-rag.jsonnet
new file mode 100644
index 00000000..70a94087
--- /dev/null
+++ b/templates/components/lmstudio-rag.jsonnet
@@ -0,0 +1,63 @@
+local base = import "base/base.jsonnet";
+local images = import "values/images.jsonnet";
+local url = import "values/url.jsonnet";
+local prompts = import "prompts/mixtral.jsonnet";
+
+{
+
+    with:: function(key, value)
+        self + {
+            ["lmstudio-rag-" + key]:: value,
+        },
+
+    "lmstudio-rag-max-output-tokens":: 4096,
+    "lmstudio-rag-temperature":: 0.0,
+    "lmstudio-rag-model":: "GPT-3.5-Turbo",
+
+    "text-completion-rag" +: {
+
+        create:: function(engine)
+
+            local envSecrets = engine.envSecrets("lmstudio-credentials")
+                .with_env_var("LMSTUDIO_URL", "lmstudio-url");
+
+            local containerRag =
+                engine.container("text-completion-rag")
+                    .with_image(images.trustgraph_flow)
+                    .with_command([
+                        "text-completion-lmstudio",
+                        "-p",
+                        url.pulsar,
+                        "-x",
+                        std.toString($["lmstudio-rag-max-output-tokens"]),
+                        "-t",
+                        "%0.3f" % $["lmstudio-rag-temperature"],
+                        "-m",
+                        $["lmstudio-rag-model"],
+                        "-i",
+                        "non-persistent://tg/request/text-completion-rag",
+                        "-o",
+                        "non-persistent://tg/response/text-completion-rag",
+                    ])
+                    .with_env_var_secrets(envSecrets)
+                    .with_limits("0.5", "128M")
+                    .with_reservations("0.1", "128M");
+
+            local containerSetRag = engine.containers(
+                "text-completion-rag", [ containerRag ]
+            );
+
+            local serviceRag =
+                engine.internalService(containerSetRag)
+                .with_port(8080, 8080, "metrics");
+
+            engine.resources([
+                envSecrets,
+                containerSetRag,
+                serviceRag,
+            ])
+
+    },
+
+} + prompts
+
diff --git a/templates/components/lmstudio.jsonnet b/templates/components/lmstudio.jsonnet
new file mode 100644
index 00000000..4fe1da58
--- /dev/null
+++ b/templates/components/lmstudio.jsonnet
@@ -0,0 +1,59 @@
+local base = import "base/base.jsonnet";
+local images = import "values/images.jsonnet";
+local url = import "values/url.jsonnet";
+local prompts = import "prompts/mixtral.jsonnet";
+
+{
+
+    with:: function(key, value)
+        self + {
+            ["lmstudio-" + key]:: value,
+        },
+
+    "lmstudio-max-output-tokens":: 4096,
+    "lmstudio-temperature":: 0.0,
+    "lmstudio-model":: "GPT-3.5-Turbo",
+
+    "text-completion" +: {
+
+        create:: function(engine)
+
+            local envSecrets = engine.envSecrets("lmstudio-credentials")
+                .with_env_var("LMSTUDIO_URL", "lmstudio-url");
+
+            local container =
+                engine.container("text-completion")
+                    .with_image(images.trustgraph_flow)
+                    .with_command([
+                        "text-completion-lmstudio",
+                        "-p",
+                        url.pulsar,
+                        "-x",
+                        std.toString($["lmstudio-max-output-tokens"]),
+                        "-t",
+                        "%0.3f" % $["lmstudio-temperature"],
+                        "-m",
+                        $["lmstudio-model"],
+                    ])
+                    .with_env_var_secrets(envSecrets)
+                    .with_limits("0.5", "128M")
+                    .with_reservations("0.1", "128M");
+
+            local containerSet = engine.containers(
+                "text-completion", [ container ]
+            );
+
+            local service =
+                engine.internalService(containerSet)
+                .with_port(8080, 8080, "metrics");
+
+            engine.resources([
+                envSecrets,
+                containerSet,
+                service,
+            ])
+
+    },
+
+} + prompts
+
diff --git a/templates/components/mistral-ocr.jsonnet b/templates/components/mistral-ocr.jsonnet
index 8049c514..a70addd5 100644
--- a/templates/components/mistral-ocr.jsonnet
+++ b/templates/components/mistral-ocr.jsonnet
@@ -43,5 +43,5 @@ local url = import "values/url.jsonnet";
 
     },
 
-} + prompts
+}
 
diff --git a/templates/components/ocr.jsonnet b/templates/components/ocr.jsonnet
index 4353b7f9..cdd49583 100644
--- a/templates/components/ocr.jsonnet
+++ b/templates/components/ocr.jsonnet
@@ -27,12 +27,11 @@ local url = import "values/url.jsonnet";
                 .with_port(8080, 8080, "metrics");
 
             engine.resources([
-                envSecrets,
                 containerSet,
                 service,
             ])
 
     },
 
-} + prompts
+}
 
diff --git a/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py b/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
index 8130cf8a..45f1311c 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
@@ -4,7 +4,7 @@ Simple LLM service, performs text prompt completion using Mistral.
 Input is prompt, output is response.
 """
 
-from mistralai import Mistral, RateLimitError
+from mistralai import Mistral
 from prometheus_client import Histogram
 import os
 
@@ -130,12 +130,18 @@ class Processor(ConsumerProducer):
 
             print("Done.", flush=True)
 
-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except Mistral.RateLimitError:
+        # FIXME: Wrong exception.  The MistralAI library has retry logic
+        # so retry-able errors are retried transparently.  It means we
+        # don't get rate limit events.
 
-            # Leave rate limit retries to the base handler
-            raise TooManyRequests()
+        # We could choose to turn off retry and handle all that here
+        # or subclass BackoffStrategy to keep the retry logic, but
+        # get the events out.
+
+#        except Mistral.RateLimitError:
+
+#            # Leave rate limit retries to the base handler
+#            raise TooManyRequests()
 
         except Exception as e:
 
diff --git a/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py b/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py
index ebfae9ed..590c2e3f 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/openai/llm.py
@@ -24,6 +24,7 @@ default_model = 'gpt-3.5-turbo'
 default_temperature = 0.0
 default_max_output = 4096
 default_api_key = os.getenv("OPENAI_TOKEN")
+default_base_url = os.getenv("OPENAI_BASE_URL", None)
 
 class Processor(ConsumerProducer):
 
@@ -34,6 +35,7 @@ class Processor(ConsumerProducer):
         subscriber = params.get("subscriber", default_subscriber)
         model = params.get("model", default_model)
         api_key = params.get("api_key", default_api_key)
+        base_url = params.get("base_url", default_base_url)
         temperature = params.get("temperature", default_temperature)
         max_output = params.get("max_output", default_max_output)
 
@@ -50,6 +52,7 @@ class Processor(ConsumerProducer):
                 "model": model,
                 "temperature": temperature,
                 "max_output": max_output,
+                "base_url": base_url,
             }
         )
 
@@ -69,7 +72,7 @@ class Processor(ConsumerProducer):
         self.model = model
         self.temperature = temperature
         self.max_output = max_output
-        self.openai = OpenAI(api_key=api_key)
+        self.openai = OpenAI(base_url=base_url, api_key=api_key)
 
         print("Initialised", flush=True)
 
@@ -132,7 +135,7 @@ class Processor(ConsumerProducer):
 
         # FIXME: Wrong exception, don't know what this LLM throws
         # for a rate limit
-        except openai.RateLimitError:
+        except RateLimitError:
 
             # Leave rate limit retries to the base handler
             raise TooManyRequests()
@@ -180,6 +183,14 @@ class Processor(ConsumerProducer):
             help=f'OpenAI API key'
         )
 
+        parser.add_argument(
+            '-u', '--url',
+            # Store under 'base_url' so it matches params.get("base_url", ...)
+            dest='base_url',
+            default=default_base_url,
+            help=f'OpenAI service base URL'
+        )
+
         parser.add_argument(
             '-t', '--temperature',
             type=float,