diff --git a/trustgraph-flow/scripts/text-completion-lmstudio b/trustgraph-flow/scripts/text-completion-lmstudio new file mode 100755 index 00000000..7b9e259e --- /dev/null +++ b/trustgraph-flow/scripts/text-completion-lmstudio @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from trustgraph.model.text_completion.lmstudio import run + +run() + diff --git a/trustgraph-flow/setup.py b/trustgraph-flow/setup.py index 553564aa..3e8a65fa 100644 --- a/trustgraph-flow/setup.py +++ b/trustgraph-flow/setup.py @@ -108,6 +108,7 @@ setuptools.setup( "scripts/text-completion-cohere", "scripts/text-completion-googleaistudio", "scripts/text-completion-llamafile", + "scripts/text-completion-lmstudio", "scripts/text-completion-mistral", "scripts/text-completion-ollama", "scripts/text-completion-openai", diff --git a/trustgraph-flow/trustgraph/model/text_completion/lmstudio/__init__.py b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/__init__.py new file mode 100644 index 00000000..f2017af8 --- /dev/null +++ b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/__init__.py @@ -0,0 +1,3 @@ + +from . llm import * + diff --git a/trustgraph-flow/trustgraph/model/text_completion/lmstudio/__main__.py b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/__main__.py new file mode 100755 index 00000000..91342d2d --- /dev/null +++ b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/__main__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +from . llm import run + +if __name__ == '__main__': + run() + diff --git a/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py new file mode 100755 index 00000000..16ff2df4 --- /dev/null +++ b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py @@ -0,0 +1,193 @@ + +""" +Simple LLM service, performs text prompt completion using OpenAI. +Input is prompt, output is response. +""" + +from openai import OpenAI +from prometheus_client import Histogram +import os + +from .... schema import TextCompletionRequest, TextCompletionResponse, Error +from .... schema import text_completion_request_queue +from .... schema import text_completion_response_queue +from .... log_level import LogLevel +from .... base import ConsumerProducer +from .... exceptions import TooManyRequests + +module = ".".join(__name__.split(".")[1:-1]) + +default_input_queue = text_completion_request_queue +default_output_queue = text_completion_response_queue +default_subscriber = module +default_model = 'gemma3:9b' +default_url = os.getenv("LMSTUDIO_URL", "http://localhost:1234/") +default_temperature = 0.0 +default_max_output = 4096 + +class Processor(ConsumerProducer): + + def __init__(self, **params): + + input_queue = params.get("input_queue", default_input_queue) + output_queue = params.get("output_queue", default_output_queue) + subscriber = params.get("subscriber", default_subscriber) + model = params.get("model", default_model) + url = params.get("url", default_url) + temperature = params.get("temperature", default_temperature) + max_output = params.get("max_output", default_max_output) + + super(Processor, self).__init__( + **params | { + "input_queue": input_queue, + "output_queue": output_queue, + "subscriber": subscriber, + "input_schema": TextCompletionRequest, + "output_schema": TextCompletionResponse, + "model": model, + "temperature": temperature, + "max_output": max_output, + "url" : url, + } + ) + + if not hasattr(__class__, "text_completion_metric"): + __class__.text_completion_metric = Histogram( + 'text_completion_duration', + 'Text completion duration (seconds)', + buckets=[ + 0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, + 30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0, + 120.0 + ] + ) + + self.model = model + self.url = url + "v1/" + self.temperature = temperature + self.max_output = max_output + self.openai = OpenAI( + base_url=self.url, + api_key = "sk-no-key-required", + ) + + print("Initialised", flush=True) + + async def handle(self, msg): + + v = msg.value() + + # Sender-produced ID + + id = msg.properties()["id"] + + print(f"Handling prompt {id}...", flush=True) + + prompt = v.system + "\n\n" + v.prompt + + try: + + # FIXME: Rate limits + + with __class__.text_completion_metric.time(): + + print(prompt) + + resp = self.openai.chat.completions.create( + model=self.model, + messages=[ + {"role": "user", "content": prompt} + ] + #temperature=self.temperature, + #max_tokens=self.max_output, + #top_p=1, + #frequency_penalty=0, + #presence_penalty=0, + #response_format={ + # "type": "text" + #} + ) + + print(resp) + + inputtokens = resp.usage.prompt_tokens + outputtokens = resp.usage.completion_tokens + + print(resp.choices[0].message.content, flush=True) + print(f"Input Tokens: {inputtokens}", flush=True) + print(f"Output Tokens: {outputtokens}", flush=True) + + print("Send response...", flush=True) + r = TextCompletionResponse( + response=resp.choices[0].message.content, + error=None, + in_token=inputtokens, + out_token=outputtokens, + model=self.model, + ) + await self.send(r, properties={"id": id}) + + print("Done.", flush=True) + + # SLM, presumably there aren't rate limits + + except Exception as e: + + print(f"Exception: {e}") + + print("Send error response...", flush=True) + + r = TextCompletionResponse( + error=Error( + type = "llm-error", + message = str(e), + ), + response=None, + in_token=None, + out_token=None, + model=None, + ) + + await self.send(r, properties={"id": id}) + + self.consumer.acknowledge(msg) + + @staticmethod + def add_args(parser): + + ConsumerProducer.add_args( + parser, default_input_queue, default_subscriber, + default_output_queue, + ) + + parser.add_argument( + '-m', '--model', + default=default_model, + help=f'LLM model (default: gemma3:9b)' + ) + + parser.add_argument( + '-u', '--url', + default=default_url, + help=f'LMStudio URL (default: {default_url})' + ) + + parser.add_argument( + '-t', '--temperature', + type=float, + default=default_temperature, + help=f'LLM temperature parameter (default: {default_temperature})' + ) + + parser.add_argument( + '-x', '--max-output', + type=int, + default=default_max_output, + help=f'LLM max output tokens (default: {default_max_output})' + ) + +def run(): + Processor.launch(module, __doc__) +