trustgraph/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py
cybermaggedon 310a2deb06
Feature/streaming llm phase 1 (#566)
* Tidy up duplicate tech specs in doc directory

* Streaming LLM text-completion service tech spec.

* text-completion and prompt interfaces

* streaming change applied to all LLMs, so far tested with VertexAI

* Skip Pinecone unit tests, upstream module issue is affecting things, tests are passing again

* Added agent streaming, not working and has broken tests
2025-11-26 09:59:10 +00:00

193 lines
5.7 KiB
Python
Executable file

"""
Simple LLM service, performs text prompt completion using OpenAI.
Input is prompt, output is response.
"""
from openai import OpenAI
import os
import logging
# Module logger
logger = logging.getLogger(__name__)
from .... exceptions import TooManyRequests
from .... base import LlmService, LlmResult, LlmChunk
default_ident = "text-completion"
default_model = 'gemma3:9b'
default_url = os.getenv("LMSTUDIO_URL", "http://localhost:1234/")
default_temperature = 0.0
default_max_output = 4096
class Processor(LlmService):
def __init__(self, **params):
model = params.get("model", default_model)
url = params.get("url", default_url)
temperature = params.get("temperature", default_temperature)
max_output = params.get("max_output", default_max_output)
super(Processor, self).__init__(
**params | {
"model": model,
"temperature": temperature,
"max_output": max_output,
"url" : url,
}
)
self.default_model = model
self.url = url + "v1/"
self.temperature = temperature
self.max_output = max_output
self.openai = OpenAI(
base_url=self.url,
api_key = "sk-no-key-required",
)
logger.info("LMStudio LLM service initialized")
async def generate_content(self, system, prompt, model=None, temperature=None):
# Use provided model or fall back to default
model_name = model or self.default_model
# Use provided temperature or fall back to default
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model: {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
prompt = system + "\n\n" + prompt
try:
logger.debug(f"Prompt: {prompt}")
resp = self.openai.chat.completions.create(
model=model_name,
messages=[
{"role": "user", "content": prompt}
],
temperature=effective_temperature,
max_tokens=self.max_output,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
response_format={
"type": "text"
}
)
logger.debug(f"Full response: {resp}")
inputtokens = resp.usage.prompt_tokens
outputtokens = resp.usage.completion_tokens
logger.debug(f"LLM response: {resp.choices[0].message.content}")
logger.info(f"Input Tokens: {inputtokens}")
logger.info(f"Output Tokens: {outputtokens}")
resp = LlmResult(
text = resp.choices[0].message.content,
in_token = inputtokens,
out_token = outputtokens,
model = model_name
)
return resp
# SLM, presumably there aren't rate limits
except Exception as e:
logger.error(f"LMStudio LLM exception ({type(e).__name__}): {e}", exc_info=True)
raise e
def supports_streaming(self):
"""LM Studio supports streaming"""
return True
async def generate_content_stream(self, system, prompt, model=None, temperature=None):
"""Stream content generation from LM Studio"""
model_name = model or self.default_model
effective_temperature = temperature if temperature is not None else self.temperature
logger.debug(f"Using model (streaming): {model_name}")
logger.debug(f"Using temperature: {effective_temperature}")
prompt = system + "\n\n" + prompt
try:
response = self.openai.chat.completions.create(
model=model_name,
messages=[{"role": "user", "content": prompt}],
temperature=effective_temperature,
max_tokens=self.max_output,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
response_format={"type": "text"},
stream=True
)
for chunk in response:
if chunk.choices and chunk.choices[0].delta.content:
yield LlmChunk(
text=chunk.choices[0].delta.content,
in_token=None,
out_token=None,
model=model_name,
is_final=False
)
yield LlmChunk(
text="",
in_token=None,
out_token=None,
model=model_name,
is_final=True
)
logger.debug("Streaming complete")
except Exception as e:
logger.error(f"LMStudio streaming exception ({type(e).__name__}): {e}", exc_info=True)
raise e
@staticmethod
def add_args(parser):
LlmService.add_args(parser)
parser.add_argument(
'-m', '--model',
default=default_model,
help=f'LLM model (default: gemma3:9b)'
)
parser.add_argument(
'-u', '--url',
default=default_url,
help=f'LMStudio URL (default: {default_url})'
)
parser.add_argument(
'-t', '--temperature',
type=float,
default=default_temperature,
help=f'LLM temperature parameter (default: {default_temperature})'
)
parser.add_argument(
'-x', '--max-output',
type=int,
default=default_max_output,
help=f'LLM max output tokens (default: {default_max_output})'
)
def run():
Processor.launch(default_ident, __doc__)