diff --git a/trustgraph-base/trustgraph/base/llm_service.py b/trustgraph-base/trustgraph/base/llm_service.py
index 9fb31a66..8eb6751c 100644
--- a/trustgraph-base/trustgraph/base/llm_service.py
+++ b/trustgraph-base/trustgraph/base/llm_service.py
@@ -5,7 +5,7 @@ LLM text completion base class
 import time
 import logging
 
-from prometheus_client import Histogram
+from prometheus_client import Histogram, Info
 
 from .. schema import TextCompletionRequest, TextCompletionResponse, Error
 from .. exceptions import TooManyRequests
@@ -62,6 +62,12 @@ class LlmService(FlowProcessor):
             )
         )
 
+        self.register_specification(
+            ParameterSpec(
+                name = "temperature",
+            )
+        )
+
         if not hasattr(__class__, "text_completion_metric"):
             __class__.text_completion_metric = Histogram(
                 'text_completion_duration',
                 'Text completion duration',
                 [
@@ -76,6 +82,13 @@ class LlmService(FlowProcessor):
                 ]
             )
 
+        if not hasattr(__class__, "text_completion_model_metric"):
+            __class__.text_completion_model_metric = Info(
+                'text_completion_model',
+                'Text completion model',
+                ["id", "flow"]
+            )
+
     async def on_request(self, msg, consumer, flow):
 
         try:
@@ -92,11 +105,19 @@ class LlmService(FlowProcessor):
             ).time():
 
                 model = flow("model")
+                temperature = flow("temperature")
 
                 response = await self.generate_content(
-                    request.system, request.prompt, model
+                    request.system, request.prompt, model, temperature
                 )
 
+                __class__.text_completion_model_metric.labels(
+                    id = flow.id, flow = flow.name
+                ).info({
+                    "model": str(model),
+                    "temperature": str(temperature),
+                })
+
                 await flow("response").send(
                     TextCompletionResponse(
                         error=None,
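
Note on the metric: prometheus_client's Info exposes a synchronous info() call whose
keys and values must all be strings, and the label names used at call time must match
the names declared on the metric. In isolation the pattern looks like this (the label
values are placeholders, not from the codebase):

    from prometheus_client import Info

    model_info = Info(
        'text_completion_model', 'Text completion model',
        ["id", "flow"]
    )

    # info() replaces any previous value for this label set; it is a
    # plain synchronous call, and every key and value must be a string
    model_info.labels(id="llm-1", flow="default").info({
        "model": "gemma2:9b",
        "temperature": "0.0",
    })
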
diff --git a/trustgraph-bedrock/trustgraph/model/text_completion/bedrock/llm.py b/trustgraph-bedrock/trustgraph/model/text_completion/bedrock/llm.py
index 25584780..dbe6f54c 100755
--- a/trustgraph-bedrock/trustgraph/model/text_completion/bedrock/llm.py
+++ b/trustgraph-bedrock/trustgraph/model/text_completion/bedrock/llm.py
@@ -231,28 +231,37 @@ class Processor(LlmService):
 
         return Default
 
-    def _get_or_create_variant(self, model_name):
+    def _get_or_create_variant(self, model_name, temperature=None):
         """Get cached model variant or create new one"""
-        if model_name not in self.model_variants:
-            logger.info(f"Creating model variant for '{model_name}'")
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        # Create a cache key that includes temperature to avoid conflicts
+        cache_key = f"{model_name}:{effective_temperature}"
+
+        if cache_key not in self.model_variants:
+            logger.info(f"Creating model variant for '{model_name}' with temperature {effective_temperature}")
             variant_class = self.determine_variant(model_name)
             variant = variant_class()
-            variant.set_temperature(self.temperature)
+            variant.set_temperature(effective_temperature)
             variant.set_max_output(self.max_output)
-            self.model_variants[model_name] = variant
+            self.model_variants[cache_key] = variant
 
-        return self.model_variants[model_name]
+        return self.model_variants[cache_key]
 
-    async def generate_content(self, system, prompt, model=None):
+    async def generate_content(self, system, prompt, model=None, temperature=None):
 
         # Use provided model or fall back to default
         model_name = model or self.default_model
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
 
         logger.debug(f"Using model: {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
 
         try:
             # Get the appropriate variant for this model
-            variant = self._get_or_create_variant(model_name)
+            variant = self._get_or_create_variant(model_name, effective_temperature)
 
             promptbody = variant.encode_request(system, prompt)
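
The temperature-keyed cache in the Bedrock processor is a small reusable pattern: the
per-request value falls back to the configured default, and the temperature is folded
into the cache key so the same model at different sampling settings never shares a
variant. A minimal standalone sketch (VariantCache and _Variant are illustrative names,
not from the codebase):

    class _Variant:
        def __init__(self, temperature):
            self.temperature = temperature

    class VariantCache:
        def __init__(self, default_temperature=0.0):
            self.default_temperature = default_temperature
            self._variants = {}

        def get_or_create(self, model_name, temperature=None):
            # Per-request override, falling back to the configured default
            effective = temperature if temperature is not None else self.default_temperature
            # Temperature is part of the key: "m:0.0" and "m:0.9" cache separately
            key = f"{model_name}:{effective}"
            if key not in self._variants:
                self._variants[key] = _Variant(effective)
            return self._variants[key]

    cache = VariantCache(default_temperature=0.2)
    assert cache.get_or_create("m") is cache.get_or_create("m")
    assert cache.get_or_create("m", 0.9) is not cache.get_or_create("m")
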
logger.debug(f"Using model: {model_name}") + logger.debug(f"Using temperature: {effective_temperature}") try: response = message = self.claude.messages.create( model=model_name, max_tokens=self.max_output, - temperature=self.temperature, + temperature=effective_temperature, system = system, messages=[ { diff --git a/trustgraph-flow/trustgraph/model/text_completion/cohere/llm.py b/trustgraph-flow/trustgraph/model/text_completion/cohere/llm.py index a5b1deda..9aebaa73 100755 --- a/trustgraph-flow/trustgraph/model/text_completion/cohere/llm.py +++ b/trustgraph-flow/trustgraph/model/text_completion/cohere/llm.py @@ -45,12 +45,15 @@ class Processor(LlmService): logger.info("Cohere LLM service initialized") - async def generate_content(self, system, prompt, model=None): + async def generate_content(self, system, prompt, model=None, temperature=None): # Use provided model or fall back to default model_name = model or self.default_model + # Use provided temperature or fall back to default + effective_temperature = temperature if temperature is not None else self.temperature logger.debug(f"Using model: {model_name}") + logger.debug(f"Using temperature: {effective_temperature}") try: @@ -58,7 +61,7 @@ class Processor(LlmService): model=model_name, message=prompt, preamble = system, - temperature=self.temperature, + temperature=effective_temperature, chat_history=[], prompt_truncation='auto', connectors=[] diff --git a/trustgraph-flow/trustgraph/model/text_completion/googleaistudio/llm.py b/trustgraph-flow/trustgraph/model/text_completion/googleaistudio/llm.py index c1814129..b9abcefa 100644 --- a/trustgraph-flow/trustgraph/model/text_completion/googleaistudio/llm.py +++ b/trustgraph-flow/trustgraph/model/text_completion/googleaistudio/llm.py @@ -86,12 +86,18 @@ class Processor(LlmService): logger.info("GoogleAIStudio LLM service initialized") - def _get_or_create_config(self, model_name): - """Get cached generation config or create new one""" - if model_name not in self.generation_configs: - logger.info(f"Creating generation config for '{model_name}'") - self.generation_configs[model_name] = types.GenerateContentConfig( - temperature = self.temperature, + def _get_or_create_config(self, model_name, temperature=None): + """Get or create generation config with dynamic temperature""" + # Use provided temperature or fall back to default + effective_temperature = temperature if temperature is not None else self.temperature + + # Create cache key that includes temperature to avoid conflicts + cache_key = f"{model_name}:{effective_temperature}" + + if cache_key not in self.generation_configs: + logger.info(f"Creating generation config for '{model_name}' with temperature {effective_temperature}") + self.generation_configs[cache_key] = types.GenerateContentConfig( + temperature = effective_temperature, top_p = 1, top_k = 40, max_output_tokens = self.max_output, @@ -99,16 +105,19 @@ class Processor(LlmService): safety_settings = self.safety_settings, ) - return self.generation_configs[model_name] + return self.generation_configs[cache_key] - async def generate_content(self, system, prompt, model=None): + async def generate_content(self, system, prompt, model=None, temperature=None): # Use provided model or fall back to default model_name = model or self.default_model + # Use provided temperature or fall back to default + effective_temperature = temperature if temperature is not None else self.temperature logger.debug(f"Using model: {model_name}") + logger.debug(f"Using temperature: {effective_temperature}") - 
diff --git a/trustgraph-flow/trustgraph/model/text_completion/googleaistudio/llm.py b/trustgraph-flow/trustgraph/model/text_completion/googleaistudio/llm.py
index c1814129..b9abcefa 100644
--- a/trustgraph-flow/trustgraph/model/text_completion/googleaistudio/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/googleaistudio/llm.py
@@ -86,12 +86,18 @@ class Processor(LlmService):
 
         logger.info("GoogleAIStudio LLM service initialized")
 
-    def _get_or_create_config(self, model_name):
-        """Get cached generation config or create new one"""
-        if model_name not in self.generation_configs:
-            logger.info(f"Creating generation config for '{model_name}'")
-            self.generation_configs[model_name] = types.GenerateContentConfig(
-                temperature = self.temperature,
+    def _get_or_create_config(self, model_name, temperature=None):
+        """Get or create generation config with dynamic temperature"""
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
+
+        # Create cache key that includes temperature to avoid conflicts
+        cache_key = f"{model_name}:{effective_temperature}"
+
+        if cache_key not in self.generation_configs:
+            logger.info(f"Creating generation config for '{model_name}' with temperature {effective_temperature}")
+            self.generation_configs[cache_key] = types.GenerateContentConfig(
+                temperature = effective_temperature,
                 top_p = 1,
                 top_k = 40,
                 max_output_tokens = self.max_output,
@@ -99,16 +105,19 @@ class Processor(LlmService):
                 safety_settings = self.safety_settings,
             )
 
-        return self.generation_configs[model_name]
+        return self.generation_configs[cache_key]
 
-    async def generate_content(self, system, prompt, model=None):
+    async def generate_content(self, system, prompt, model=None, temperature=None):
 
         # Use provided model or fall back to default
         model_name = model or self.default_model
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
 
         logger.debug(f"Using model: {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
 
-        generation_config = self._get_or_create_config(model_name)
+        generation_config = self._get_or_create_config(model_name, effective_temperature)
 
         # Set system instruction per request (can't be cached)
         generation_config.system_instruction = system
diff --git a/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py b/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py
index 1571e3e7..2b343583 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/llamafile/llm.py
@@ -50,12 +50,15 @@ class Processor(LlmService):
 
         logger.info("Llamafile LLM service initialized")
 
-    async def generate_content(self, system, prompt, model=None):
+    async def generate_content(self, system, prompt, model=None, temperature=None):
 
         # Use provided model or fall back to default
         model_name = model or self.default_model
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
 
         logger.debug(f"Using model: {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
 
         prompt = system + "\n\n" + prompt
 
@@ -65,15 +68,15 @@ class Processor(LlmService):
             model=model_name,
             messages=[
                 {"role": "user", "content": prompt}
-            ]
-            #temperature=self.temperature,
-            #max_tokens=self.max_output,
-            #top_p=1,
-            #frequency_penalty=0,
-            #presence_penalty=0,
-            #response_format={
-            #    "type": "text"
-            #}
+            ],
+            temperature=effective_temperature,
+            max_tokens=self.max_output,
+            top_p=1,
+            frequency_penalty=0,
+            presence_penalty=0,
+            response_format={
+                "type": "text"
+            }
         )
 
         inputtokens = resp.usage.prompt_tokens
diff --git a/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py
index 0ed47517..a5464368 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/lmstudio/llm.py
@@ -50,12 +50,15 @@ class Processor(LlmService):
 
         logger.info("LMStudio LLM service initialized")
 
-    async def generate_content(self, system, prompt, model=None):
+    async def generate_content(self, system, prompt, model=None, temperature=None):
 
         # Use provided model or fall back to default
         model_name = model or self.default_model
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
 
         logger.debug(f"Using model: {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
 
         prompt = system + "\n\n" + prompt
 
@@ -67,15 +70,15 @@ class Processor(LlmService):
             model=model_name,
             messages=[
                 {"role": "user", "content": prompt}
-            ]
-            #temperature=self.temperature,
-            #max_tokens=self.max_output,
-            #top_p=1,
-            #frequency_penalty=0,
-            #presence_penalty=0,
-            #response_format={
-            #    "type": "text"
-            #}
+            ],
+            temperature=effective_temperature,
+            max_tokens=self.max_output,
+            top_p=1,
+            frequency_penalty=0,
+            presence_penalty=0,
+            response_format={
+                "type": "text"
+            }
         )
 
         logger.debug(f"Full response: {resp}")
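
llamafile and LMStudio both speak the OpenAI-compatible chat completions API, which is
why the previously commented-out sampling parameters can now simply be passed through.
Sketched in isolation with the openai client (base_url, api_key, and model are
placeholders for a local server, not values from this change):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8080/v1", api_key="unused")

    resp = client.chat.completions.create(
        model="local-model",
        messages=[{"role": "user", "content": "Say hi."}],
        temperature=0.0,    # the per-request override lands here
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        response_format={"type": "text"},
    )
    print(resp.choices[0].message.content)
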
diff --git a/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py b/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
index c4ce26d5..bcd00a0c 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/mistral/llm.py
@@ -48,12 +48,15 @@ class Processor(LlmService):
 
         logger.info("Mistral LLM service initialized")
 
-    async def generate_content(self, system, prompt, model=None):
+    async def generate_content(self, system, prompt, model=None, temperature=None):
 
         # Use provided model or fall back to default
         model_name = model or self.default_model
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
 
         logger.debug(f"Using model: {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
 
         prompt = system + "\n\n" + prompt
 
@@ -72,7 +75,7 @@ class Processor(LlmService):
                     ]
                 }
             ],
-            temperature=self.temperature,
+            temperature=effective_temperature,
             max_tokens=self.max_output,
             top_p=1,
             frequency_penalty=0,
diff --git a/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py b/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py
index fc19ace3..db9586ea 100755
--- a/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py
+++ b/trustgraph-flow/trustgraph/model/text_completion/ollama/llm.py
@@ -17,6 +17,7 @@ from .... base import LlmService, LlmResult
 default_ident = "text-completion"
 
 default_model = 'gemma2:9b'
+default_temperature = 0.0
 default_ollama = os.getenv("OLLAMA_HOST", 'http://localhost:11434')
 
 class Processor(LlmService):
@@ -24,30 +25,36 @@ class Processor(LlmService):
     def __init__(self, **params):
 
         model = params.get("model", default_model)
+        temperature = params.get("temperature", default_temperature)
         ollama = params.get("ollama", default_ollama)
 
         super(Processor, self).__init__(
             **params | {
                 "model": model,
+                "temperature": temperature,
                 "ollama": ollama,
             }
         )
 
         self.default_model = model
+        self.temperature = temperature
         self.llm = Client(host=ollama)
 
-    async def generate_content(self, system, prompt, model=None):
+    async def generate_content(self, system, prompt, model=None, temperature=None):
 
         # Use provided model or fall back to default
         model_name = model or self.default_model
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
 
         logger.debug(f"Using model: {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
 
         prompt = system + "\n\n" + prompt
 
         try:
 
-            response = self.llm.generate(model_name, prompt)
+            response = self.llm.generate(model_name, prompt, options={'temperature': effective_temperature})
 
             response_text = response['response']
 
             logger.debug("Sending response...")
 
@@ -89,6 +96,13 @@ class Processor(LlmService):
             help=f'ollama (default: {default_ollama})'
         )
 
+        parser.add_argument(
+            '-t', '--temperature',
+            type=float,
+            default=default_temperature,
+            help=f'LLM temperature parameter (default: {default_temperature})'
+        )
+
 def run():
 
     Processor.launch(default_ident, __doc__)
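
Unlike the OpenAI-style clients, the ollama client takes sampling parameters through an
options mapping rather than keyword arguments. The call in isolation (host and model are
placeholders):

    from ollama import Client

    client = Client(host="http://localhost:11434")

    # temperature rides in the options dict; the response body is read
    # the same way the processor does, via response['response']
    response = client.generate(
        "gemma2:9b",
        "You are terse.\n\nSay hi.",
        options={"temperature": 0.0},
    )
    print(response["response"])
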
{effective_temperature}") prompt = system + "\n\n" + prompt @@ -82,7 +85,7 @@ class Processor(LlmService): ] } ], - temperature=self.temperature, + temperature=effective_temperature, max_tokens=self.max_output, top_p=1, frequency_penalty=0, diff --git a/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py b/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py index c8622c85..9bfc58be 100755 --- a/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py +++ b/trustgraph-flow/trustgraph/model/text_completion/tgi/llm.py @@ -51,12 +51,15 @@ class Processor(LlmService): logger.info(f"Using TGI service at {base_url}") logger.info("TGI LLM service initialized") - async def generate_content(self, system, prompt, model=None): + async def generate_content(self, system, prompt, model=None, temperature=None): # Use provided model or fall back to default model_name = model or self.default_model + # Use provided temperature or fall back to default + effective_temperature = temperature if temperature is not None else self.temperature logger.debug(f"Using model: {model_name}") + logger.debug(f"Using temperature: {effective_temperature}") headers = { "Content-Type": "application/json", @@ -75,7 +78,7 @@ class Processor(LlmService): } ], "max_tokens": self.max_output, - "temperature": self.temperature, + "temperature": effective_temperature, } try: diff --git a/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py b/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py index 71b77d74..1cf7df49 100755 --- a/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py +++ b/trustgraph-flow/trustgraph/model/text_completion/vllm/llm.py @@ -52,12 +52,15 @@ class Processor(LlmService): logger.info(f"Using vLLM service at {base_url}") logger.info("vLLM LLM service initialized") - async def generate_content(self, system, prompt, model=None): + async def generate_content(self, system, prompt, model=None, temperature=None): # Use provided model or fall back to default model_name = model or self.default_model + # Use provided temperature or fall back to default + effective_temperature = temperature if temperature is not None else self.temperature logger.debug(f"Using model: {model_name}") + logger.debug(f"Using temperature: {effective_temperature}") headers = { "Content-Type": "application/json", @@ -67,7 +70,7 @@ class Processor(LlmService): "model": model_name, "prompt": system + "\n\n" + prompt, "max_tokens": self.max_output, - "temperature": self.temperature, + "temperature": effective_temperature, } try: diff --git a/trustgraph-vertexai/trustgraph/model/text_completion/vertexai/llm.py b/trustgraph-vertexai/trustgraph/model/text_completion/vertexai/llm.py index 0ec9aca1..2fdfee62 100755 --- a/trustgraph-vertexai/trustgraph/model/text_completion/vertexai/llm.py +++ b/trustgraph-vertexai/trustgraph/model/text_completion/vertexai/llm.py @@ -152,29 +152,35 @@ class Processor(LlmService): return self.anthropic_client - def _get_gemini_model(self, model_name): + def _get_gemini_model(self, model_name, temperature=None): """Get or create a Gemini model instance""" if model_name not in self.model_clients: logger.info(f"Creating GenerativeModel instance for '{model_name}'") self.model_clients[model_name] = GenerativeModel(model_name) - # Create generation config for this model - self.generation_configs[model_name] = GenerationConfig( - temperature=self.temperature, - top_p=1.0, - top_k=10, - candidate_count=1, - max_output_tokens=self.max_output, - ) + # Use provided temperature or fall back 
diff --git a/trustgraph-vertexai/trustgraph/model/text_completion/vertexai/llm.py b/trustgraph-vertexai/trustgraph/model/text_completion/vertexai/llm.py
index 0ec9aca1..2fdfee62 100755
--- a/trustgraph-vertexai/trustgraph/model/text_completion/vertexai/llm.py
+++ b/trustgraph-vertexai/trustgraph/model/text_completion/vertexai/llm.py
@@ -152,29 +152,35 @@ class Processor(LlmService):
 
         return self.anthropic_client
 
-    def _get_gemini_model(self, model_name):
+    def _get_gemini_model(self, model_name, temperature=None):
         """Get or create a Gemini model instance"""
         if model_name not in self.model_clients:
             logger.info(f"Creating GenerativeModel instance for '{model_name}'")
             self.model_clients[model_name] = GenerativeModel(model_name)
 
-            # Create generation config for this model
-            self.generation_configs[model_name] = GenerationConfig(
-                temperature=self.temperature,
-                top_p=1.0,
-                top_k=10,
-                candidate_count=1,
-                max_output_tokens=self.max_output,
-            )
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
 
-        return self.model_clients[model_name], self.generation_configs[model_name]
+        # Create generation config with the effective temperature
+        generation_config = GenerationConfig(
+            temperature=effective_temperature,
+            top_p=1.0,
+            top_k=10,
+            candidate_count=1,
+            max_output_tokens=self.max_output,
+        )
 
-    async def generate_content(self, system, prompt, model=None):
+        return self.model_clients[model_name], generation_config
+
+    async def generate_content(self, system, prompt, model=None, temperature=None):
 
         # Use provided model or fall back to default
         model_name = model or self.default_model
+        # Use provided temperature or fall back to default
+        effective_temperature = temperature if temperature is not None else self.temperature
 
         logger.debug(f"Using model: {model_name}")
+        logger.debug(f"Using temperature: {effective_temperature}")
 
         try:
             if 'claude' in model_name.lower():
@@ -187,7 +193,7 @@ class Processor(LlmService):
                 system=system,
                 messages=[{"role": "user", "content": prompt}],
                 max_tokens=self.api_params['max_output_tokens'],
-                temperature=self.api_params['temperature'],
+                temperature=effective_temperature,
                 top_p=self.api_params['top_p'],
                 top_k=self.api_params['top_k'],
             )
@@ -203,7 +209,7 @@ class Processor(LlmService):
             logger.debug(f"Sending request to Gemini model '{model_name}'...")
             full_prompt = system + "\n\n" + prompt
 
-            llm, generation_config = self._get_gemini_model(model_name)
+            llm, generation_config = self._get_gemini_model(model_name, effective_temperature)
 
             response = llm.generate_content(
                 full_prompt,
                 generation_config = generation_config,
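
With these changes every processor shares the same contract:
generate_content(system, prompt, model=None, temperature=None), where None means "use
the configured default". A hypothetical caller, driven by e.g. asyncio.run(demo(p));
the processor value and model name are placeholders, and real requests arrive through
the flow machinery rather than direct calls:

    async def demo(processor):
        # Falls back to the processor's configured default temperature
        r1 = await processor.generate_content(
            "You are terse.", "Say hi.", model="gemma2:9b")
        # Overrides the sampling temperature for this request only
        r2 = await processor.generate_content(
            "You are terse.", "Say hi.", model="gemma2:9b", temperature=0.9)
        return r1, r2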