2025-04-22 20:21:38 +01:00
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
LLM text completion base class
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import time
|
2025-07-30 23:18:38 +01:00
|
|
|
import logging
|
2025-09-25 21:26:11 +01:00
|
|
|
from prometheus_client import Histogram, Info
|
2025-04-22 20:21:38 +01:00
|
|
|
|
|
|
|
|
from .. schema import TextCompletionRequest, TextCompletionResponse, Error
|
|
|
|
|
from .. exceptions import TooManyRequests
|
2025-09-24 13:58:34 +01:00
|
|
|
from .. base import FlowProcessor, ConsumerSpec, ProducerSpec, ParameterSpec
|
2025-04-22 20:21:38 +01:00
|
|
|
|
2025-07-30 23:18:38 +01:00
|
|
|
# Module logger
logger = logging.getLogger(__name__)

# Default processor identifier used when no "id" parameter is supplied.
default_ident = "text-completion"

# Default number of concurrent request handlers.
default_concurrency = 1
|
2025-04-22 20:21:38 +01:00
|
|
|
|
|
|
|
|
class LlmResult:

    """
    Container for a complete (non-streaming) LLM response: the generated
    text, input/output token counts and the model that produced it.
    """

    # Fixed attribute set: saves per-instance memory when many results
    # are created.
    __slots__ = ["text", "in_token", "out_token", "model"]

    def __init__(
        self, text = None, in_token = None, out_token = None,
        model = None,
    ):
        # Generated completion text (str or None)
        self.text = text
        # Tokens consumed from the prompt/input
        self.in_token = in_token
        # Tokens produced in the completion
        self.out_token = out_token
        # Identifier of the model that served the request
        self.model = model
|
|
|
|
|
|
2025-11-26 09:59:10 +00:00
|
|
|
class LlmChunk:

    """
    Represents a streaming chunk from an LLM.  Mirrors LlmResult but adds
    an is_final flag marking the last chunk of a stream.
    """

    # Fixed attribute set: avoids a per-instance __dict__ for the many
    # chunk objects a stream produces.
    __slots__ = ["text", "in_token", "out_token", "model", "is_final"]

    def __init__(
        self, text = None, in_token = None, out_token = None,
        model = None, is_final = False,
    ):
        # Text fragment carried by this chunk
        self.text = text
        # Input token count, typically only set on the final chunk
        self.in_token = in_token
        # Output token count, typically only set on the final chunk
        self.out_token = out_token
        # Identifier of the model that served the request
        self.model = model
        # True on the last chunk of the stream
        self.is_final = is_final
|
|
|
|
|
|
2025-04-22 20:21:38 +01:00
|
|
|
class LlmService(FlowProcessor):

    """
    Extensible service processing requests to Large Language Models (LLMs).

    This class handles the core logic of dispatching text completion or chat
    requests to integrated underlying LLM providers (e.g. OpenAI, Vertex AI).

    Subclasses are expected to provide generate_content(); they may also
    override supports_streaming() / generate_content_stream() to enable
    chunked streaming responses.
    """

    def __init__(self, **params):
        """
        Initialise the service, register request/response specifications
        and set up class-level Prometheus metrics.

        :param params: configuration; recognised keys include "id"
            (processor identifier, default: default_ident) and
            "concurrency" (handler concurrency, default:
            default_concurrency).
        """

        ident = params.get("id", default_ident)

        # FIX: previously defaulted to the literal 1; use the module-level
        # default_concurrency constant, consistent with default_ident.
        concurrency = params.get("concurrency", default_concurrency)

        super().__init__(**params | {
            "id": ident,
            "concurrency": concurrency,
        })

        self.register_specification(
            ConsumerSpec(
                name = "request",
                schema = TextCompletionRequest,
                handler = self.on_request,
                concurrency = concurrency,
            )
        )

        self.register_specification(
            ProducerSpec(
                name = "response",
                schema = TextCompletionResponse
            )
        )

        self.register_specification(
            ParameterSpec(
                name = "model",
            )
        )

        self.register_specification(
            ParameterSpec(
                name = "temperature",
            )
        )

        # Metrics are class-level singletons: guard against duplicate
        # registration when several instances are constructed.
        if not hasattr(__class__, "text_completion_metric"):
            __class__.text_completion_metric = Histogram(
                'text_completion_duration',
                'Text completion duration (seconds)',
                ["id", "flow"],
                buckets=[
                    0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                    8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
                    30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
                    120.0
                ]
            )

        if not hasattr(__class__, "text_completion_model_metric"):
            __class__.text_completion_model_metric = Info(
                'text_completion_model',
                'Text completion model',
                ["processor", "flow"]
            )

    async def on_request(self, msg, consumer, flow):
        """
        Handle one incoming TextCompletionRequest message.

        Dispatches to the streaming or non-streaming path, publishes
        TextCompletionResponse messages on the "response" producer, and
        records duration / model metrics.  On failure (other than rate
        limiting) an error response is sent instead.

        :param msg: incoming message carrying a TextCompletionRequest
        :param consumer: consumer the message arrived on
        :param flow: flow context (parameter lookup and producers)
        :raises TooManyRequests: propagated so upstream can retry/back off
        """

        # Sender-produced correlation ID.  FIX: resolved before the try
        # block so the error handler can always reference it; previously
        # a failure before the assignment raised NameError in the handler,
        # masking the real error.
        request_id = msg.properties().get("id")

        try:

            request = msg.value()

            model = flow("model")
            temperature = flow("temperature")

            # Check if streaming is requested and supported
            streaming = getattr(request, 'streaming', False)

            if streaming and self.supports_streaming():

                # Streaming mode: one response message per chunk, with
                # end_of_stream set on the provider's final chunk.
                with __class__.text_completion_metric.labels(
                    id=self.id,
                    flow=f"{flow.name}-{consumer.name}",
                ).time():

                    async for chunk in self.generate_content_stream(
                        request.system, request.prompt, model, temperature
                    ):
                        await flow("response").send(
                            TextCompletionResponse(
                                error=None,
                                response=chunk.text,
                                in_token=chunk.in_token,
                                out_token=chunk.out_token,
                                model=chunk.model,
                                end_of_stream=chunk.is_final
                            ),
                            properties={"id": request_id}
                        )

            else:

                # Non-streaming mode (original behavior): single response.
                with __class__.text_completion_metric.labels(
                    id=self.id,
                    flow=f"{flow.name}-{consumer.name}",
                ).time():

                    response = await self.generate_content(
                        request.system, request.prompt, model, temperature
                    )

                    await flow("response").send(
                        TextCompletionResponse(
                            error=None,
                            response=response.text,
                            in_token=response.in_token,
                            out_token=response.out_token,
                            model=response.model,
                            end_of_stream=True
                        ),
                        properties={"id": request_id}
                    )

            # Record which model/temperature served this flow.
            __class__.text_completion_model_metric.labels(
                processor = self.id,
                flow = flow.name
            ).info({
                "model": str(model) if model is not None else "",
                "temperature": str(temperature) if temperature is not None else "",
            })

        except TooManyRequests:

            # Rate limiting is recoverable: re-raise unchanged so the
            # caller can retry / back off.
            raise

        except Exception as e:

            # Apart from rate limits, treat all exceptions as unrecoverable

            logger.error("LLM service exception: %s", e, exc_info=True)

            logger.debug("Sending error response...")

            # NOTE(review): previously used flow.producer["response"];
            # unified on the flow("response") accessor used by the
            # success paths above.
            await flow("response").send(
                TextCompletionResponse(
                    error=Error(
                        type = "llm-error",
                        message = str(e),
                    ),
                    response=None,
                    in_token=None,
                    out_token=None,
                    model=None,
                    end_of_stream=True
                ),
                properties={"id": request_id}
            )

    def supports_streaming(self):
        """
        Override in subclass to indicate streaming support.
        Returns False by default.
        """
        return False

    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
        """
        Override in subclass to implement streaming.
        Should yield LlmChunk objects.
        The final chunk should have is_final=True.
        """
        raise NotImplementedError("Streaming not implemented for this provider")

    @staticmethod
    def add_args(parser):
        """
        Register this service's command-line arguments on *parser*, then
        delegate to FlowProcessor for the base arguments.
        """

        parser.add_argument(
            '-c', '--concurrency',
            type=int,
            default=default_concurrency,
            help=f'Concurrent processing threads (default: {default_concurrency})'
        )

        FlowProcessor.add_args(parser)
|
|
|
|
|
|