2025-04-22 20:21:38 +01:00
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
LLM text completion base class
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import time
|
2025-07-30 23:18:38 +01:00
|
|
|
import logging
|
2025-09-25 21:26:11 +01:00
|
|
|
from prometheus_client import Histogram, Info
|
2025-04-22 20:21:38 +01:00
|
|
|
|
|
|
|
|
from .. schema import TextCompletionRequest, TextCompletionResponse, Error
|
|
|
|
|
from .. exceptions import TooManyRequests
|
2025-09-24 13:58:34 +01:00
|
|
|
from .. base import FlowProcessor, ConsumerSpec, ProducerSpec, ParameterSpec
|
2025-04-22 20:21:38 +01:00
|
|
|
|
2025-07-30 23:18:38 +01:00
|
|
|
# Module logger
logger = logging.getLogger(__name__)

# Default processor identifier used when no "id" parameter is supplied.
default_ident = "text-completion"

# Default number of concurrent request handlers.
default_concurrency = 1
|
2025-04-22 20:21:38 +01:00
|
|
|
|
|
|
|
|
class LlmResult:

    """
    Container for a complete (non-streaming) LLM response: the generated
    text, input/output token counts and the model that produced it.
    """

    # Fixed attribute set: saves per-instance memory when many results
    # are created.
    __slots__ = ["text", "in_token", "out_token", "model"]

    def __init__(
        self, text = None, in_token = None, out_token = None,
        model = None,
    ):
        # Generated completion text (str or None)
        self.text = text
        # Tokens consumed from the prompt/input
        self.in_token = in_token
        # Tokens produced in the completion
        self.out_token = out_token
        # Identifier of the model that served the request
        self.model = model
|
|
|
|
|
|
2025-11-26 09:59:10 +00:00
|
|
|
class LlmChunk:

    """
    Represents a streaming chunk from an LLM.  Mirrors LlmResult but adds
    an is_final flag marking the last chunk of a stream.
    """

    # Fixed attribute set: avoids a per-instance __dict__ for the many
    # chunk objects a stream produces.
    __slots__ = ["text", "in_token", "out_token", "model", "is_final"]

    def __init__(
        self, text = None, in_token = None, out_token = None,
        model = None, is_final = False,
    ):
        # Text fragment carried by this chunk
        self.text = text
        # Input token count, typically only set on the final chunk
        self.in_token = in_token
        # Output token count, typically only set on the final chunk
        self.out_token = out_token
        # Identifier of the model that served the request
        self.model = model
        # True on the last chunk of the stream
        self.is_final = is_final
|
|
|
|
|
|
2025-04-22 20:21:38 +01:00
|
|
|
class LlmService(FlowProcessor):

    """
    Extensible service processing requests to Large Language Models (LLMs).

    This class handles the core logic of dispatching text completion or chat
    requests to integrated underlying LLM providers (e.g. OpenAI, Vertex AI).

    Subclasses are expected to provide generate_content(); they may also
    override supports_streaming() / generate_content_stream() to enable
    chunked streaming responses.
    """

    def __init__(self, **params):
        """
        Initialise the service, register request/response specifications
        and set up class-level Prometheus metrics.

        :param params: configuration; recognised keys include "id"
            (processor identifier, default: default_ident) and
            "concurrency" (handler concurrency, default:
            default_concurrency).
        """

        ident = params.get("id", default_ident)

        # FIX: previously defaulted to the literal 1; use the module-level
        # default_concurrency constant, consistent with default_ident.
        concurrency = params.get("concurrency", default_concurrency)

        super().__init__(**params | {
            "id": ident,
            "concurrency": concurrency,
        })

        self.register_specification(
            ConsumerSpec(
                name = "request",
                schema = TextCompletionRequest,
                handler = self.on_request,
                concurrency = concurrency,
            )
        )

        self.register_specification(
            ProducerSpec(
                name = "response",
                schema = TextCompletionResponse
            )
        )

        self.register_specification(
            ParameterSpec(
                name = "model",
            )
        )

        self.register_specification(
            ParameterSpec(
                name = "temperature",
            )
        )

        # Metrics are class-level singletons: guard against duplicate
        # registration when several instances are constructed.
        if not hasattr(__class__, "text_completion_metric"):
            __class__.text_completion_metric = Histogram(
                'text_completion_duration',
                'Text completion duration (seconds)',
                ["id", "flow"],
                buckets=[
                    0.25, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
                    8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
                    17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0,
                    30.0, 35.0, 40.0, 45.0, 50.0, 60.0, 80.0, 100.0,
                    120.0
                ]
            )

        if not hasattr(__class__, "text_completion_model_metric"):
            __class__.text_completion_model_metric = Info(
                'text_completion_model',
                'Text completion model',
                ["processor", "flow"]
            )

    async def on_request(self, msg, consumer, flow):
        """
        Handle one incoming TextCompletionRequest message.

        Dispatches to the streaming or non-streaming path, publishes
        TextCompletionResponse messages on the "response" producer, and
        records duration / model metrics.  On failure (other than rate
        limiting) an error response is sent instead.

        :param msg: incoming message carrying a TextCompletionRequest
        :param consumer: consumer the message arrived on
        :param flow: flow context (parameter lookup and producers)
        :raises TooManyRequests: propagated so upstream can retry/back off
        """

        # Sender-produced correlation ID.  FIX: resolved before the try
        # block so the error handler can always reference it; previously
        # a failure before the assignment raised NameError in the handler,
        # masking the real error.
        request_id = msg.properties().get("id")

        try:

            request = msg.value()

            model = flow("model")
            temperature = flow("temperature")

            # Check if streaming is requested and supported
            streaming = getattr(request, 'streaming', False)

            if streaming and self.supports_streaming():

                # Streaming mode: one response message per chunk, with
                # end_of_stream set on the provider's final chunk.
                with __class__.text_completion_metric.labels(
                    id=self.id,
                    flow=f"{flow.name}-{consumer.name}",
                ).time():

                    async for chunk in self.generate_content_stream(
                        request.system, request.prompt, model, temperature
                    ):
                        await flow("response").send(
                            TextCompletionResponse(
                                error=None,
                                response=chunk.text,
                                in_token=chunk.in_token,
                                out_token=chunk.out_token,
                                model=chunk.model,
                                end_of_stream=chunk.is_final
                            ),
                            properties={"id": request_id}
                        )

            else:

                # Non-streaming mode (original behavior): single response.
                with __class__.text_completion_metric.labels(
                    id=self.id,
                    flow=f"{flow.name}-{consumer.name}",
                ).time():

                    response = await self.generate_content(
                        request.system, request.prompt, model, temperature
                    )

                    await flow("response").send(
                        TextCompletionResponse(
                            error=None,
                            response=response.text,
                            in_token=response.in_token,
                            out_token=response.out_token,
                            model=response.model,
                            end_of_stream=True
                        ),
                        properties={"id": request_id}
                    )

            # Record which model/temperature served this flow.
            __class__.text_completion_model_metric.labels(
                processor = self.id,
                flow = flow.name
            ).info({
                "model": str(model) if model is not None else "",
                "temperature": str(temperature) if temperature is not None else "",
            })

        except TooManyRequests:

            # Rate limiting is recoverable: re-raise unchanged so the
            # caller can retry / back off.
            raise

        except Exception as e:

            # Apart from rate limits, treat all exceptions as unrecoverable

            logger.error("LLM service exception: %s", e, exc_info=True)

            logger.debug("Sending error response...")

            # NOTE(review): previously used flow.producer["response"];
            # unified on the flow("response") accessor used by the
            # success paths above.
            await flow("response").send(
                TextCompletionResponse(
                    error=Error(
                        type = "llm-error",
                        message = str(e),
                    ),
                    response=None,
                    in_token=None,
                    out_token=None,
                    model=None,
                    end_of_stream=True
                ),
                properties={"id": request_id}
            )

    def supports_streaming(self):
        """
        Override in subclass to indicate streaming support.
        Returns False by default.
        """
        return False

    async def generate_content_stream(self, system, prompt, model=None, temperature=None):
        """
        Override in subclass to implement streaming.
        Should yield LlmChunk objects.
        The final chunk should have is_final=True.
        """
        raise NotImplementedError("Streaming not implemented for this provider")

    @staticmethod
    def add_args(parser):
        """
        Register this service's command-line arguments on *parser*, then
        delegate to FlowProcessor for the base arguments.
        """

        parser.add_argument(
            '-c', '--concurrency',
            type=int,
            default=default_concurrency,
            help=f'Concurrent processing threads (default: {default_concurrency})'
        )

        FlowProcessor.add_args(parser)
|
|
|
|
|
|