Mirror of https://github.com/trustgraph-ai/trustgraph.git, synced 2026-04-25 00:16:23 +02:00
Refactor rate limit handling (#280)
* Refactored retry for rate limits into the base class
* ConsumerProducer is derived from Consumer to simplify code
* Added rate_limit_count metrics for rate limit events
* Add rate limit events to VertexAI and Google AI Studio
* Added Grafana rate limit dashboard
* Add rate limit handling to all LLMs
parent 26a586034c
commit 0e03bc05a4
14 changed files with 174 additions and 298 deletions
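The core of the change: instead of each LLM processor building its own "rate-limit" error response, the base Consumer class now runs the handler inside a bounded retry loop. A condensed sketch of the pattern, as an illustrative standalone function (the real code lives in Consumer.run() in the diff below, with metrics and Pulsar wiring; TooManyRequests is redefined here only to keep the sketch self-contained):

import time

class TooManyRequests(Exception):
    """Raised by a handler when the backing LLM reports a rate limit."""

def process_with_retry(consumer, msg, handle,
                       rate_limit_retry=10, rate_limit_timeout=7200):

    # Stop retrying this message once the timeout window has passed.
    expiry = time.time() + rate_limit_timeout

    while True:

        if time.time() > expiry:
            # Timed out: negative-ack so the queue redelivers the message later.
            consumer.negative_acknowledge(msg)
            return

        try:
            handle(msg)
            consumer.acknowledge(msg)
            return
        except TooManyRequests:
            # Rate-limited: wait, then re-run the handler on the same message.
            time.sleep(rate_limit_retry)
        except Exception as e:
            # Any other failure: negative-ack and move on to the next message.
            print("Exception:", e, flush=True)
            consumer.negative_acknowledge(msg)
            return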
@@ -577,7 +577,7 @@
       "disableTextWrap": false,
       "editorMode": "builder",
       "exemplar": false,
-      "expr": "increase(processing_count_total{status!=\"success\"}[$__rate_interval])",
+      "expr": "sum by(job) (increase(rate_limit_count_total[$__rate_interval]))",
       "format": "time_series",
       "fullMetaSearch": false,
       "includeNullMetadata": true,
@@ -588,7 +588,7 @@
       "useBackend": false
     }
   ],
-  "title": "Errors",
+  "title": "Rate limit events",
   "type": "timeseries"
 },
 {
@@ -7,6 +7,9 @@ import time
 from . base_processor import BaseProcessor
 from .. exceptions import TooManyRequests
 
+default_rate_limit_retry = 10
+default_rate_limit_timeout = 7200
+
 class Consumer(BaseProcessor):
 
     def __init__(self, **params):
@@ -22,11 +25,18 @@ class Consumer(BaseProcessor):
 
         super(Consumer, self).__init__(**params)
 
-        input_queue = params.get("input_queue")
-        subscriber = params.get("subscriber")
-        input_schema = params.get("input_schema")
+        self.input_queue = params.get("input_queue")
+        self.subscriber = params.get("subscriber")
+        self.input_schema = params.get("input_schema")
 
-        if input_schema == None:
+        self.rate_limit_retry = params.get(
+            "rate_limit_retry", default_rate_limit_retry
+        )
+        self.rate_limit_timeout = params.get(
+            "rate_limit_timeout", default_rate_limit_timeout
+        )
+
+        if self.input_schema == None:
             raise RuntimeError("input_schema must be specified")
 
         if not hasattr(__class__, "request_metric"):
@@ -44,18 +54,27 @@ class Consumer(BaseProcessor):
                 'processing_count', 'Processing count', ["status"]
             )
 
+        if not hasattr(__class__, "rate_limit_metric"):
+            __class__.rate_limit_metric = Counter(
+                'rate_limit_count', 'Rate limit event count',
+            )
+
         __class__.pubsub_metric.info({
-            "input_queue": input_queue,
-            "subscriber": subscriber,
-            "input_schema": input_schema.__name__,
+            "input_queue": self.input_queue,
+            "subscriber": self.subscriber,
+            "input_schema": self.input_schema.__name__,
+            "rate_limit_retry": str(self.rate_limit_retry),
+            "rate_limit_timeout": str(self.rate_limit_timeout),
         })
 
         self.consumer = self.client.subscribe(
-            input_queue, subscriber,
+            self.input_queue, self.subscriber,
             consumer_type=pulsar.ConsumerType.Shared,
-            schema=JsonSchema(input_schema),
+            schema=JsonSchema(self.input_schema),
         )
 
+        print("Initialised consumer.", flush=True)
+
     def run(self):
 
         __class__.state_metric.state('running')
@@ -64,31 +83,61 @@ class Consumer(BaseProcessor):
 
             msg = self.consumer.receive()
 
-            try:
+            expiry = time.time() + self.rate_limit_timeout
 
-                with __class__.request_metric.time():
-                    self.handle(msg)
+            # This loop is for retry on rate-limit / resource limits
+            while True:
 
-                # Acknowledge successful processing of the message
-                self.consumer.acknowledge(msg)
+                if time.time() > expiry:
 
-                __class__.processing_metric.labels(status="success").inc()
+                    print("Gave up waiting for rate-limit retry", flush=True)
 
-            except TooManyRequests:
-                self.consumer.negative_acknowledge(msg)
-                print("TooManyRequests: will retry")
-                __class__.processing_metric.labels(status="rate-limit").inc()
-                time.sleep(5)
-                continue
+                    # Message failed to be processed, this causes it to
+                    # be retried
+                    self.consumer.negative_acknowledge(msg)
 
-            except Exception as e:
+                    __class__.processing_metric.labels(status="error").inc()
 
-                print("Exception:", e, flush=True)
+                    # Break out of retry loop, processes next message
+                    break
 
-                # Message failed to be processed
-                self.consumer.negative_acknowledge(msg)
+                try:
 
-                __class__.processing_metric.labels(status="error").inc()
+                    with __class__.request_metric.time():
+                        self.handle(msg)
+
+                    # Acknowledge successful processing of the message
+                    self.consumer.acknowledge(msg)
+
+                    __class__.processing_metric.labels(status="success").inc()
+
+                    # Break out of retry loop
+                    break
+
+                except TooManyRequests:
+
+                    print("TooManyRequests: will retry...", flush=True)
+
+                    __class__.rate_limit_metric.inc()
+
+                    # Sleep
+                    time.sleep(self.rate_limit_retry)
+
+                    # Continue from retry loop, just causes a reprocessing
+                    continue
+
+                except Exception as e:
+
+                    print("Exception:", e, flush=True)
+
+                    # Message failed to be processed, this causes it to
+                    # be retried
+                    self.consumer.negative_acknowledge(msg)
+
+                    __class__.processing_metric.labels(status="error").inc()
+
+                    # Break out of retry loop, processes next message
+                    break
 
     @staticmethod
     def add_args(parser, default_input_queue, default_subscriber):
@@ -107,3 +156,17 @@ class Consumer(BaseProcessor):
             help=f'Queue subscriber name (default: {default_subscriber})'
         )
 
+        parser.add_argument(
+            '--rate-limit-retry',
+            type=int,
+            default=default_rate_limit_retry,
+            help=f'Rate limit retry (default: {default_rate_limit_retry})'
+        )
+
+        parser.add_argument(
+            '--rate-limit-timeout',
+            type=int,
+            default=default_rate_limit_timeout,
+            help=f'Rate limit timeout (default: {default_rate_limit_timeout})'
+        )
@@ -4,111 +4,43 @@ import pulsar
 from prometheus_client import Histogram, Info, Counter, Enum
 import time
 
-from . base_processor import BaseProcessor
+from . consumer import Consumer
 from .. exceptions import TooManyRequests
 
-# FIXME: Derive from consumer? And producer?
-
-class ConsumerProducer(BaseProcessor):
+class ConsumerProducer(Consumer):
 
     def __init__(self, **params):
 
-        if not hasattr(__class__, "state_metric"):
-            __class__.state_metric = Enum(
-                'processor_state', 'Processor state',
-                states=['starting', 'running', 'stopped']
-            )
-            __class__.state_metric.state('starting')
-
-        __class__.state_metric.state('starting')
-
-        input_queue = params.get("input_queue")
-        output_queue = params.get("output_queue")
-        subscriber = params.get("subscriber")
-        input_schema = params.get("input_schema")
-        output_schema = params.get("output_schema")
-
-        if not hasattr(__class__, "request_metric"):
-            __class__.request_metric = Histogram(
-                'request_latency', 'Request latency (seconds)'
-            )
+        super(ConsumerProducer, self).__init__(**params)
+
+        self.output_queue = params.get("output_queue")
+        self.output_schema = params.get("output_schema")
 
         if not hasattr(__class__, "output_metric"):
             __class__.output_metric = Counter(
                 'output_count', 'Output items created'
             )
 
-        if not hasattr(__class__, "pubsub_metric"):
-            __class__.pubsub_metric = Info(
-                'pubsub', 'Pub/sub configuration'
-            )
-
-        if not hasattr(__class__, "processing_metric"):
-            __class__.processing_metric = Counter(
-                'processing_count', 'Processing count', ["status"]
-            )
-
         __class__.pubsub_metric.info({
-            "input_queue": input_queue,
-            "output_queue": output_queue,
-            "subscriber": subscriber,
-            "input_schema": input_schema.__name__,
-            "output_schema": output_schema.__name__,
+            "input_queue": self.input_queue,
+            "output_queue": self.output_queue,
+            "subscriber": self.subscriber,
+            "input_schema": self.input_schema.__name__,
+            "output_schema": self.output_schema.__name__,
+            "rate_limit_retry": str(self.rate_limit_retry),
+            "rate_limit_timeout": str(self.rate_limit_timeout),
         })
 
-        super(ConsumerProducer, self).__init__(**params)
-
-        if input_schema == None:
-            raise RuntimeError("input_schema must be specified")
-
-        if output_schema == None:
+        if self.output_schema == None:
             raise RuntimeError("output_schema must be specified")
 
         self.producer = self.client.create_producer(
-            topic=output_queue,
-            schema=JsonSchema(output_schema),
+            topic=self.output_queue,
+            schema=JsonSchema(self.output_schema),
             chunking_enabled=True,
         )
 
-        self.consumer = self.client.subscribe(
-            input_queue, subscriber,
-            consumer_type=pulsar.ConsumerType.Shared,
-            schema=JsonSchema(input_schema),
-        )
-
-    def run(self):
-
-        __class__.state_metric.state('running')
-
-        while True:
-
-            msg = self.consumer.receive()
-
-            try:
-
-                with __class__.request_metric.time():
-                    resp = self.handle(msg)
-
-                # Acknowledge successful processing of the message
-                self.consumer.acknowledge(msg)
-
-                __class__.processing_metric.labels(status="success").inc()
-
-            except TooManyRequests:
-                self.consumer.negative_acknowledge(msg)
-                print("TooManyRequests: will retry")
-                __class__.processing_metric.labels(status="rate-limit").inc()
-                time.sleep(5)
-                continue
-
-            except Exception as e:
-
-                print("Exception:", e, flush=True)
-
-                # Message failed to be processed
-                self.consumer.negative_acknowledge(msg)
-
-                __class__.processing_metric.labels(status="error").inc()
+        print("Initialised consumer/producer.")
 
     def send(self, msg, properties={}):
         self.producer.send(msg, properties)
@@ -120,19 +52,7 @@ class ConsumerProducer(BaseProcessor):
         default_output_queue,
     ):
 
-        BaseProcessor.add_args(parser)
-
-        parser.add_argument(
-            '-i', '--input-queue',
-            default=default_input_queue,
-            help=f'Input queue (default: {default_input_queue})'
-        )
-
-        parser.add_argument(
-            '-s', '--subscriber',
-            default=default_subscriber,
-            help=f'Queue subscriber name (default: {default_subscriber})'
-        )
+        Consumer.add_args(parser, default_input_queue, default_subscriber)
 
         parser.add_argument(
             '-o', '--output-queue',
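With ConsumerProducer derived from Consumer, a downstream processor signals a rate limit simply by raising TooManyRequests from handle(); the loop in Consumer.run() above then sleeps and re-invokes the handler instead of sending an error response. A hypothetical minimal processor showing the translation (import paths assumed from the relative imports in the diff; SdkRateLimitError and call_model are placeholders for whichever exception and request call the provider SDK actually uses):

from trustgraph.base import ConsumerProducer       # import path assumed
from trustgraph.exceptions import TooManyRequests  # import path assumed

class SdkRateLimitError(Exception):
    """Placeholder for e.g. openai.RateLimitError or anthropic.RateLimitError."""

class ExampleProcessor(ConsumerProducer):

    def handle(self, msg):

        request = msg.value()

        try:
            # Provider call, elided; a real processor calls its SDK here.
            response = self.call_model(request.prompt)
        except SdkRateLimitError:
            # Translate the SDK exception: the base Consumer loop catches
            # TooManyRequests, bumps rate_limit_count, sleeps, and retries.
            raise TooManyRequests()

        self.send(response, properties={"id": msg.properties()["id"]})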
@@ -8,7 +8,3 @@ class LlmError(Exception):
 
 class ParseError(Exception):
     pass
-
-
-
-
@@ -267,6 +267,7 @@ class Processor(ConsumerProducer):
 
         except Exception as e:
 
+            print(type(e))
             print(f"Exception: {e}")
 
             print("Send error response...", flush=True)
@@ -158,25 +158,15 @@ class Processor(ConsumerProducer):
 
         except TooManyRequests:
 
-            print("Send rate limit response...", flush=True)
+            print("Rate limit...")
 
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()
 
         except Exception as e:
 
+            # Apart from rate limits, treat all exceptions as unrecoverable
+
             print(f"Exception: {e}")
 
             print("Send error response...", flush=True)
@@ -4,10 +4,9 @@ Simple LLM service, performs text prompt completion using the Azure
 OpenAI endpoit service. Input is prompt, output is response.
 """
 
-import requests
 import json
 from prometheus_client import Histogram
-from openai import AzureOpenAI
+from openai import AzureOpenAI, RateLimitError
 import os
 
 from .... schema import TextCompletionRequest, TextCompletionResponse, Error
@@ -126,30 +125,27 @@ class Processor(ConsumerProducer):
             print(f"Output Tokens: {outputtokens}", flush=True)
             print("Send response...", flush=True)
 
-            r = TextCompletionResponse(response=resp.choices[0].message.content, error=None, in_token=inputtokens, out_token=outputtokens, model=self.model)
-            self.producer.send(r, properties={"id": id})
-
-        except TooManyRequests:
-
-            print("Send rate limit response...", flush=True)
-
             r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
+                response=resp.choices[0].message.content,
+                error=None,
+                in_token=inputtokens,
+                out_token=outputtokens,
+                model=self.model
             )
 
             self.producer.send(r, properties={"id": id})
 
-            self.consumer.acknowledge(msg)
+        except RateLimitError:
+
+            print("Send rate limit response...", flush=True)
+
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()
 
         except Exception as e:
 
+            # Apart from rate limits, treat all exceptions as unrecoverable
+
             print(f"Exception: {e}")
 
             print("Send error response...", flush=True)
@@ -87,8 +87,6 @@ class Processor(ConsumerProducer):
 
         try:
 
-            # FIXME: Rate limits?
-
             with __class__.text_completion_metric.time():
 
                 response = message = self.claude.messages.create(
@@ -117,34 +115,26 @@ class Processor(ConsumerProducer):
             print(f"Output Tokens: {outputtokens}", flush=True)
 
             print("Send response...", flush=True)
-            r = TextCompletionResponse(response=resp, error=None, in_token=inputtokens, out_token=outputtokens, model=self.model)
+            r = TextCompletionResponse(
+                response=resp,
+                error=None,
+                in_token=inputtokens,
+                out_token=outputtokens,
+                model=self.model
+            )
             self.send(r, properties={"id": id})
 
             print("Done.", flush=True)
 
-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except TooManyRequests:
+        except anthropic.RateLimitError:
 
-            print("Send rate limit response...", flush=True)
-
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()
 
         except Exception as e:
 
+            # Apart from rate limits, treat all exceptions as unrecoverable
+
             print(f"Exception: {e}")
 
             print("Send error response...", flush=True)
@@ -112,27 +112,15 @@ class Processor(ConsumerProducer):
 
         # FIXME: Wrong exception, don't know what this LLM throws
         # for a rate limit
-        except TooManyRequests:
+        except cohere.TooManyRequestsError:
 
-            print("Send rate limit response...", flush=True)
-
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()
 
         except Exception as e:
 
+            # Apart from rate limits, treat all exceptions as unrecoverable
+
             print(f"Exception: {e}")
 
             print("Send error response...", flush=True)
@@ -88,7 +88,8 @@ class Processor(ConsumerProducer):
             HarmCategory.HARM_CATEGORY_HARASSMENT: block_level,
             HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: block_level,
             HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: block_level,
-            # There is a documentation conflict on whether or not CIVIC_INTEGRITY is a valid category
+            # There is a documentation conflict on whether or not
+            # CIVIC_INTEGRITY is a valid category
             # HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY: block_level,
         }
 
@@ -122,8 +123,6 @@ class Processor(ConsumerProducer):
 
         try:
 
-            # FIXME: Rate limits?
-
             with __class__.text_completion_metric.time():
 
                 chat_session = self.llm.start_chat(
@@ -140,35 +139,30 @@ class Processor(ConsumerProducer):
             print(f"Output Tokens: {outputtokens}", flush=True)
 
             print("Send response...", flush=True)
-            r = TextCompletionResponse(response=resp, error=None, in_token=inputtokens, out_token=outputtokens, model=self.model)
+            r = TextCompletionResponse(
+                response=resp,
+                error=None,
+                in_token=inputtokens,
+                out_token=outputtokens,
+                model=self.model
+            )
             self.send(r, properties={"id": id})
 
             print("Done.", flush=True)
 
-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
         except ResourceExhausted as e:
 
-            print("Send rate limit response...", flush=True)
+            print("Hit rate limit:", e, flush=True)
 
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the default handler
+            raise TooManyRequests()
 
         except Exception as e:
 
-            print(f"Exception: {e}")
+            # Apart from rate limits, treat all exceptions as unrecoverable
+
+            print(type(e), flush=True)
+            print(f"Exception: {e}", flush=True)
 
             print("Send error response...", flush=True)
@@ -126,26 +126,7 @@ class Processor(ConsumerProducer):
 
             print("Done.", flush=True)
 
-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except TooManyRequests:
-
-            print("Send rate limit response...", flush=True)
-
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+        # SLM, presumably there aren't rate limits
 
         except Exception as e:
@@ -100,26 +100,7 @@ class Processor(ConsumerProducer):
 
             print("Done.", flush=True)
 
-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except TooManyRequests:
-
-            print("Send rate limit response...", flush=True)
-
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+        # SLM, presumably no rate limits
 
         except Exception as e:
@@ -4,7 +4,7 @@ Simple LLM service, performs text prompt completion using OpenAI.
 Input is prompt, output is response.
 """
 
-from openai import OpenAI
+from openai import OpenAI, RateLimitError
 from prometheus_client import Histogram
 import os
 
@@ -87,8 +87,6 @@ class Processor(ConsumerProducer):
 
         try:
 
-            # FIXME: Rate limits
-
             with __class__.text_completion_metric.time():
 
                 resp = self.openai.chat.completions.create(
@@ -134,27 +132,15 @@ class Processor(ConsumerProducer):
 
         # FIXME: Wrong exception, don't know what this LLM throws
         # for a rate limit
-        except TooManyRequests:
+        except openai.RateLimitError:
 
-            print("Send rate limit response...", flush=True)
-
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()
 
         except Exception as e:
 
+            # Apart from rate limits, treat all exceptions as unrecoverable
+
             print(f"Exception: {e}")
 
             print("Send error response...", flush=True)
@@ -178,25 +178,15 @@ class Processor(ConsumerProducer):
 
         except google.api_core.exceptions.ResourceExhausted as e:
 
-            print("Send rate limit response...", flush=True)
+            print("Hit rate limit:", e, flush=True)
 
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()
 
         except Exception as e:
 
+            # Apart from rate limits, treat all exceptions as unrecoverable
+
             print(f"Exception: {e}")
 
             print("Send error response...", flush=True)
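Taken together, the defaults wired into the base class bound how long one message can stall a worker; a quick sanity check of the arithmetic:

# Defaults from the diff: retry every 10 s, give up after 2 hours.
default_rate_limit_retry = 10        # seconds to sleep between retries
default_rate_limit_timeout = 7200    # seconds before giving up on a message

# Ignoring handler time, a single message is retried at most about:
max_retries = default_rate_limit_timeout // default_rate_limit_retry
print(max_retries)  # 720

Both values can be overridden per processor via the --rate-limit-retry and --rate-limit-timeout flags added in Consumer.add_args().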