Refactor rate limit handling (#280)

* - Refactored rate-limit retry handling into the base class
- ConsumerProducer now derives from Consumer, to simplify code
- Added a rate_limit_count metric for rate limit events

* Add rate limit events to VertexAI and Google AI Studio

* Added Grafana rate limit dashboard

* Add rate limit handling to all LLMs
cybermaggedon 2025-01-27 17:04:49 +00:00 committed by GitHub
parent 26a586034c
commit 0e03bc05a4
14 changed files with 174 additions and 298 deletions
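
In outline, the retry contract is: a per-LLM handler that hits a provider rate limit raises the internal TooManyRequests exception, and the base Consumer retries the same message every rate_limit_retry seconds (default 10) until rate_limit_timeout seconds (default 7200) have passed, after which the message is negatively acknowledged and counted as an error. A minimal, self-contained sketch of that contract, assuming hypothetical names (call_llm, ProviderRateLimitError) that are not part of the commit:

    # Hedged sketch of the retry contract; call_llm and
    # ProviderRateLimitError are illustrative stand-ins.

    class TooManyRequests(Exception):
        """Internal signal: ask the base Consumer to retry the message."""

    class ProviderRateLimitError(Exception):
        """Stand-in for an SDK error such as openai.RateLimitError."""

    def call_llm(prompt):
        raise ProviderRateLimitError()   # pretend the provider throttled us

    def handle(msg):
        # Handlers translate provider errors into TooManyRequests; the
        # base class catches it, bumps rate_limit_count, sleeps
        # rate_limit_retry seconds, and reprocesses the message.
        try:
            return call_llm(msg)
        except ProviderRateLimitError:
            raise TooManyRequests()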

View file

@@ -577,7 +577,7 @@
           "disableTextWrap": false,
           "editorMode": "builder",
           "exemplar": false,
-          "expr": "increase(processing_count_total{status!=\"success\"}[$__rate_interval])",
+          "expr": "sum by(job) (increase(rate_limit_count_total[$__rate_interval]))",
           "format": "time_series",
           "fullMetaSearch": false,
           "includeNullMetadata": true,
@@ -588,7 +588,7 @@
           "useBackend": false
         }
       ],
-      "title": "Errors",
+      "title": "Rate limit events",
       "type": "timeseries"
     },
     {
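
For reference: prometheus_client registers a Counter called rate_limit_count and exports it under the series name rate_limit_count_total, which is what the panel above sums per job. A minimal sketch of the metric end of that pipeline (the scrape port is arbitrary):

    # Counter definition matching the diff; prometheus_client appends
    # "_total" to counter names, hence rate_limit_count_total in PromQL.
    from prometheus_client import Counter, start_http_server

    rate_limit_metric = Counter('rate_limit_count', 'Rate limit event count')

    start_http_server(8000)   # expose /metrics for Prometheus to scrape
    rate_limit_metric.inc()   # one increment per rate-limit event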

View file

@@ -7,6 +7,9 @@ import time

 from . base_processor import BaseProcessor
 from .. exceptions import TooManyRequests

+default_rate_limit_retry = 10
+default_rate_limit_timeout = 7200
+
 class Consumer(BaseProcessor):

     def __init__(self, **params):
@@ -22,11 +25,18 @@ class Consumer(BaseProcessor):

         super(Consumer, self).__init__(**params)

-        input_queue = params.get("input_queue")
-        subscriber = params.get("subscriber")
-        input_schema = params.get("input_schema")
+        self.input_queue = params.get("input_queue")
+        self.subscriber = params.get("subscriber")
+        self.input_schema = params.get("input_schema")

-        if input_schema == None:
+        self.rate_limit_retry = params.get(
+            "rate_limit_retry", default_rate_limit_retry
+        )
+        self.rate_limit_timeout = params.get(
+            "rate_limit_timeout", default_rate_limit_timeout
+        )
+
+        if self.input_schema == None:
             raise RuntimeError("input_schema must be specified")

         if not hasattr(__class__, "request_metric"):
@@ -44,18 +54,27 @@ class Consumer(BaseProcessor):
             'processing_count', 'Processing count', ["status"]
         )

+        if not hasattr(__class__, "rate_limit_metric"):
+            __class__.rate_limit_metric = Counter(
+                'rate_limit_count', 'Rate limit event count',
+            )
+
         __class__.pubsub_metric.info({
-            "input_queue": input_queue,
-            "subscriber": subscriber,
-            "input_schema": input_schema.__name__,
+            "input_queue": self.input_queue,
+            "subscriber": self.subscriber,
+            "input_schema": self.input_schema.__name__,
+            "rate_limit_retry": str(self.rate_limit_retry),
+            "rate_limit_timeout": str(self.rate_limit_timeout),
         })

         self.consumer = self.client.subscribe(
-            input_queue, subscriber,
+            self.input_queue, self.subscriber,
             consumer_type=pulsar.ConsumerType.Shared,
-            schema=JsonSchema(input_schema),
+            schema=JsonSchema(self.input_schema),
         )

+        print("Initialised consumer.", flush=True)
+
     def run(self):

         __class__.state_metric.state('running')
@@ -64,31 +83,61 @@ class Consumer(BaseProcessor):

             msg = self.consumer.receive()

-            try:
-                with __class__.request_metric.time():
-                    self.handle(msg)
-                # Acknowledge successful processing of the message
-                self.consumer.acknowledge(msg)
-                __class__.processing_metric.labels(status="success").inc()
-            except TooManyRequests:
-                self.consumer.negative_acknowledge(msg)
-                print("TooManyRequests: will retry")
-                __class__.processing_metric.labels(status="rate-limit").inc()
-                time.sleep(5)
-                continue
-            except Exception as e:
-                print("Exception:", e, flush=True)
-                # Message failed to be processed
-                self.consumer.negative_acknowledge(msg)
-                __class__.processing_metric.labels(status="error").inc()
+            expiry = time.time() + self.rate_limit_timeout
+
+            # This loop is for retry on rate-limit / resource limits
+            while True:
+
+                if time.time() > expiry:
+
+                    print("Gave up waiting for rate-limit retry", flush=True)
+
+                    # Message failed to be processed, this causes it to
+                    # be retried
+                    self.consumer.negative_acknowledge(msg)
+
+                    __class__.processing_metric.labels(status="error").inc()
+
+                    # Break out of retry loop, processes next message
+                    break
+
+                try:
+                    with __class__.request_metric.time():
+                        self.handle(msg)
+
+                    # Acknowledge successful processing of the message
+                    self.consumer.acknowledge(msg)
+
+                    __class__.processing_metric.labels(status="success").inc()
+
+                    # Break out of retry loop
+                    break
+
+                except TooManyRequests:
+
+                    print("TooManyRequests: will retry...", flush=True)
+
+                    __class__.rate_limit_metric.inc()
+
+                    # Sleep
+                    time.sleep(self.rate_limit_retry)
+
+                    # Continue from retry loop, just causes a reprocessing
+                    continue
+
+                except Exception as e:
+
+                    print("Exception:", e, flush=True)
+
+                    # Message failed to be processed, this causes it to
+                    # be retried
+                    self.consumer.negative_acknowledge(msg)
+
+                    __class__.processing_metric.labels(status="error").inc()
+
+                    # Break out of retry loop, processes next message
+                    break

     @staticmethod
     def add_args(parser, default_input_queue, default_subscriber):
@@ -107,3 +156,17 @@ class Consumer(BaseProcessor):
             help=f'Queue subscriber name (default: {default_subscriber})'
         )
+
+        parser.add_argument(
+            '--rate-limit-retry',
+            type=int,
+            default=default_rate_limit_retry,
+            help=f'Rate limit retry (default: {default_rate_limit_retry})'
+        )
+
+        parser.add_argument(
+            '--rate-limit-timeout',
+            type=int,
+            default=default_rate_limit_timeout,
+            help=f'Rate limit timeout (default: {default_rate_limit_timeout})'
+        )
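
With the defaults, a rate-limited message is retried roughly every 10 seconds for up to 7200 seconds, i.e. at most around 720 attempts, before being negatively acknowledged. A small sketch of the flag arithmetic, mirroring the parser wiring above (run outside the service; values are the shipped defaults):

    # Hedged sketch: how the new flags bound the retry behaviour.
    import argparse

    default_rate_limit_retry = 10       # seconds between retry attempts
    default_rate_limit_timeout = 7200   # seconds before giving up

    parser = argparse.ArgumentParser()
    parser.add_argument('--rate-limit-retry', type=int,
                        default=default_rate_limit_retry)
    parser.add_argument('--rate-limit-timeout', type=int,
                        default=default_rate_limit_timeout)

    args = parser.parse_args(['--rate-limit-retry', '30'])

    # Upper bound on retry attempts for a single message:
    print(args.rate_limit_timeout // args.rate_limit_retry)   # 7200 // 30 = 240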

View file

@@ -4,111 +4,43 @@ import pulsar
 from prometheus_client import Histogram, Info, Counter, Enum
 import time

-from . base_processor import BaseProcessor
+from . consumer import Consumer
 from .. exceptions import TooManyRequests

-# FIXME: Derive from consumer? And producer?
-class ConsumerProducer(BaseProcessor):
+class ConsumerProducer(Consumer):

     def __init__(self, **params):

-        if not hasattr(__class__, "state_metric"):
-            __class__.state_metric = Enum(
-                'processor_state', 'Processor state',
-                states=['starting', 'running', 'stopped']
-            )
-            __class__.state_metric.state('starting')
+        super(ConsumerProducer, self).__init__(**params)

-        __class__.state_metric.state('starting')
-
-        input_queue = params.get("input_queue")
-        output_queue = params.get("output_queue")
-        subscriber = params.get("subscriber")
-        input_schema = params.get("input_schema")
-        output_schema = params.get("output_schema")
-
-        if not hasattr(__class__, "request_metric"):
-            __class__.request_metric = Histogram(
-                'request_latency', 'Request latency (seconds)'
-            )
+        self.output_queue = params.get("output_queue")
+        self.output_schema = params.get("output_schema")

         if not hasattr(__class__, "output_metric"):
             __class__.output_metric = Counter(
                 'output_count', 'Output items created'
             )

-        if not hasattr(__class__, "pubsub_metric"):
-            __class__.pubsub_metric = Info(
-                'pubsub', 'Pub/sub configuration'
-            )
-
-        if not hasattr(__class__, "processing_metric"):
-            __class__.processing_metric = Counter(
-                'processing_count', 'Processing count', ["status"]
-            )
-
         __class__.pubsub_metric.info({
-            "input_queue": input_queue,
-            "output_queue": output_queue,
-            "subscriber": subscriber,
-            "input_schema": input_schema.__name__,
-            "output_schema": output_schema.__name__,
+            "input_queue": self.input_queue,
+            "output_queue": self.output_queue,
+            "subscriber": self.subscriber,
+            "input_schema": self.input_schema.__name__,
+            "output_schema": self.output_schema.__name__,
+            "rate_limit_retry": str(self.rate_limit_retry),
+            "rate_limit_timeout": str(self.rate_limit_timeout),
         })

-        super(ConsumerProducer, self).__init__(**params)
-
-        if input_schema == None:
-            raise RuntimeError("input_schema must be specified")
-
-        if output_schema == None:
+        if self.output_schema == None:
             raise RuntimeError("output_schema must be specified")

         self.producer = self.client.create_producer(
-            topic=output_queue,
-            schema=JsonSchema(output_schema),
+            topic=self.output_queue,
+            schema=JsonSchema(self.output_schema),
             chunking_enabled=True,
         )

-        self.consumer = self.client.subscribe(
-            input_queue, subscriber,
-            consumer_type=pulsar.ConsumerType.Shared,
-            schema=JsonSchema(input_schema),
-        )
-
-    def run(self):
-
-        __class__.state_metric.state('running')
-
-        while True:
-
-            msg = self.consumer.receive()
-
-            try:
-                with __class__.request_metric.time():
-                    resp = self.handle(msg)
-                # Acknowledge successful processing of the message
-                self.consumer.acknowledge(msg)
-                __class__.processing_metric.labels(status="success").inc()
-            except TooManyRequests:
-                self.consumer.negative_acknowledge(msg)
-                print("TooManyRequests: will retry")
-                __class__.processing_metric.labels(status="rate-limit").inc()
-                time.sleep(5)
-                continue
-            except Exception as e:
-                print("Exception:", e, flush=True)
-                # Message failed to be processed
-                self.consumer.negative_acknowledge(msg)
-                __class__.processing_metric.labels(status="error").inc()
+        print("Initialised consumer/producer.")

     def send(self, msg, properties={}):
         self.producer.send(msg, properties)
@@ -120,19 +52,7 @@ class ConsumerProducer(BaseProcessor):
         default_output_queue,
     ):

-        BaseProcessor.add_args(parser)
-
-        parser.add_argument(
-            '-i', '--input-queue',
-            default=default_input_queue,
-            help=f'Input queue (default: {default_input_queue})'
-        )
-
-        parser.add_argument(
-            '-s', '--subscriber',
-            default=default_subscriber,
-            help=f'Queue subscriber name (default: {default_subscriber})'
-        )
+        Consumer.add_args(parser, default_input_queue, default_subscriber)

         parser.add_argument(
             '-o', '--output-queue',
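
Since ConsumerProducer now inherits Consumer's run() loop, a concrete processor only implements handle() and emits results with send(); the rate-limit retry and metrics come along for free. A hedged sketch, assuming a hypothetical Echo processor that is not part of the commit:

    # Illustrative processor on the refactored base class.
    from . consumer_producer import ConsumerProducer

    class Echo(ConsumerProducer):

        def handle(self, msg):
            req = msg.value()   # decoded via input_schema
            # Raising TooManyRequests here would engage the inherited
            # retry loop; on success, emit the result downstream.
            self.send(req, properties=msg.properties())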

View file

@@ -8,7 +8,3 @@ class LlmError(Exception):

 class ParseError(Exception):
     pass

View file

@@ -267,6 +267,7 @@ class Processor(ConsumerProducer):

         except Exception as e:

+            print(type(e))
             print(f"Exception: {e}")
             print("Send error response...", flush=True)

View file

@@ -158,25 +158,15 @@ class Processor(ConsumerProducer):

         except TooManyRequests:

-            print("Send rate limit response...", flush=True)
+            print("Rate limit...")

-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()

         except Exception as e:

+            # Apart from rate limits, treat all exceptions as unrecoverable
             print(f"Exception: {e}")
             print("Send error response...", flush=True)

View file

@@ -4,10 +4,9 @@ Simple LLM service, performs text prompt completion using the Azure
 OpenAI endpoint service. Input is prompt, output is response.
 """

-import requests
 import json
 from prometheus_client import Histogram
-from openai import AzureOpenAI
+from openai import AzureOpenAI, RateLimitError
 import os

 from .... schema import TextCompletionRequest, TextCompletionResponse, Error

@@ -126,30 +125,27 @@ class Processor(ConsumerProducer):

             print(f"Output Tokens: {outputtokens}", flush=True)
             print("Send response...", flush=True)

-            r = TextCompletionResponse(response=resp.choices[0].message.content, error=None, in_token=inputtokens, out_token=outputtokens, model=self.model)
+            r = TextCompletionResponse(
+                response=resp.choices[0].message.content,
+                error=None,
+                in_token=inputtokens,
+                out_token=outputtokens,
+                model=self.model
+            )

             self.producer.send(r, properties={"id": id})

-        except TooManyRequests:
+        except RateLimitError:

             print("Send rate limit response...", flush=True)

-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()

         except Exception as e:

+            # Apart from rate limits, treat all exceptions as unrecoverable
             print(f"Exception: {e}")
             print("Send error response...", flush=True)

View file

@@ -87,8 +87,6 @@ class Processor(ConsumerProducer):

         try:

-            # FIXME: Rate limits?
-
             with __class__.text_completion_metric.time():

                 response = message = self.claude.messages.create(

@@ -117,34 +115,26 @@ class Processor(ConsumerProducer):

             print(f"Output Tokens: {outputtokens}", flush=True)
             print("Send response...", flush=True)

-            r = TextCompletionResponse(response=resp, error=None, in_token=inputtokens, out_token=outputtokens, model=self.model)
+            r = TextCompletionResponse(
+                response=resp,
+                error=None,
+                in_token=inputtokens,
+                out_token=outputtokens,
+                model=self.model
+            )

             self.send(r, properties={"id": id})

             print("Done.", flush=True)

-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except TooManyRequests:
+        except anthropic.RateLimitError:

-            print("Send rate limit response...", flush=True)
-
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()

         except Exception as e:

+            # Apart from rate limits, treat all exceptions as unrecoverable
             print(f"Exception: {e}")
             print("Send error response...", flush=True)

View file

@@ -112,27 +112,15 @@ class Processor(ConsumerProducer):

         # FIXME: Wrong exception, don't know what this LLM throws
         # for a rate limit
-        except TooManyRequests:
+        except cohere.TooManyRequestsError:

-            print("Send rate limit response...", flush=True)
-
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()

         except Exception as e:

+            # Apart from rate limits, treat all exceptions as unrecoverable
             print(f"Exception: {e}")
             print("Send error response...", flush=True)

View file

@@ -88,7 +88,8 @@ class Processor(ConsumerProducer):

             HarmCategory.HARM_CATEGORY_HARASSMENT: block_level,
             HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: block_level,
             HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: block_level,
-            # There is a documentation conflict on whether or not CIVIC_INTEGRITY is a valid category
+            # There is a documentation conflict on whether or not
+            # CIVIC_INTEGRITY is a valid category
             # HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY: block_level,
         }

@@ -122,8 +123,6 @@ class Processor(ConsumerProducer):

         try:

-            # FIXME: Rate limits?
-
             with __class__.text_completion_metric.time():

                 chat_session = self.llm.start_chat(

@@ -140,35 +139,30 @@ class Processor(ConsumerProducer):

             print(f"Output Tokens: {outputtokens}", flush=True)
             print("Send response...", flush=True)

-            r = TextCompletionResponse(response=resp, error=None, in_token=inputtokens, out_token=outputtokens, model=self.model)
+            r = TextCompletionResponse(
+                response=resp,
+                error=None,
+                in_token=inputtokens,
+                out_token=outputtokens,
+                model=self.model
+            )

             self.send(r, properties={"id": id})

             print("Done.", flush=True)

-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
         except ResourceExhausted as e:

-            print("Send rate limit response...", flush=True)
+            print("Hit rate limit:", e, flush=True)

-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the default handler
+            raise TooManyRequests()

         except Exception as e:

-            print(f"Exception: {e}")
+            # Apart from rate limits, treat all exceptions as unrecoverable
+            print(type(e), flush=True)
+            print(f"Exception: {e}", flush=True)
             print("Send error response...", flush=True)

View file

@@ -126,26 +126,7 @@ class Processor(ConsumerProducer):

             print("Done.", flush=True)

-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except TooManyRequests:
-
-            print("Send rate limit response...", flush=True)
-
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+        # SLM, presumably there aren't rate limits

         except Exception as e:

View file

@@ -100,26 +100,7 @@ class Processor(ConsumerProducer):

             print("Done.", flush=True)

-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except TooManyRequests:
-
-            print("Send rate limit response...", flush=True)
-
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+        # SLM, presumably no rate limits

         except Exception as e:

View file

@@ -4,7 +4,7 @@ Simple LLM service, performs text prompt completion using OpenAI.
 Input is prompt, output is response.
 """

-from openai import OpenAI
+from openai import OpenAI, RateLimitError
 from prometheus_client import Histogram
 import os

@@ -87,8 +87,6 @@ class Processor(ConsumerProducer):

         try:

-            # FIXME: Rate limits
-
             with __class__.text_completion_metric.time():

                 resp = self.openai.chat.completions.create(

@@ -134,27 +132,15 @@ class Processor(ConsumerProducer):

         # FIXME: Wrong exception, don't know what this LLM throws
         # for a rate limit
-        except TooManyRequests:
+        except openai.RateLimitError:

-            print("Send rate limit response...", flush=True)
-
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()

         except Exception as e:

+            # Apart from rate limits, treat all exceptions as unrecoverable
             print(f"Exception: {e}")
             print("Send error response...", flush=True)

View file

@@ -178,25 +178,15 @@ class Processor(ConsumerProducer):

         except google.api_core.exceptions.ResourceExhausted as e:

-            print("Send rate limit response...", flush=True)
+            print("Hit rate limit:", e, flush=True)

-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()

         except Exception as e:

+            # Apart from rate limits, treat all exceptions as unrecoverable
             print(f"Exception: {e}")
             print("Send error response...", flush=True)