Refactor rate limit handling (#280)

* Refactored retry for rate limits into the base class
* ConsumerProducer is now derived from Consumer, to simplify the code
* Added a rate_limit_count metric for rate limit events

* Add rate limit events to VertexAI and Google AI Studio

* Added Grafana rate limit dashboard

* Add rate limit handling to all LLMs
Commit 0e03bc05a4 (parent 26a586034c), authored by cybermaggedon on 2025-01-27 17:04:49 +00:00 and committed by GitHub.
14 changed files with 174 additions and 298 deletions
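
The base-class retry loop itself sits outside the hunks below, so what follows is only a minimal sketch of the pattern the per-LLM processors now rely on: each processor raises a shared TooManyRequests sentinel, and the base consumer retries the message while bumping the rate_limit_count metric that the Grafana dashboard presumably graphs. Apart from TooManyRequests and rate_limit_count, every name here (the method names, the backoff parameters, the metric help text) is an assumption, not code from this commit.

import time
from prometheus_client import Counter

class TooManyRequests(Exception):
    """Raised by a processor when the underlying LLM reports a rate limit."""

class Consumer:
    """Sketch of the base class the commit message refers to."""

    rate_limit_metric = Counter("rate_limit_count", "Rate limit events")

    def handle(self, msg):
        # Implemented by each LLM processor
        raise NotImplementedError

    def run_with_retry(self, msg, retries=10, delay=15):
        # Retry only on rate limits; any other exception propagates unchanged
        for _ in range(retries):
            try:
                return self.handle(msg)
            except TooManyRequests:
                Consumer.rate_limit_metric.inc()
                print("Rate limit, retrying...", flush=True)
                time.sleep(delay)
        raise TooManyRequests("Rate limit retries exhausted")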

View file

@@ -158,25 +158,15 @@ class Processor(ConsumerProducer):
         except TooManyRequests:
-            print("Send rate limit response...", flush=True)
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-            self.producer.send(r, properties={"id": id})
-            self.consumer.acknowledge(msg)
+            print("Rate limit...")
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()
         except Exception as e:
+            # Apart from rate limits, treat all exceptions as unrecoverable
             print(f"Exception: {e}")
             print("Send error response...", flush=True)

View file

@@ -4,10 +4,9 @@ Simple LLM service, performs text prompt completion using the Azure
 OpenAI endpoit service. Input is prompt, output is response.
 """
 import requests
 import json
 from prometheus_client import Histogram
-from openai import AzureOpenAI
+from openai import AzureOpenAI, RateLimitError
 import os
 from .... schema import TextCompletionRequest, TextCompletionResponse, Error
@@ -126,30 +125,27 @@ class Processor(ConsumerProducer):
             print(f"Output Tokens: {outputtokens}", flush=True)
             print("Send response...", flush=True)
-            r = TextCompletionResponse(response=resp.choices[0].message.content, error=None, in_token=inputtokens, out_token=outputtokens, model=self.model)
+            r = TextCompletionResponse(
+                response=resp.choices[0].message.content,
+                error=None,
+                in_token=inputtokens,
+                out_token=outputtokens,
+                model=self.model
+            )
             self.producer.send(r, properties={"id": id})
-        except TooManyRequests:
-            print("Send rate limit response...", flush=True)
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-            self.producer.send(r, properties={"id": id})
-            self.consumer.acknowledge(msg)
+        except RateLimitError:
+            print("Send rate limit response...", flush=True)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()
         except Exception as e:
+            # Apart from rate limits, treat all exceptions as unrecoverable
             print(f"Exception: {e}")
             print("Send error response...", flush=True)

View file

@@ -87,8 +87,6 @@ class Processor(ConsumerProducer):
         try:
-            # FIXME: Rate limits?
-
             with __class__.text_completion_metric.time():
                 response = message = self.claude.messages.create(
@@ -117,34 +115,26 @@ class Processor(ConsumerProducer):
             print(f"Output Tokens: {outputtokens}", flush=True)
             print("Send response...", flush=True)
-            r = TextCompletionResponse(response=resp, error=None, in_token=inputtokens, out_token=outputtokens, model=self.model)
+            r = TextCompletionResponse(
+                response=resp,
+                error=None,
+                in_token=inputtokens,
+                out_token=outputtokens,
+                model=self.model
+            )
             self.send(r, properties={"id": id})
             print("Done.", flush=True)
-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except TooManyRequests:
+        except anthropic.RateLimitError:
             print("Send rate limit response...", flush=True)
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-            self.producer.send(r, properties={"id": id})
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()
         except Exception as e:
+            # Apart from rate limits, treat all exceptions as unrecoverable
             print(f"Exception: {e}")
             print("Send error response...", flush=True)

View file

@@ -112,27 +112,15 @@ class Processor(ConsumerProducer):
-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except TooManyRequests:
+        except cohere.TooManyRequestsError:
             print("Send rate limit response...", flush=True)
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-            self.producer.send(r, properties={"id": id})
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()
         except Exception as e:
+            # Apart from rate limits, treat all exceptions as unrecoverable
             print(f"Exception: {e}")
             print("Send error response...", flush=True)

View file

@ -88,7 +88,8 @@ class Processor(ConsumerProducer):
HarmCategory.HARM_CATEGORY_HARASSMENT: block_level,
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: block_level,
HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: block_level,
# There is a documentation conflict on whether or not CIVIC_INTEGRITY is a valid category
# There is a documentation conflict on whether or not
# CIVIC_INTEGRITY is a valid category
# HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY: block_level,
}
@@ -122,8 +123,6 @@ class Processor(ConsumerProducer):
         try:
-            # FIXME: Rate limits?
-
             with __class__.text_completion_metric.time():
                 chat_session = self.llm.start_chat(
@@ -140,35 +139,30 @@ class Processor(ConsumerProducer):
             print(f"Output Tokens: {outputtokens}", flush=True)
             print("Send response...", flush=True)
-            r = TextCompletionResponse(response=resp, error=None, in_token=inputtokens, out_token=outputtokens, model=self.model)
+            r = TextCompletionResponse(
+                response=resp,
+                error=None,
+                in_token=inputtokens,
+                out_token=outputtokens,
+                model=self.model
+            )
             self.send(r, properties={"id": id})
             print("Done.", flush=True)
-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
         except ResourceExhausted as e:
-            print("Send rate limit response...", flush=True)
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-            self.producer.send(r, properties={"id": id})
-            self.consumer.acknowledge(msg)
+            print("Hit rate limit:", e, flush=True)
+            # Leave rate limit retries to the default handler
+            raise TooManyRequests()
         except Exception as e:
-            print(f"Exception: {e}")
+            # Apart from rate limits, treat all exceptions as unrecoverable
+            print(type(e), flush=True)
+            print(f"Exception: {e}", flush=True)
             print("Send error response...", flush=True)

View file

@@ -126,26 +126,7 @@ class Processor(ConsumerProducer):
             print("Done.", flush=True)
-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except TooManyRequests:
-            print("Send rate limit response...", flush=True)
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-            self.producer.send(r, properties={"id": id})
-            self.consumer.acknowledge(msg)
+        # SLM, presumably there aren't rate limits
         except Exception as e:

View file

@@ -100,26 +100,7 @@ class Processor(ConsumerProducer):
             print("Done.", flush=True)
-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except TooManyRequests:
-            print("Send rate limit response...", flush=True)
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-            self.producer.send(r, properties={"id": id})
-            self.consumer.acknowledge(msg)
+        # SLM, presumably no rate limits
         except Exception as e:

View file

@@ -4,7 +4,7 @@ Simple LLM service, performs text prompt completion using OpenAI.
 Input is prompt, output is response.
 """
-from openai import OpenAI
+from openai import OpenAI, RateLimitError
 from prometheus_client import Histogram
 import os
@@ -87,8 +87,6 @@ class Processor(ConsumerProducer):
         try:
-            # FIXME: Rate limits
-
             with __class__.text_completion_metric.time():
                 resp = self.openai.chat.completions.create(
@@ -134,27 +132,15 @@ class Processor(ConsumerProducer):
-        # FIXME: Wrong exception, don't know what this LLM throws
-        # for a rate limit
-        except TooManyRequests:
+        except openai.RateLimitError:
             print("Send rate limit response...", flush=True)
-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-            self.producer.send(r, properties={"id": id})
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()
         except Exception as e:
+            # Apart from rate limits, treat all exceptions as unrecoverable
             print(f"Exception: {e}")
             print("Send error response...", flush=True)