Refactor rate limit handling (#280)

* - Refactored retry for rate limits into the base class
  - ConsumerProducer is derived from Consumer to simplify code
  - Added rate_limit_count metrics for rate limit events
* Add rate limit events to VertexAI and Google AI Studio
* Added Grafana rate limit dashboard
* Add rate limit handling to all LLMs
2026-04-28 01:46:22 +02:00 · 2025-01-27 17:04:49 +00:00 · 2025-01-27 17:04:49 +00:00 · 0e03bc05a4
commit 0e03bc05a4
parent 26a586034c
14 changed files with 174 additions and 298 deletions
--- a/trustgraph-vertexai/trustgraph/model/text_completion/vertexai/llm.py
+++ b/trustgraph-vertexai/trustgraph/model/text_completion/vertexai/llm.py
@@ -178,25 +178,15 @@ class Processor(ConsumerProducer):

        except google.api_core.exceptions.ResourceExhausted as e:

-            print("Send rate limit response...", flush=True)
+            print("Hit rate limit:", e, flush=True)

-            r = TextCompletionResponse(
-                error=Error(
-                    type = "rate-limit",
-                    message = str(e),
-                ),
-                response=None,
-                in_token=None,
-                out_token=None,
-                model=None,
-            )
-
-            self.producer.send(r, properties={"id": id})
-
-            self.consumer.acknowledge(msg)
+            # Leave rate limit retries to the base handler
+            raise TooManyRequests()

        except Exception as e:

+            # Apart from rate limits, treat all exceptions as unrecoverable
+
            print(f"Exception: {e}")

            print("Send error response...", flush=True)