Metering for all LLMs (#76)

* Fixed VertexAI token counts

* Slight fix for Cohere token count

* Slight tweak to AzureAI

* Fix for Prometheus for AzureAI
This commit is contained in:
Jack Colquitt 2024-09-29 15:59:17 -07:00 committed by GitHub
parent 74a14639bd
commit 2f23309f05
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 11 additions and 7 deletions

View file

@ -22,6 +22,7 @@ default_output_queue = text_completion_response_queue
default_subscriber = module
default_temperature = 0.0
default_max_output = 4192
default_model = "AzureAI"
class Processor(ConsumerProducer):
@ -34,6 +35,7 @@ class Processor(ConsumerProducer):
token = params.get("token")
temperature = params.get("temperature", default_temperature)
max_output = params.get("max_output", default_max_output)
model = default_model
super(Processor, self).__init__(
**params | {
@ -44,6 +46,7 @@ class Processor(ConsumerProducer):
"output_schema": TextCompletionResponse,
"temperature": temperature,
"max_output": max_output,
"model": model,
}
)
@ -64,6 +67,7 @@ class Processor(ConsumerProducer):
self.token = token
self.temperature = temperature
self.max_output = max_output
self.model = model
def build_prompt(self, system, content):
@ -140,7 +144,7 @@ class Processor(ConsumerProducer):
print("Send response...", flush=True)
r = TextCompletionResponse(response=resp, error=None, in_token=inputtokens, out_token=outputtokens, model="AzureAI")
r = TextCompletionResponse(response=resp, error=None, in_token=inputtokens, out_token=outputtokens, model=self.model)
self.producer.send(r, properties={"id": id})
except TooManyRequests:

View file

@ -91,8 +91,8 @@ class Processor(ConsumerProducer):
)
resp = output.text
inputtokens = output.meta.billed_units.input_tokens
outputtokens = output.meta.billed_units.output_tokens
inputtokens = int(output.meta.billed_units.input_tokens)
outputtokens = int(output.meta.billed_units.output_tokens)
print(resp, flush=True)
print(f"Input Tokens: {inputtokens}", flush=True)

View file

@ -142,14 +142,14 @@ class Processor(ConsumerProducer):
with __class__.text_completion_metric.time():
resp = self.llm.generate_content(
response = self.llm.generate_content(
prompt, generation_config=self.generation_config,
safety_settings=self.safety_settings
)
resp = resp.text
inputtokens = resp.usage_metadata.prompt_token_count
outputtokens = resp.usage_metadata.candidates_token_count
resp = response.text
inputtokens = int(response.usage_metadata.prompt_token_count)
outputtokens = int(response.usage_metadata.candidates_token_count)
print(resp, flush=True)
print(f"Input Tokens: {inputtokens}", flush=True)
print(f"Output Tokens: {outputtokens}", flush=True)