mirror of
https://github.com/katanemo/plano.git
synced 2026-04-27 01:36:33 +02:00
use passed in model name in chat completion request (#445)
This commit is contained in:
parent
bd8004d1ae
commit
eb48f3d5bb
20 changed files with 364 additions and 89 deletions
|
|
@ -1,7 +1,7 @@
|
|||
use crate::configuration;
|
||||
use configuration::{Limit, Ratelimit, TimeUnit};
|
||||
use governor::{DefaultKeyedRateLimiter, InsufficientCapacity, Quota};
|
||||
use log::debug;
|
||||
use log::trace;
|
||||
use std::fmt::Display;
|
||||
use std::num::{NonZero, NonZeroU32};
|
||||
use std::sync::RwLock;
|
||||
|
|
@ -99,9 +99,11 @@ impl RatelimitMap {
|
|||
selector: Header,
|
||||
tokens_used: NonZeroU32,
|
||||
) -> Result<(), Error> {
|
||||
debug!(
|
||||
trace!(
|
||||
"Checking limit for provider={}, with selector={:?}, consuming tokens={:?}",
|
||||
provider, selector, tokens_used
|
||||
provider,
|
||||
selector,
|
||||
tokens_used
|
||||
);
|
||||
|
||||
let provider_limits = match self.datastore.get(&provider) {
|
||||
|
|
|
|||
|
|
@ -1,19 +1,25 @@
|
|||
use log::trace;
|
||||
|
||||
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
|
||||
#[allow(dead_code)]
|
||||
pub enum Error {
|
||||
#[error("Unknown model: {model_name}")]
|
||||
UnknownModel { model_name: String },
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn token_count(model_name: &str, text: &str) -> Result<usize, Error> {
|
||||
pub fn token_count(model_name: &str, text: &str) -> Result<usize, String> {
|
||||
trace!("getting token count model={}", model_name);
|
||||
//HACK: add support for tokenizing mistral and other models
|
||||
//filed issue https://github.com/katanemo/arch/issues/222
|
||||
|
||||
let updated_model = match model_name.starts_with("gpt") {
|
||||
false => {
|
||||
trace!(
|
||||
"tiktoken_rs: unsupported model: {}, using gpt-4 to compute token count",
|
||||
model_name
|
||||
);
|
||||
|
||||
"gpt-4"
|
||||
}
|
||||
true => model_name,
|
||||
};
|
||||
|
||||
// Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton?
|
||||
let bpe = tiktoken_rs::get_bpe_from_model(model_name).map_err(|_| Error::UnknownModel {
|
||||
model_name: model_name.to_string(),
|
||||
})?;
|
||||
let bpe = tiktoken_rs::get_bpe_from_model(updated_model).map_err(|e| e.to_string())?;
|
||||
Ok(bpe.encode_ordinary(text).len())
|
||||
}
|
||||
|
||||
|
|
@ -30,14 +36,4 @@ mod test {
|
|||
token_count(model_name, text).expect("correct tokenization")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn unrecognized_model() {
|
||||
assert_eq!(
|
||||
Error::UnknownModel {
|
||||
model_name: "unknown".to_string()
|
||||
},
|
||||
token_count("unknown", "").expect_err("unknown model")
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue