diff --git a/crates/common/src/tokenizer.rs b/crates/common/src/tokenizer.rs
index 11aada32..d2611375 100644
--- a/crates/common/src/tokenizer.rs
+++ b/crates/common/src/tokenizer.rs
@@ -1,14 +1,7 @@
 use log::trace;
 
-#[derive(thiserror::Error, Debug, PartialEq, Eq)]
 #[allow(dead_code)]
-pub enum Error {
-    #[error("Unknown model: {model_name}")]
-    UnknownModel { model_name: String },
-}
-
-#[allow(dead_code)]
-pub fn token_count(model_name: &str, text: &str) -> Result<usize, Error> {
+pub fn token_count(model_name: &str, text: &str) -> Result<usize, String> {
     trace!("getting token count model={}", model_name);
     //HACK: add support for tokenizing mistral and other models
     //filed issue https://github.com/katanemo/arch/issues/222
@@ -26,9 +19,7 @@ pub fn token_count(model_name: &str, text: &str) -> Result<usize, Error> {
     };
 
     // Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton?
-    let bpe = tiktoken_rs::get_bpe_from_model(updated_model).map_err(|_| Error::UnknownModel {
-        model_name: updated_model.to_string(),
-    })?;
+    let bpe = tiktoken_rs::get_bpe_from_model(updated_model).map_err(|e| e.to_string())?;
     Ok(bpe.encode_ordinary(text).len())
 }
 
@@ -45,12 +36,4 @@ mod test {
             token_count(model_name, text).expect("correct tokenization")
         );
     }
-
-    #[test]
-    fn unrecognized_model() {
-        assert_eq!(
-            2,
-            token_count("unknown model", "hello world").expect("correct tokenization")
-        )
-    }
 }