Use heuristic based tokenizer

This commit is contained in:
Adil Hafeez 2025-05-27 17:35:43 -07:00
parent d1542b988a
commit a74118238c
No known key found for this signature in database
GPG key ID: 9B18EF7691369645
3 changed files with 41 additions and 48 deletions

View file

@@ -1,18 +1,5 @@
use log::debug;
/// Common interface for anything that can count the tokens in a piece of text.
pub trait Tokenizer {
    /// Counts the tokens in `text` on behalf of the model named `model_name`.
    ///
    /// # Errors
    ///
    /// Returns `Err` with a `String` when the count cannot be produced; the
    /// exact failure conditions are decided by the implementor.
    fn token_count(&self, text: &str, model_name: &str) -> Result<usize, String>;
}
/// Stateless [`Tokenizer`] whose counting is delegated to the free function
/// `token_count` in this module.
///
/// NOTE(review): the name suggests a tiktoken-backed implementation; confirm
/// against the body of `token_count`.
pub struct TiktokenTokenizer {}

impl Tokenizer for TiktokenTokenizer {
    /// Forwards to the module-level `token_count` and returns its result unchanged.
    fn token_count(&self, text: &str, model_name: &str) -> Result<usize, String> {
        // The free function takes the model name first and the text second,
        // i.e. the reverse of this trait method's parameter order.
        token_count(model_name, text)
    }
}
#[allow(dead_code)]
pub fn token_count(model_name: &str, text: &str) -> Result<usize, String> {
debug!("getting token count model={}", model_name);