mirror of
https://github.com/katanemo/plano.git
synced 2026-04-28 02:23:56 +02:00
split wasm filter (#186)
* split wasm filter * fix int and unit tests * rename public_types => common and move common code there * rename * fix int test
This commit is contained in:
parent
b1746b38b4
commit
3bd2ffe9fb
41 changed files with 5755 additions and 351 deletions
39
crates/common/src/tokenizer.rs
Normal file
39
crates/common/src/tokenizer.rs
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
use log::debug;
|
||||
|
||||
/// Errors that can occur while computing a token count.
#[derive(Debug, PartialEq, Eq)]
#[allow(dead_code)]
pub enum Error {
    /// The model name was not recognized by the tokenizer registry.
    UnknownModel,
    /// Tokenization of the input text failed.
    /// NOTE(review): never constructed in this module — presumably reserved
    /// for callers/future use (hence the `dead_code` allow above).
    FailedToTokenize,
}

impl std::fmt::Display for Error {
    // Human-readable message, suitable for logs and user-facing errors.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::UnknownModel => write!(f, "unknown model"),
            Error::FailedToTokenize => write!(f, "failed to tokenize"),
        }
    }
}

// Implementing the standard Error trait lets callers propagate this with `?`
// into `Box<dyn std::error::Error>` and compose it with other error types.
impl std::error::Error for Error {}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn token_count(model_name: &str, text: &str) -> Result<usize, Error> {
|
||||
debug!("getting token count model={}", model_name);
|
||||
// Consideration: is it more expensive to instantiate the BPE object every time, or to contend the singleton?
|
||||
let bpe = tiktoken_rs::get_bpe_from_model(model_name).map_err(|_| Error::UnknownModel)?;
|
||||
Ok(bpe.encode_ordinary(text).len())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    // A known model/sentence pair should produce a stable, exact token count.
    #[test]
    fn encode_ordinary() {
        let model_name = "gpt-3.5-turbo";
        let text = "How many tokens does this sentence have?";
        let count = token_count(model_name, text).expect("correct tokenization");
        assert_eq!(8, count);
    }

    // A model name the registry does not know must surface UnknownModel.
    #[test]
    fn unrecognized_model() {
        let err = token_count("unknown", "").expect_err("unknown model");
        assert_eq!(Error::UnknownModel, err)
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue