Fix gpu dependency and only leverage onnx when GPU is available (#157)

* replacing appending instead of write

* fix eetq dependency

* gpu guard required eetq

* fix bug when gpu is available

* fix for gpu device

* reverse

* fix

* replace gpu -> cuda
This commit is contained in:
Co Tran 2024-10-09 11:42:05 -07:00 committed by GitHub
parent 5c4a6bc8ff
commit 8b5db45507
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 18 additions and 14 deletions

View file

@@ -1,5 +1,5 @@
pub const DEFAULT_EMBEDDING_MODEL: &str = "katanemo/bge-large-en-v1.5-onnx"; pub const DEFAULT_EMBEDDING_MODEL: &str = "katanemo/bge-large-en-v1.5";
pub const DEFAULT_INTENT_MODEL: &str = "katanemo/deberta-base-nli-onnx"; pub const DEFAULT_INTENT_MODEL: &str = "katanemo/deberta-base-nli";
pub const DEFAULT_PROMPT_TARGET_THRESHOLD: f64 = 0.8; pub const DEFAULT_PROMPT_TARGET_THRESHOLD: f64 = 0.8;
pub const DEFAULT_HALLUCINATED_THRESHOLD: f64 = 0.1; pub const DEFAULT_HALLUCINATED_THRESHOLD: f64 = 0.1;
pub const RATELIMIT_SELECTOR_HEADER_KEY: &str = "x-arch-ratelimit-selector"; pub const RATELIMIT_SELECTOR_HEADER_KEY: &str = "x-arch-ratelimit-selector";

View file

@@ -1,3 +1,3 @@
jailbreak: jailbreak:
cpu: "katanemo/Arch-Guard-cpu" cpu: "katanemo/Arch-Guard-cpu"
gpu: "katanemo/Arch-Guard-gpu" gpu: "katanemo/Arch-Guard"

View file

@@ -1,6 +1,6 @@
import os import os
import sentence_transformers import sentence_transformers
from transformers import AutoTokenizer, pipeline from transformers import AutoTokenizer, AutoModel, pipeline
import sqlite3 import sqlite3
import torch import torch
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForSequenceClassification # type: ignore from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForSequenceClassification # type: ignore
@@ -18,16 +18,17 @@ def get_device():
return device return device
def load_transformers( def load_transformers(model_name=os.getenv("MODELS", "katanemo/bge-large-en-v1.5")):
model_name=os.getenv("MODELS", "katanemo/bge-large-en-v1.5-onnx")
):
print("Loading Embedding Model") print("Loading Embedding Model")
transformers = {} transformers = {}
device = get_device() device = get_device()
transformers["tokenizer"] = AutoTokenizer.from_pretrained(model_name) transformers["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
transformers["model"] = ORTModelForFeatureExtraction.from_pretrained( if device != "cuda":
model_name, device_map=device transformers["model"] = ORTModelForFeatureExtraction.from_pretrained(
) model_name, file_name="onnx/model.onnx"
)
else:
transformers["model"] = AutoModel.from_pretrained(model_name, device_map=device)
transformers["model_name"] = model_name transformers["model_name"] = model_name
return transformers return transformers
@@ -64,13 +65,16 @@ def load_guard_model(
def load_zero_shot_models( def load_zero_shot_models(
model_name=os.getenv("ZERO_SHOT_MODELS", "katanemo/deberta-base-nli-onnx") model_name=os.getenv("ZERO_SHOT_MODELS", "katanemo/deberta-base-nli")
): ):
zero_shot_model = {} zero_shot_model = {}
device = get_device() device = get_device()
zero_shot_model["model"] = ORTModelForSequenceClassification.from_pretrained( if device != "cuda":
model_name zero_shot_model["model"] = ORTModelForSequenceClassification.from_pretrained(
) model_name, file_name="onnx/model.onnx"
)
else:
zero_shot_model["model"] = AutoModel.from_pretrained(model_name)
zero_shot_model["tokenizer"] = AutoTokenizer.from_pretrained(model_name) zero_shot_model["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
# create pipeline # create pipeline