mirror of
https://github.com/katanemo/plano.git
synced 2026-05-24 14:05:14 +02:00
Salmanap/fix network agent demo (#153)
* staging my changes to rebase from main * adding debug statements to Rust * merged with main * ready to push network agent * removed the incomplete SQL example --------- Co-authored-by: Salman Paracha <salmanparacha@MacBook-Pro-261.local>
This commit is contained in:
parent
6acfea7787
commit
b63a01fe82
41 changed files with 252 additions and 1987 deletions
|
|
@ -35,11 +35,11 @@ def start_server():
|
|||
print("Server is already running. Use 'model_server restart' to restart it.")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Starting Archgw Model Server")
|
||||
print(f"Starting Archgw Model Server - Loading some awesomeness, this may take a little time.)")
|
||||
process = subprocess.Popen(
|
||||
["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "51000"],
|
||||
start_new_session=True,
|
||||
stdout=subprocess.DEVNULL, # Suppress standard output. There is a logger that model_server prints to
|
||||
stdout=subprocess.DEVNULL, # Suppress standard output. There is a logger that model_server prints to
|
||||
stderr=subprocess.DEVNULL, # Suppress standard error. There is a logger that model_server prints to
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForSequenc
|
|||
|
||||
|
||||
def get_device():
|
||||
|
||||
if torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
elif torch.backends.mps.is_available():
|
||||
|
|
@ -14,10 +15,12 @@ def get_device():
|
|||
else:
|
||||
device = "cpu"
|
||||
|
||||
print(f"Devices Avialble: {device}")
|
||||
return device
|
||||
|
||||
|
||||
def load_transformers(model_name=os.getenv("MODELS", "katanemo/bge-large-en-v1.5-onnx")):
|
||||
print("Loading Embedding Model")
|
||||
transformers = {}
|
||||
device = get_device()
|
||||
transformers["tokenizer"] = AutoTokenizer.from_pretrained(model_name)
|
||||
|
|
@ -33,6 +36,7 @@ def load_guard_model(
|
|||
model_name,
|
||||
hardware_config="cpu",
|
||||
):
|
||||
print("Loading Guard Model")
|
||||
guard_model = {}
|
||||
guard_model["tokenizer"] = AutoTokenizer.from_pretrained(
|
||||
model_name, trust_remote_code=True
|
||||
|
|
@ -58,9 +62,7 @@ def load_guard_model(
|
|||
return guard_model
|
||||
|
||||
|
||||
def load_zero_shot_models(
|
||||
model_name=os.getenv("ZERO_SHOT_MODELS", "katanemo/deberta-base-nli-onnx")
|
||||
):
|
||||
def load_zero_shot_models(model_name=os.getenv("ZERO_SHOT_MODELS", "katanemo/deberta-base-nli-onnx")):
|
||||
zero_shot_model = {}
|
||||
device = get_device()
|
||||
zero_shot_model["model"] = ORTModelForSequenceClassification.from_pretrained(
|
||||
|
|
@ -79,6 +81,5 @@ def load_zero_shot_models(
|
|||
|
||||
return zero_shot_model
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print(get_device())
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ guard_model_config = load_yaml_config("guard_model_config.yaml")
|
|||
|
||||
mode = os.getenv("MODE", "cloud")
|
||||
logger.info(f"Serving model mode: {mode}")
|
||||
print(f"Serving model mode: {mode}")
|
||||
if mode not in ["cloud", "local-gpu", "local-cpu"]:
|
||||
raise ValueError(f"Invalid mode: {mode}")
|
||||
if mode == "local-cpu":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue