mirror of
https://github.com/katanemo/plano.git
synced 2026-04-25 00:36:34 +02:00
247 lines
7.9 KiB
Python
247 lines
7.9 KiB
Python
from fastapi import FastAPI, Response, HTTPException
|
|
from pydantic import BaseModel
|
|
from app.load_models import (
|
|
load_transformers,
|
|
load_guard_model,
|
|
load_zero_shot_models,
|
|
get_device,
|
|
)
|
|
import os
|
|
from app.utils import (
|
|
GuardHandler,
|
|
split_text_into_chunks,
|
|
load_yaml_config,
|
|
get_model_server_logger,
|
|
)
|
|
import torch
|
|
import yaml
|
|
import string
|
|
import time
|
|
import logging
|
|
from app.arch_fc.arch_fc import chat_completion as arch_fc_chat_completion, ChatMessage
|
|
import os.path
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Module start-up. Everything below runs once, at import time, and builds the
# module-level objects that the request handlers read.
# ---------------------------------------------------------------------------
logger = get_model_server_logger()
logger.info(f"Devices Available: {get_device()}")

# `transformers` is indexed by the handlers with "model_name", "tokenizer" and
# "model"; `zero_shot_models` with "model_name" and "pipeline".
transformers = load_transformers()
zero_shot_models = load_zero_shot_models()
guard_model_config = load_yaml_config("guard_model_config.yaml")

# MODE comes from the environment and defaults to "cloud". It is both logged
# and printed before being validated.
mode = os.getenv("MODE", "cloud")
logger.info(f"Serving model mode: {mode}")
print(f"Serving model mode: {mode}")
if mode not in ["cloud", "local-gpu", "local-cpu"]:
    raise ValueError(f"Invalid mode: {mode}")

# "local-cpu" forces CPU; every other mode uses the GPU only when CUDA is
# actually available.
if mode == "local-cpu":
    hardware = "cpu"
else:
    hardware = "gpu" if torch.cuda.is_available() else "cpu"

# The guard config is keyed by model kind and then by hardware. Only the
# jailbreak model is loaded; the handler is built without a toxic model.
jailbreak_model = load_guard_model(guard_model_config["jailbreak"][hardware], hardware)
guard_handler = GuardHandler(toxic_model=None, jailbreak_model=jailbreak_model)

app = FastAPI()
|
|
|
|
|
|
class EmbeddingRequest(BaseModel):
    """Request body accepted by ``POST /embeddings``."""

    # Text to embed.
    input: str
    # Requested model name; the handler rejects anything other than the
    # loaded embedding model's name.
    model: str
|
|
|
|
|
|
@app.get("/healthz")
async def healthz():
    """Health probe: unconditionally answers with a status of ``"ok"``."""
    payload = {"status": "ok"}
    return payload
|
|
|
|
|
|
@app.get("/models")
async def models():
    """List the models served here.

    The list holds a single entry: the loaded embedding model, identified by
    ``transformers["model_name"]``.
    """
    embedding_model = {"id": transformers["model_name"], "object": "model"}
    return {"data": [embedding_model], "object": "list"}
|
|
|
|
|
|
@app.post("/embeddings")
async def embedding(req: EmbeddingRequest, res: Response):
    """Compute an L2-normalised embedding for ``req.input``.

    Args:
        req: Text to embed plus the requested model name.
        res: FastAPI response object (not used by this handler).

    Returns:
        A dict with ``data`` (one ``{"object", "embedding", "index"}`` entry
        per row of the model output), ``model``, ``object`` and ``usage``.
        The ``usage`` token counts are always reported as 0.

    Raises:
        HTTPException: 400 when ``req.model`` is not the loaded model's name.
    """
    logger.info(f"Embedding req: {req}")
    if req.model != transformers["model_name"]:
        raise HTTPException(status_code=400, detail="unknown model: " + req.model)

    start = time.time()
    encoded_input = transformers["tokenizer"](
        req.input, padding=True, truncation=True, return_tensors="pt"
    )

    # Pure inference: disabling autograd avoids building a graph that is
    # never used and keeps the outputs free of grad requirements.
    with torch.no_grad():
        model_output = transformers["model"](**encoded_input)
        # First model output, position 0 of every sequence
        # (presumably the [CLS] token vector -- TODO confirm for this model).
        first_token_vectors = model_output[0][:, 0]
        # normalize embeddings
        normalized = torch.nn.functional.normalize(first_token_vectors, p=2, dim=1)

    # .cpu() is a no-op for CPU tensors and makes .numpy() safe when the
    # model produced its output on another device.
    vectors = normalized.cpu().numpy()
    logger.info(f"Embedding Call Complete Time: {time.time()-start}")

    data = [
        {"object": "embedding", "embedding": vector, "index": index}
        for index, vector in enumerate(vectors.tolist())
    ]

    usage = {
        "prompt_tokens": 0,
        "total_tokens": 0,
    }
    return {"data": data, "model": req.model, "object": "list", "usage": usage}
|
|
|
|
|
|
class GuardRequest(BaseModel):
    """Request body accepted by ``POST /guard``."""

    # Text to screen.
    input: str
    # Which check to run; the handler recognises "both", "toxic" and
    # "jailbreak".
    task: str
|
|
|
|
|
|
def _as_python_number(value):
    """Return ``value.item()`` when ``value`` has a callable ``item``, else ``value``."""
    item = getattr(value, "item", None)
    return item() if callable(item) else value


@app.post("/guard")
async def guard(req: GuardRequest, res: Response):
    """
    Guard API, take input as text and return the prediction of toxic and jailbreak

    Inputs shorter than 300 whitespace-separated words are passed to the guard
    handler in one call and its result is returned unchanged. Longer inputs are
    split into chunks, each chunk is scored, and the results are merged.

    Merged result format: dictionary
        "toxic_prob": list of probabilities of the chunks flagged as toxic,
        "jailbreak_prob": list of probabilities of the chunks flagged as jailbreak,
        "time": sum of the per-chunk times reported by the handler,
        "toxic_verdict": True if any chunk was flagged as toxic,
        "jailbreak_verdict": True if any chunk was flagged as jailbreak,
        "toxic_sentence": list of the flagged chunks' toxic sentences,
        "jailbreak_sentence": list of the flagged chunks' jailbreak sentences,

    Args:
        req: Text to screen and the task to run.
        res: FastAPI response object (not used by this handler).
    """
    # NOTE(review): block nesting was reconstructed from a listing that had
    # lost its indentation -- confirm against the repository copy.
    max_words = 300
    start = time.time()
    if req.task in ["both", "toxic", "jailbreak"]:
        guard_handler.task = req.task
    # Any other task value leaves the shared handler on its current task.

    if len(req.input.split()) < max_words:
        final_result = guard_handler.guard_predict(req.input)
    else:
        # text is long, split into chunks
        chunks = split_text_into_chunks(req.input)
        final_result = {
            "toxic_prob": [],
            "jailbreak_prob": [],
            "time": 0,
            "toxic_verdict": False,
            "jailbreak_verdict": False,
            "toxic_sentence": [],
            "jailbreak_sentence": [],
        }
        task = guard_handler.task
        # "both" aggregates the two categories; otherwise only the active one.
        categories = ("toxic", "jailbreak") if task == "both" else (task,)
        for chunk in chunks:
            result_chunk = guard_handler.guard_predict(chunk)
            final_result["time"] += result_chunk["time"]
            for category in categories:
                if not result_chunk[f"{category}_verdict"]:
                    continue
                final_result[f"{category}_verdict"] = True
                final_result[f"{category}_sentence"].append(
                    result_chunk[f"{category}_sentence"]
                )
                # Probabilities are stored as plain numbers whether the
                # handler reports them as tensors or as Python floats.
                final_result[f"{category}_prob"].append(
                    _as_python_number(result_chunk[f"{category}_prob"])
                )

    end = time.time()
    logger.info(f"Time taken for Guard: {end - start}")
    return final_result
|
|
|
|
|
|
class ZeroShotRequest(BaseModel):
    """Request body accepted by ``POST /zeroshot``."""

    # Text to classify.
    input: str
    # Candidate labels to score the text against.
    labels: list[str]
    # Requested model name; the handler rejects anything other than the
    # loaded zero-shot model's name.
    model: str
|
|
|
|
|
|
def remove_punctuations(s, lower=True):
    """Replace ASCII punctuation with spaces and collapse whitespace.

    Every character in ``string.punctuation`` becomes a space, runs of
    whitespace are squeezed to single spaces, and leading/trailing whitespace
    is dropped. The result is lower-cased unless ``lower`` is false.
    """
    punctuation_to_space = str.maketrans({mark: " " for mark in string.punctuation})
    words = s.translate(punctuation_to_space).split()
    cleaned = " ".join(words)
    return cleaned.lower() if lower else cleaned
|
|
|
|
|
|
@app.post("/zeroshot")
async def zeroshot(req: ZeroShotRequest, res: Response):
    """Score ``req.input`` against ``req.labels`` with the zero-shot classifier.

    Labels are stripped of punctuation and lower-cased before being given to
    the classifier; the response is keyed by the caller's original labels.

    Args:
        req: Text, candidate labels and the requested model name.
        res: FastAPI response object (not used by this handler).

    Returns:
        A dict with ``predicted_class`` (the first label the classifier
        returns), ``predicted_class_score``, ``scores`` (original label ->
        score) and ``model``.

    Raises:
        HTTPException: 400 when ``req.model`` is not the loaded model's name,
            or when ``req.labels`` is empty.
    """
    logger.info(f"zero-shot request: {req}")
    if req.model != zero_shot_models["model_name"]:
        raise HTTPException(status_code=400, detail="unknown model: " + req.model)
    # Without labels there is nothing to rank and no first label to read
    # below; reject the request rather than fail with an unhandled error.
    if not req.labels:
        raise HTTPException(status_code=400, detail="labels must not be empty")

    classifier = zero_shot_models["pipeline"]
    labels_without_punctuations = [remove_punctuations(label) for label in req.labels]
    start = time.time()
    predicted_classes = classifier(
        req.input, candidate_labels=labels_without_punctuations, multi_label=True
    )
    # Cleaned label -> label as sent by the caller. If two labels clean to the
    # same text, the later one wins.
    label_map = dict(zip(labels_without_punctuations, req.labels))

    orig_map = [label_map[label] for label in predicted_classes["labels"]]
    final_scores = dict(zip(orig_map, predicted_classes["scores"]))
    # The first returned label is taken as the prediction.
    predicted_class = label_map[predicted_classes["labels"][0]]
    logger.info(f"zero-shot taking {time.time()-start} seconds")

    return {
        "predicted_class": predicted_class,
        "predicted_class_score": final_scores[predicted_class],
        "scores": final_scores,
        "model": req.model,
    }
|
|
|
|
|
|
class HallucinationRequest(BaseModel):
    """Request body accepted by ``POST /hallucination``."""

    # Prompt from the user that the parameters are checked against.
    prompt: str
    # Parameter name -> value, e.g. {"name": "John", "age": "25"}.
    parameters: dict
    # Requested model name; the handler rejects anything other than the
    # loaded zero-shot model's name.
    model: str
|
|
|
|
|
|
@app.post("/hallucination")
async def hallucination(req: HallucinationRequest, res: Response):
    """
    Hallucination API, take input as text and return the prediction of hallucination for each parameter

    Each parameter is turned into the statement "<name> is <value>" and scored
    against the prompt by the zero-shot classifier.

    parameters: dictionary of parameters and values
        example {"name": "John", "age": "25"}
    prompt: input prompt from the user

    Returns a dict with "params_scores" (parameter name -> score of its
    statement) and "model". Empty parameters yield empty "params_scores".

    Raises HTTPException 400 when req.model is not the loaded model's name.
    """
    if req.model != zero_shot_models["model_name"]:
        raise HTTPException(status_code=400, detail="unknown model: " + req.model)

    start = time.time()
    classifier = zero_shot_models["pipeline"]
    label_by_param = {k: f"{k} is {v}" for k, v in req.parameters.items()}
    hypothesis_template = "{}"

    if label_by_param:
        result = classifier(
            req.prompt,
            candidate_labels=list(label_by_param.values()),
            hypothesis_template=hypothesis_template,
            multi_label=True,
        )
        # The classifier returns labels and scores ordered by score, not in
        # the order they were passed in, so each score is matched to its
        # parameter through the label text rather than by position.
        score_by_label = dict(zip(result["labels"], result["scores"]))
        result_params = {
            name: score_by_label[label] for name, label in label_by_param.items()
        }
    else:
        # Nothing to check; do not call the classifier with no labels.
        result_params = {}

    logger.info(
        f"hallucination result: {result_params}, taking {time.time()-start} seconds"
    )

    return {
        "params_scores": result_params,
        "model": req.model,
    }
|
|
|
|
|
|
@app.post("/v1/chat/completions")
async def chat_completion(req: ChatMessage, res: Response):
    """Delegate the request to the arch_fc chat-completion handler and return its result."""
    return await arch_fc_chat_completion(req, res)
|