Support for many new model output types (#93)

* General model wrappers and methods supporting multi-label classifiers
* Support for pytorch multi-label binary classifier
* New model output types + single implementation of score method that supports multiple output types. 
* Anonymization with pytorch multi-output binary model
* Support for multi-label binary models in minimizer. 
* Support for multi-label logits/probabilities
---------
Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
abigailgold 2024-07-03 09:04:59 -04:00 committed by GitHub
parent e00535d120
commit 57e38ea4fa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 913 additions and 172 deletions

View file

@ -4,25 +4,29 @@ import pandas as pd
import scipy
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_diabetes
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from torch import nn, optim
from torch import nn, optim, sigmoid, where
from torch.nn import functional
from scipy.special import expit
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from apt.utils.datasets.datasets import PytorchData
from apt.utils.models.pytorch_model import PyTorchClassifier
from apt.minimization import GeneralizeToRepresentative
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from apt.utils.dataset_utils import get_iris_dataset_np, get_adult_dataset_pd, get_german_credit_dataset_pd
from apt.utils.datasets import ArrayDataset
from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor, KerasClassifier
from apt.utils.models import SklearnClassifier, SklearnRegressor, KerasClassifier, \
CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES, CLASSIFIER_SINGLE_OUTPUT_CATEGORICAL, \
CLASSIFIER_SINGLE_OUTPUT_CLASS_LOGITS, CLASSIFIER_MULTI_OUTPUT_BINARY_LOGITS
tf.compat.v1.disable_eager_execution()
@ -216,7 +220,7 @@ def test_minimizer_params(cells):
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(x, y))
expected_generalizations = {'categories': {}, 'category_representatives': {},
@ -258,7 +262,7 @@ def test_minimizer_params_not_transform(cells):
samples = ArrayDataset(x, y, features)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(x, y))
gen = GeneralizeToRepresentative(model, cells=cells, generalize_using_transform=False)
@ -270,7 +274,7 @@ def test_minimizer_fit(data_two_features):
x, y, features, _ = data_two_features
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
predictions = model.predict(ad)
@ -287,6 +291,7 @@ def test_minimizer_fit(data_two_features):
compare_generalizations(gener, expected_generalizations)
check_features(features, expected_generalizations, transformed, x)
assert (np.equal(x, transformed).all())
ncp = gen.ncp.transform_score
check_ncp(ncp, expected_generalizations)
@ -299,7 +304,7 @@ def test_minimizer_ncp(data_two_features):
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
ad1 = ArrayDataset(x1, features_names=features)
@ -342,7 +347,7 @@ def test_minimizer_ncp_categorical(data_four_features):
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(encoded, y))
ad = ArrayDataset(x)
ad1 = ArrayDataset(x1)
@ -382,7 +387,7 @@ def test_minimizer_fit_not_transform(data_two_features):
x, y, features, x1 = data_two_features
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
predictions = model.predict(ad)
@ -412,7 +417,7 @@ def test_minimizer_fit_pandas(data_four_features):
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(encoded, y))
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
@ -450,7 +455,7 @@ def test_minimizer_params_categorical(cells_categorical):
preprocessor, encoded = create_encoder(numeric_features, categorical_features, x)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(encoded, y))
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
@ -474,7 +479,7 @@ def test_minimizer_fit_qi(data_three_features):
qi = ['age', 'weight']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
predictions = model.predict(ad)
@ -508,7 +513,7 @@ def test_minimizer_fit_pandas_qi(data_five_features):
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(encoded, y))
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
@ -543,7 +548,7 @@ def test_minimize_ndarray_iris():
qi = ['sepal length (cm)', 'petal length (cm)']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CATEGORICAL)
model.fit(ArrayDataset(x_train, y_train))
predictions = model.predict(ArrayDataset(x_train))
if predictions.shape[1] > 1:
@ -586,7 +591,7 @@ def test_minimize_pandas_adult():
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(encoded, y_train))
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
@ -642,7 +647,7 @@ def test_german_credit_pandas():
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(encoded, y_train))
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
@ -760,7 +765,7 @@ def test_x_y():
qi = [0, 2]
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
predictions = model.predict(ad)
@ -800,7 +805,7 @@ def test_x_y_features_names():
qi = ['age', 'weight']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
predictions = model.predict(ad)
@ -1202,7 +1207,7 @@ def test_keras_model():
base_est.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model = KerasClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = KerasClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x_test)
predictions = model.predict(ad)
@ -1269,8 +1274,11 @@ def test_minimizer_pytorch(data_three_features):
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(base_est.parameters(), lr=0.01)
model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion,
optimizer=optimizer, input_shape=(3,),
model = PyTorchClassifier(model=base_est,
output_type=CLASSIFIER_SINGLE_OUTPUT_CLASS_LOGITS,
loss=criterion,
optimizer=optimizer,
input_shape=(3,),
nb_classes=2)
model.fit(PytorchData(x, y), save_entire_model=False, nb_epochs=10)
@ -1308,8 +1316,11 @@ def test_minimizer_pytorch_iris():
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(base_est.parameters(), lr=0.01)
model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion,
optimizer=optimizer, input_shape=(4,),
model = PyTorchClassifier(model=base_est,
output_type=CLASSIFIER_SINGLE_OUTPUT_CLASS_LOGITS,
loss=criterion,
optimizer=optimizer,
input_shape=(4,),
nb_classes=3)
model.fit(PytorchData(x_train, y_train), save_entire_model=False, nb_epochs=10)
@ -1329,6 +1340,78 @@ def test_minimizer_pytorch_iris():
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_minimizer_pytorch_multi_label_binary():
class multi_label_binary_model(nn.Module):
def __init__(self, num_labels, num_features):
super(multi_label_binary_model, self).__init__()
self.fc1 = nn.Sequential(
nn.Linear(num_features, 256),
nn.Tanh(), )
self.classifier1 = nn.Linear(256, num_labels)
def forward(self, x):
return self.classifier1(self.fc1(x))
# missing sigmoid on each output
class FocalLoss(nn.Module):
def __init__(self, gamma=2, alpha=0.5):
super(FocalLoss, self).__init__()
self.gamma = gamma
self.alpha = alpha
def forward(self, input, target):
bce_loss = functional.binary_cross_entropy_with_logits(input, target, reduction='none')
p = sigmoid(input)
p = where(target >= 0.5, p, 1 - p)
modulating_factor = (1 - p) ** self.gamma
alpha = self.alpha * target + (1 - self.alpha) * (1 - target)
focal_loss = alpha * modulating_factor * bce_loss
return focal_loss.mean()
(x_train, y_train), _ = get_iris_dataset_np()
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
qi = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
# make multi-label binary
y_train = np.column_stack((y_train, y_train, y_train))
y_train[y_train > 1] = 1
x_train = x_train.astype(np.float32)
y_train = y_train.astype(np.float32)
orig_model = multi_label_binary_model(3, 4)
criterion = FocalLoss()
optimizer = optim.RMSprop(orig_model.parameters(), lr=0.01)
model = PyTorchClassifier(model=orig_model,
output_type=CLASSIFIER_MULTI_OUTPUT_BINARY_LOGITS,
loss=criterion,
optimizer=optimizer,
input_shape=(24,),
nb_classes=3)
model.fit(PytorchData(x_train, y_train), save_entire_model=False, nb_epochs=10)
predictions = model.predict(PytorchData(x_train, y_train))
predictions = expit(predictions)
predictions[predictions < 0.5] = 0
predictions[predictions >= 0.5] = 1
target_accuracy = 0.99
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
gener = gen.generalizations
check_features(features, gener, transformed, x_train)
ncp = gen.ncp.transform_score
check_ncp(ncp, gener)
rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
def test_untouched():
cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}}, "label": 0,
'categories': {'gender': ['male']}, "representative": {"age": 26, "height": 149}},
@ -1362,7 +1445,7 @@ def test_errors():
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
model.fit(ArrayDataset(X, y))
ad = ArrayDataset(X)
predictions = model.predict(ad)