Support for many new model output types (#93)

* General model wrappers and methods supporting multi-label classifiers
* Support for pytorch multi-label binary classifier
* New model output types + single implementation of score method that supports multiple output types.
* Anonymization with pytorch multi-output binary model
* Support for multi-label binary models in minimizer.
* Support for multi-label logits/probabilities

---------

Signed-off-by: abigailt <abigailt@il.ibm.com>
2026-05-15 06:52:37 +02:00 · 2024-07-03 09:04:59 -04:00 · 2024-07-03 09:04:59 -04:00 · 57e38ea4fa
commit 57e38ea4fa
parent e00535d120
13 changed files with 913 additions and 172 deletions
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@ -4,25 +4,29 @@ import pandas as pd
 import scipy

 from sklearn.compose import ColumnTransformer
-
 from sklearn.datasets import load_diabetes
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

-from torch import nn, optim
+from torch import nn, optim, sigmoid, where
+from torch.nn import functional
+from scipy.special import expit

 import tensorflow as tf
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import Dense, Input

+from apt.utils.datasets.datasets import PytorchData
+from apt.utils.models.pytorch_model import PyTorchClassifier
 from apt.minimization import GeneralizeToRepresentative
-from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from apt.utils.dataset_utils import get_iris_dataset_np, get_adult_dataset_pd, get_german_credit_dataset_pd
 from apt.utils.datasets import ArrayDataset
-from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor, KerasClassifier
-
+from apt.utils.models import SklearnClassifier, SklearnRegressor, KerasClassifier, \
+    CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES, CLASSIFIER_SINGLE_OUTPUT_CATEGORICAL, \
+    CLASSIFIER_SINGLE_OUTPUT_CLASS_LOGITS, CLASSIFIER_MULTI_OUTPUT_BINARY_LOGITS
 tf.compat.v1.disable_eager_execution()


@ -216,7 +220,7 @@ def test_minimizer_params(cells):

    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(x, y))

    expected_generalizations = {'categories': {}, 'category_representatives': {},
@ -258,7 +262,7 @@ def test_minimizer_params_not_transform(cells):
    samples = ArrayDataset(x, y, features)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(x, y))

    gen = GeneralizeToRepresentative(model, cells=cells, generalize_using_transform=False)
@ -270,7 +274,7 @@ def test_minimizer_fit(data_two_features):
    x, y, features, _ = data_two_features
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(x, y))
    ad = ArrayDataset(x)
    predictions = model.predict(ad)
@ -287,6 +291,7 @@ def test_minimizer_fit(data_two_features):

    compare_generalizations(gener, expected_generalizations)
    check_features(features, expected_generalizations, transformed, x)
+    assert (np.equal(x, transformed).all())
    ncp = gen.ncp.transform_score
    check_ncp(ncp, expected_generalizations)

@ -299,7 +304,7 @@ def test_minimizer_ncp(data_two_features):

    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(x, y))
    ad = ArrayDataset(x)
    ad1 = ArrayDataset(x1, features_names=features)
@ -342,7 +347,7 @@ def test_minimizer_ncp_categorical(data_four_features):

    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(encoded, y))
    ad = ArrayDataset(x)
    ad1 = ArrayDataset(x1)
@ -382,7 +387,7 @@ def test_minimizer_fit_not_transform(data_two_features):
    x, y, features, x1 = data_two_features
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(x, y))
    ad = ArrayDataset(x)
    predictions = model.predict(ad)
@ -412,7 +417,7 @@ def test_minimizer_fit_pandas(data_four_features):

    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(encoded, y))
    predictions = model.predict(ArrayDataset(encoded))
    if predictions.shape[1] > 1:
@ -450,7 +455,7 @@ def test_minimizer_params_categorical(cells_categorical):
    preprocessor, encoded = create_encoder(numeric_features, categorical_features, x)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(encoded, y))
    predictions = model.predict(ArrayDataset(encoded))
    if predictions.shape[1] > 1:
@ -474,7 +479,7 @@ def test_minimizer_fit_qi(data_three_features):
    qi = ['age', 'weight']
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(x, y))
    ad = ArrayDataset(x)
    predictions = model.predict(ad)
@ -508,7 +513,7 @@ def test_minimizer_fit_pandas_qi(data_five_features):

    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(encoded, y))
    predictions = model.predict(ArrayDataset(encoded))
    if predictions.shape[1] > 1:
@ -543,7 +548,7 @@ def test_minimize_ndarray_iris():
    qi = ['sepal length (cm)', 'petal length (cm)']
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CATEGORICAL)
    model.fit(ArrayDataset(x_train, y_train))
    predictions = model.predict(ArrayDataset(x_train))
    if predictions.shape[1] > 1:
@ -586,7 +591,7 @@ def test_minimize_pandas_adult():

    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(encoded, y_train))
    predictions = model.predict(ArrayDataset(encoded))
    if predictions.shape[1] > 1:
@ -642,7 +647,7 @@ def test_german_credit_pandas():

    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(encoded, y_train))
    predictions = model.predict(ArrayDataset(encoded))
    if predictions.shape[1] > 1:
@ -760,7 +765,7 @@ def test_x_y():
    qi = [0, 2]
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(x, y))
    ad = ArrayDataset(x)
    predictions = model.predict(ad)
@ -800,7 +805,7 @@ def test_x_y_features_names():
    qi = ['age', 'weight']
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(x, y))
    ad = ArrayDataset(x)
    predictions = model.predict(ad)
@ -1202,7 +1207,7 @@ def test_keras_model():

    base_est.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

-    model = KerasClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = KerasClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(x, y))
    ad = ArrayDataset(x_test)
    predictions = model.predict(ad)
@ -1269,8 +1274,11 @@ def test_minimizer_pytorch(data_three_features):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(base_est.parameters(), lr=0.01)

-    model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion,
-                              optimizer=optimizer, input_shape=(3,),
+    model = PyTorchClassifier(model=base_est,
+                              output_type=CLASSIFIER_SINGLE_OUTPUT_CLASS_LOGITS,
+                              loss=criterion,
+                              optimizer=optimizer,
+                              input_shape=(3,),
                              nb_classes=2)
    model.fit(PytorchData(x, y), save_entire_model=False, nb_epochs=10)

@ -1308,8 +1316,11 @@ def test_minimizer_pytorch_iris():
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(base_est.parameters(), lr=0.01)

-    model = PyTorchClassifier(model=base_est, output_type=ModelOutputType.CLASSIFIER_LOGITS, loss=criterion,
-                              optimizer=optimizer, input_shape=(4,),
+    model = PyTorchClassifier(model=base_est,
+                              output_type=CLASSIFIER_SINGLE_OUTPUT_CLASS_LOGITS,
+                              loss=criterion,
+                              optimizer=optimizer,
+                              input_shape=(4,),
                              nb_classes=3)
    model.fit(PytorchData(x_train, y_train), save_entire_model=False, nb_epochs=10)

@ -1329,6 +1340,78 @@ def test_minimizer_pytorch_iris():
    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)


+def test_minimizer_pytorch_multi_label_binary():
+    """Minimize iris features against a PyTorch multi-label binary classifier that outputs logits."""
+    class multi_label_binary_model(nn.Module):
+        """One hidden Tanh layer followed by a linear head emitting one raw logit per label."""
+        def __init__(self, num_labels, num_features):
+            super(multi_label_binary_model, self).__init__()
+            self.fc1 = nn.Sequential(nn.Linear(num_features, 256), nn.Tanh())
+
+            self.classifier1 = nn.Linear(256, num_labels)
+
+        def forward(self, x):
+            # no sigmoid is applied to the outputs, so the model returns raw logits
+            return self.classifier1(self.fc1(x))
+
+    class FocalLoss(nn.Module):
+        """Focal loss on logits: element-wise BCE weighted by alpha and (1 - p) ** gamma, then averaged."""
+        def __init__(self, gamma=2, alpha=0.5):
+            super(FocalLoss, self).__init__()
+            self.gamma = gamma
+            self.alpha = alpha
+
+        def forward(self, input, target):
+            bce_loss = functional.binary_cross_entropy_with_logits(input, target, reduction='none')
+
+            p = sigmoid(input)
+            p = where(target >= 0.5, p, 1 - p)  # probability assigned to the target value of each label
+
+            modulating_factor = (1 - p) ** self.gamma
+            alpha = self.alpha * target + (1 - self.alpha) * (1 - target)
+            focal_loss = alpha * modulating_factor * bce_loss
+
+            return focal_loss.mean()
+
+    (x_train, y_train), _ = get_iris_dataset_np()
+    features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
+    qi = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
+
+    # make multi-label binary: repeat the label in 3 columns and clamp every value above 1 down to 1
+    y_train = np.column_stack((y_train, y_train, y_train))
+    y_train[y_train > 1] = 1
+    x_train = x_train.astype(np.float32)
+    y_train = y_train.astype(np.float32)
+
+    orig_model = multi_label_binary_model(3, 4)
+    criterion = FocalLoss()
+    optimizer = optim.RMSprop(orig_model.parameters(), lr=0.01)
+
+    model = PyTorchClassifier(model=orig_model,
+                              output_type=CLASSIFIER_MULTI_OUTPUT_BINARY_LOGITS,
+                              loss=criterion,
+                              optimizer=optimizer,
+                              input_shape=(4,),
+                              nb_classes=3)
+    model.fit(PytorchData(x_train, y_train), save_entire_model=False, nb_epochs=10)
+    predictions = model.predict(PytorchData(x_train, y_train))
+    predictions = expit(predictions)  # logits -> per-label values in (0, 1)
+    predictions[predictions < 0.5] = 0
+    predictions[predictions >= 0.5] = 1
+
+    target_accuracy = 0.99
+    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
+    transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
+    gener = gen.generalizations
+
+    check_features(features, gener, transformed, x_train)
+    ncp = gen.ncp.transform_score
+    check_ncp(ncp, gener)
+
+    rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
+
+
 def test_untouched():
    cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}}, "label": 0,
              'categories': {'gender': ['male']}, "representative": {"age": 26, "height": 149}},
@ -1362,7 +1445,7 @@ def test_errors():
    y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model = SklearnClassifier(base_est, CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
    model.fit(ArrayDataset(X, y))
    ad = ArrayDataset(X)
    predictions = model.predict(ad)