diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py
index e934085..15c771b 100644
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@@ -93,6 +93,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             if is_regression:
                 self.estimator = SklearnRegressor(estimator)
             else:
+                # TODO: maybe we should get model output type from user in this case
                 self.estimator = SklearnClassifier(estimator,
                                                    ModelOutputType.CLASSIFIER_SINGLE_OUTPUT_CLASS_PROBABILITIES)
         self.target_accuracy = target_accuracy
@@ -679,7 +680,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
             # this is a leaf
             # if it is a regression problem we do not use label
             label = self._calculate_cell_label(node) if not self.is_regression else 1
-            hist = [int(i) for i in self._dt.tree_.value[node][0]] if not self.is_regression else []
+            hist = self._dt.tree_.value[node]
             cell = {'label': label, 'hist': hist, 'ranges': {}, 'id': int(node)}
             return [cell]
 
@@ -710,8 +711,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         return cells
 
     def _calculate_cell_label(self, node):
-        label_hist = self._dt.tree_.value[node][0]
-        return int(self._dt.classes_[np.argmax(label_hist)])
+        """
+        Return the majority class of a tree node as a list with one entry per model output.
+
+        When classes_ is a list (one class array per output) the arg-max is taken per row of
+        tree_.value[node]; otherwise only the first row is used and a one-element list is returned.
+        """
+        label_hist = self._dt.tree_.value[node]
+        if isinstance(self._dt.classes_, list):
+            return [self._dt.classes_[output][class_index]
+                    for output, class_index in enumerate(np.argmax(label_hist, axis=1))]
+        return [self._dt.classes_[np.argmax(label_hist[0])]]
 
     def _modify_cells(self):
         cells = []
@@ -808,9 +818,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         # else: nothing to do, stay with previous cells
 
     def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
-        new_cell['hist'] = [x + y for x, y in
-                            zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
-        new_cell['label'] = int(self._dt.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1
+        """
+        Fill in 'hist' and 'label' of a cell created by merging a left and a right cell.
+
+        The histogram is the element-wise sum of both cells' histograms. Regression cells keep the
+        placeholder label 1, matching how leaf cells are labelled; classification cells get the
+        majority class of the summed histogram, one entry per model output.
+        """
+        new_cell['hist'] = left_cell['hist'] + right_cell['hist']
+        if self.is_regression:
+            new_cell['label'] = 1
+        elif isinstance(self._dt.classes_, list):
+            new_cell['label'] = [self._dt.classes_[output][class_index]
+                                 for output, class_index in enumerate(np.argmax(new_cell['hist'], axis=1))]
+        else:
+            new_cell['label'] = [self._dt.classes_[np.argmax(new_cell['hist'][0])]]
 
     def _get_nodes_level(self, level):
         # level = distance from lowest leaf
@@ -838,26 +859,28 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
         # return all nodes with depth == level or leaves higher than level
         return [i for i, x in enumerate(node_depth) if x == depth or (x < depth and is_leaves[i])]
 
-    def _attach_cells_representatives(self, prepared_data, originalTrainFeatures, labelFeature, level_nodes):
+    def _attach_cells_representatives(self, prepared_data, original_train_features, label_feature, level_nodes):
         # prepared data include one hot encoded categorical data,
         # if there is no categorical data prepared data is original data
         nodeIds = self._find_sample_nodes(prepared_data, level_nodes)
-        labels_df = pd.DataFrame(labelFeature, columns=['label'])
         for cell in self.cells:
             cell['representative'] = {}
             # get all rows in cell
             indexes = [i for i, x in enumerate(nodeIds) if x == cell['id']]
-            original_rows = originalTrainFeatures.iloc[indexes]
+            original_rows = original_train_features.iloc[indexes]
             sample_rows = prepared_data.iloc[indexes]
-            sample_labels = labels_df.iloc[indexes]['label'].values.tolist()
+
             # get rows with matching label
-            if self.is_regression:
+            if self.is_regression or (len(label_feature.shape) > 1 and label_feature.shape[1] > 1):
                 match_samples = sample_rows
                 match_rows = original_rows
             else:
-                indexes = [i for i, label in enumerate(sample_labels) if label == cell['label']]
+                labels_df = pd.DataFrame(label_feature, columns=['label'])
+                sample_labels = labels_df.iloc[indexes]['label'].values.tolist()
+                indexes = [i for i, label in enumerate(sample_labels) if label == cell['label'][0]]
                 match_samples = sample_rows.iloc[indexes]
                 match_rows = original_rows.iloc[indexes]
+
             # find the "middle" of the cluster
             array = match_samples.values
             # Only works with numpy 1.9.0 and higher!!!
diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py
index 056f17c..d364bf9 100644
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@@ -4,21 +4,24 @@ import pandas as pd
 import scipy
 
 from sklearn.compose import ColumnTransformer
-
 from sklearn.datasets import load_diabetes
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 
-from torch import nn, optim
+from torch import nn, optim, sigmoid, where
+from torch.nn import functional
+from scipy.special import expit
 
 import tensorflow as tf
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import Dense, Input
 
+from apt.utils.datasets.datasets import PytorchData
+from apt.utils.models.pytorch_model import PyTorchClassifier
 from apt.minimization import GeneralizeToRepresentative
-from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from apt.utils.dataset_utils import get_iris_dataset_np, get_adult_dataset_pd, get_german_credit_dataset_pd
 from apt.utils.datasets import ArrayDataset
 from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor, KerasClassifier
@@ -1335,6 +1338,79 @@ def test_minimizer_pytorch_iris():
     assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
 
 
+def test_minimizer_pytorch_multi_label_binary():
+    class MultiLabelBinaryModel(nn.Module):
+        def __init__(self, num_labels, num_features):
+            super().__init__()
+
+            self.fc1 = nn.Sequential(
+                nn.Linear(num_features, 256),
+                nn.Tanh(), )
+
+            self.classifier1 = nn.Linear(256, num_labels)
+
+        def forward(self, x):
+            # returns raw logits: no sigmoid here, the loss and the test apply it themselves
+            return self.classifier1(self.fc1(x))
+
+    class FocalLoss(nn.Module):
+        def __init__(self, gamma=2, alpha=0.5):
+            super().__init__()
+            self.gamma = gamma
+            self.alpha = alpha
+
+        def forward(self, input, target):
+            bce_loss = functional.binary_cross_entropy_with_logits(input, target, reduction='none')
+
+            p = sigmoid(input)
+            p = where(target >= 0.5, p, 1 - p)
+
+            modulating_factor = (1 - p)**self.gamma
+            alpha = self.alpha * target + (1 - self.alpha) * (1 - target)
+            focal_loss = alpha * modulating_factor * bce_loss
+
+            return focal_loss.mean()
+
+    (x_train, y_train), _ = get_iris_dataset_np()
+    features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
+    qi = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
+
+    # make multi-label binary
+    y_train = np.column_stack((y_train, y_train, y_train))
+    y_train[y_train > 1] = 1
+    x_train = x_train.astype(np.float32)
+    y_train = y_train.astype(np.float32)
+
+    orig_model = MultiLabelBinaryModel(3, 4)
+    criterion = FocalLoss()
+    optimizer = optim.RMSprop(orig_model.parameters(), lr=0.01)
+
+    model = PyTorchClassifier(model=orig_model,
+                              output_type=ModelOutputType.CLASSIFIER_MULTI_OUTPUT_BINARY_LOGITS,
+                              loss=criterion,
+                              optimizer=optimizer,
+                              input_shape=(4,),
+                              nb_classes=3)
+    model.fit(PytorchData(x_train, y_train), save_entire_model=False,
+              nb_epochs=10)
+    predictions = model.predict(PytorchData(x_train, y_train))
+    predictions = expit(predictions)
+    predictions[predictions < 0.5] = 0
+    predictions[predictions >= 0.5] = 1
+
+    target_accuracy = 0.99
+    gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
+    transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
+    gener = gen.generalizations
+
+    check_features(features, gener, transformed, x_train)
+    ncp = gen.ncp.transform_score
+    check_ncp(ncp, gener)
+
+    rel_accuracy = model.score(ArrayDataset(transformed.astype(np.float32), predictions))
+    assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= ACCURACY_DIFF)
+
+
 def test_untouched():
     cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}}, "label": 0,
               'categories': {'gender': ['male']}, "representative": {"age": 26, "height": 149}},