New model wrappers (#32)

* keras wrapper + blackbox classifier wrapper (fix #7)
* fix error in NCP calculation
* Update notebooks
* Fix #25 (incorrect attack_feature indexes for social feature in notebook)
* Consistent naming of internal parameters
2026-07-23 17:01:03 +02:00 · 2022-05-12 15:44:29 +03:00 · 2022-05-12 15:44:29 +03:00 · fe676fa426
commit fe676fa426
parent fd6be8e778
15 changed files with 1407 additions and 656 deletions
--- a/apt/anonymization/anonymizer.py
+++ b/apt/anonymization/anonymizer.py
@ -101,11 +101,11 @@ class Anonymize:
            # build DT just on QI features
            x_anonymizer_train = x_prepared[:, self.quasi_identifiers]
        if self.is_regression:
-            self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
+            self._anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
        else:
-            self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
+            self._anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)

-        self.anonymizer.fit(x_anonymizer_train, y)
+        self._anonymizer.fit(x_anonymizer_train, y)
        cells_by_id = self._calculate_cells(x, x_anonymizer_train)
        return self._anonymize_data(x, x_anonymizer_train, cells_by_id)

@ -113,16 +113,16 @@ class Anonymize:
        # x is original data, x_anonymizer_train is only QIs + 1-hot encoded
        cells_by_id = {}
        leaves = []
-        for node, feature in enumerate(self.anonymizer.tree_.feature):
+        for node, feature in enumerate(self._anonymizer.tree_.feature):
            if feature == -2:  # leaf node
                leaves.append(node)
-                hist = [int(i) for i in self.anonymizer.tree_.value[node][0]]
+                hist = [int(i) for i in self._anonymizer.tree_.value[node][0]]
                # TODO we may change the method for choosing representative for cell
                # label_hist = self.anonymizer.tree_.value[node][0]
                # label = int(self.anonymizer.classes_[np.argmax(label_hist)])
                cell = {'label': 1, 'hist': hist, 'id': int(node)}
                cells_by_id[cell['id']] = cell
-        self.nodes = leaves
+        self._nodes = leaves
        self._find_representatives(x, x_anonymizer_train, cells_by_id.values())
        return cells_by_id

@ -153,8 +153,8 @@ class Anonymize:
                    cell['representative'][feature] = min_value

    def _find_sample_nodes(self, samples):
-        paths = self.anonymizer.decision_path(samples).toarray()
-        node_set = set(self.nodes)
+        paths = self._anonymizer.decision_path(samples).toarray()
+        node_set = set(self._nodes)
        return [(list(set([i for i, v in enumerate(p) if v == 1]) & node_set))[0] for p in paths]

    def _find_sample_cells(self, samples, cells_by_id):
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@ -12,7 +12,7 @@ from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder
-from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+from sklearn.utils.validation import check_is_fitted
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.model_selection import train_test_split

@ -68,7 +68,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            if is_regression:
                self.estimator = SklearnRegressor(estimator)
            else:
-                self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_VECTOR)
+                self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
        self.target_accuracy = target_accuracy
        self.cells = cells
        self.categorical_features = []
@ -124,7 +124,16 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                                 'categories' that contains sub-groups of categories for categorical features, and
                                 'untouched' that contains the features that could not be generalized.
        """
-        return self.generalizations_
+        return self._generalizations
+
+    @property
+    def ncp(self):
+        """
+        Return the NCP score of the generalizations.
+
+        :return: ncp score as float.
+        """
+        return self._ncp

    def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
                      features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None):
@ -172,27 +181,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                dataset = ArrayDataset(X, y, features_names)

        if dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
-            self.n_features_ = dataset.get_samples().shape[1]
-
+            self._n_features = dataset.get_samples().shape[1]
        elif dataset and dataset.features_names:
-            self.n_features_ = len(dataset.features_names)
+            self._n_features = len(dataset.features_names)
        else:
-            self.n_features_ = 0
+            self._n_features = 0

        if dataset and dataset.features_names:
            self._features = dataset.features_names
        # if features is None, use numbers instead of names
-        elif self.n_features_ != 0:
-            self._features = [str(i) for i in range(self.n_features_)]
+        elif self._n_features != 0:
+            self._features = [str(i) for i in range(self._n_features)]
        else:
            self._features = None

-        if self.cells:
-            self.cells_ = self.cells
-        else:
-            self.cells_ = {}
-        self.categorical_values = {}
-
        # Going to fit
        # (currently not dealing with option to fit with only X and y and no estimator)
        if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
@ -231,28 +233,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                        fd['max'] = max(values)
                        fd['range'] = max(values) - min(values)
                    else:
-                        fd['range'] = len(values)
+                        fd['range'] = len(np.unique(values))
                    feature_data[feature] = fd

            # prepare data for DT
-            categorical_features = [f for f in self._features if f in self.categorical_features and
-                                    f in self.features_to_minimize]
-
-            numeric_transformer = Pipeline(
-                steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
-            )
-
-            numeric_features = [f for f in self._features if f not in self.categorical_features and
-                                f in self.features_to_minimize]
-            categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
-
-            preprocessor_QI_features = ColumnTransformer(
-                transformers=[
-                    ("num", numeric_transformer, numeric_features),
-                    ("cat", categorical_transformer, categorical_features),
-                ]
-            )
-            preprocessor_QI_features.fit(x_QI)

            # preprocessor to fit data that have features not included in QI (to get accuracy)
            numeric_features = [f for f in self._features if f not in self.categorical_features]
@ -267,44 +251,68 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                ]
            )
            preprocessor.fit(x)
-            x_prepared = preprocessor.transform(X_train)
+
            if self.train_only_QI:
+                categorical_features = [f for f in self._features if f in self.categorical_features and
+                                        f in self.features_to_minimize]
+
+                numeric_transformer = Pipeline(
+                        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
+                )
+
+                numeric_features = [f for f in self._features if f not in self.categorical_features and
+                                    f in self.features_to_minimize]
+                categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
+
+                preprocessor_QI_features = ColumnTransformer(
+                        transformers=[
+                            ("num", numeric_transformer, numeric_features),
+                            ("cat", categorical_transformer, categorical_features),
+                        ]
+                )
+                preprocessor_QI_features.fit(x_QI)
                x_prepared = preprocessor_QI_features.transform(X_train_QI)
+            else:
+                x_prepared = preprocessor.transform(X_train)

            self._preprocessor = preprocessor

-            self.cells_ = {}
+            self.cells = []
+            self._categorical_values = {}
+
            if self.is_regression:
-                self.dt_ = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1)
+                self._dt = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1)
            else:
-                self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                self._dt = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                                  min_samples_leaf=1)
-            self.dt_.fit(x_prepared, y_train)
+            self._dt.fit(x_prepared, y_train)
+
            self._modify_categorical_features(used_data)

-            x_prepared = pd.DataFrame(x_prepared, columns=self.categorical_data.columns)
+            x_prepared = pd.DataFrame(x_prepared, columns=self._categorical_data.columns)

            self._calculate_cells()
            self._modify_cells()
            # features that are not from QI should not be part of generalizations
            for feature in self._features:
                if feature not in self.features_to_minimize:
-                    self._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
+                    self._remove_feature_from_cells(self.cells, self._cells_by_id, feature)

            nodes = self._get_nodes_level(0)
            self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)

-            # self.cells_ currently holds the generalization created from the tree leaves
+            # self.cells currently holds the generalization created from the tree leaves
            self._calculate_generalizations()

            # apply generalizations to test data
-            x_prepared_test = preprocessor.transform(X_test)
            if self.train_only_QI:
                x_prepared_test = preprocessor_QI_features.transform(X_test_QI)
+            else:
+                x_prepared_test = preprocessor.transform(X_test)

-            x_prepared_test = pd.DataFrame(x_prepared_test, index=X_test.index, columns=self.categorical_data.columns)
+            x_prepared_test = pd.DataFrame(x_prepared_test, index=X_test.index, columns=self._categorical_data.columns)

-            generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
+            generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)

            # check accuracy
            accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
@ -317,22 +325,22 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                level = 1
                while accuracy > self.target_accuracy:
                    try:
-                        cells_previous_iter = self.cells_
-                        generalization_prev_iter = self.generalizations_
-                        cells_by_id_prev = self.cells_by_id_
+                        cells_previous_iter = self.cells
+                        generalization_prev_iter = self._generalizations
+                        cells_by_id_prev = self._cells_by_id
                        nodes = self._get_nodes_level(level)
                        self._calculate_level_cells(level)
                        self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)

                        self._calculate_generalizations()
-                        generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_,
-                                                       self.cells_by_id_)
+                        generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells,
+                                                       self._cells_by_id)
                        accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
                        # if accuracy passed threshold roll back to previous iteration generalizations
                        if accuracy < self.target_accuracy:
-                            self.cells_ = cells_previous_iter
-                            self.generalizations_ = generalization_prev_iter
-                            self.cells_by_id_ = cells_by_id_prev
+                            self.cells = cells_previous_iter
+                            self._generalizations = generalization_prev_iter
+                            self._cells_by_id = cells_by_id_prev
                            break
                        else:
                            print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
@ -352,14 +360,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                        break

                    self._calculate_generalizations()
-                    generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
+                    generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
                    accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
                    print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))

-            # self.cells_ currently holds the chosen generalization based on target accuracy
+            # self.cells currently holds the chosen generalization based on target accuracy

            # calculate iLoss
-            self.ncp_ = self._calculate_ncp(X_test, self.generalizations_, feature_data)
+            self._ncp = self._calculate_ncp(X_test, self._generalizations, feature_data)

        # Return the transformer
        return self
@ -398,7 +406,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        if dataset and dataset.get_samples() is not None:
            x = pd.DataFrame(dataset.get_samples(), columns=self._features)

-        if x.shape[1] != self.n_features_ and self.n_features_ != 0:
+        if x.shape[1] != self._n_features and self._n_features != 0:
            raise ValueError('Shape of input is different from what was seen'
                             'in `fit`')

@ -410,23 +418,23 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        mapped = np.zeros(x.shape[0])  # to mark records we already mapped

        # iterate over cells (leaves in decision tree)
-        for i in range(len(self.cells_)):
+        for i in range(len(self.cells)):
            # Copy the representatives from the cells into another data structure:
            # iterate over features in test data
            for feature in self._features:
                # if feature has a representative value in the cell and should not
                # be left untouched, take the representative value
-                if feature in self.cells_[i]['representative'] and \
-                        ('untouched' not in self.cells_[i]
-                         or feature not in self.cells_[i]['untouched']):
-                    representatives.loc[i, feature] = self.cells_[i]['representative'][feature]
+                if feature in self.cells[i]['representative'] and \
+                        ('untouched' not in self.cells[i]
+                         or feature not in self.cells[i]['untouched']):
+                    representatives.loc[i, feature] = self.cells[i]['representative'][feature]
                # else, drop the feature (removes from representatives columns that
                # do not have a representative value or should remain untouched)
                elif feature in representatives.columns.tolist():
                    representatives = representatives.drop(feature, axis=1)

            # get the indexes of all records that map to this cell
-            indexes = self._get_record_indexes_for_cell(x, self.cells_[i], mapped)
+            indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped)

            # replace the values in the representative columns with the representative
            # values (leaves others untouched)
@ -467,8 +475,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        return True

    def _modify_categorical_features(self, X):
-        self.categorical_values = {}
-        self.oneHotVectorFeaturesToFeatures = {}
+        self._categorical_values = {}
+        self._one_hot_vector_features_to_features = {}
        features_to_remove = []
        used_features = self._features
        if self.train_only_QI:
@ -478,17 +486,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                try:
                    all_values = X.loc[:, feature]
                    values = list(all_values.unique())
-                    self.categorical_values[feature] = values
+                    self._categorical_values[feature] = values
                    X[feature] = pd.Categorical(X.loc[:, feature], categories=values, ordered=False)
                    ohe = pd.get_dummies(X[feature], prefix=feature)
-                    for oneHotVectorFeature in ohe.columns:
-                        self.oneHotVectorFeaturesToFeatures[oneHotVectorFeature] = feature
+                    for one_hot_vector_feature in ohe.columns:
+                        self._one_hot_vector_features_to_features[one_hot_vector_feature] = feature
                    X = pd.concat([X, ohe], axis=1)
                    features_to_remove.append(feature)
                except KeyError:
                    print("feature " + feature + "not found in training data")

-        self.categorical_data = X.drop(features_to_remove, axis=1)
+        self._categorical_data = X.drop(features_to_remove, axis=1)

    def _cell_contains_numeric(self, f, range, x):
        i = self._features.index(f)
@ -513,24 +521,24 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        return False

    def _calculate_cells(self):
-        self.cells_by_id_ = {}
-        self.cells_ = self._calculate_cells_recursive(0)
+        self._cells_by_id = {}
+        self.cells = self._calculate_cells_recursive(0)

    def _calculate_cells_recursive(self, node):
-        feature_index = self.dt_.tree_.feature[node]
+        feature_index = self._dt.tree_.feature[node]
        if feature_index == -2:
            # this is a leaf
            # if it is a regression problem we do not use label
            label = self._calculate_cell_label(node) if not self.is_regression else 1
-            hist = [int(i) for i in self.dt_.tree_.value[node][0]] if not self.is_regression else []
+            hist = [int(i) for i in self._dt.tree_.value[node][0]] if not self.is_regression else []
            cell = {'label': label, 'hist': hist, 'ranges': {}, 'id': int(node)}
            return [cell]

        cells = []
-        feature = self.categorical_data.columns[feature_index]
-        threshold = self.dt_.tree_.threshold[node]
-        left_child = self.dt_.tree_.children_left[node]
-        right_child = self.dt_.tree_.children_right[node]
+        feature = self._categorical_data.columns[feature_index]
+        threshold = self._dt.tree_.threshold[node]
+        left_child = self._dt.tree_.children_left[node]
+        right_child = self._dt.tree_.children_right[node]

        left_child_cells = self._calculate_cells_recursive(left_child)
        for cell in left_child_cells:
@ -539,7 +547,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            if cell['ranges'][feature]['end'] is None:
                cell['ranges'][feature]['end'] = threshold
            cells.append(cell)
-            self.cells_by_id_[cell['id']] = cell
+            self._cells_by_id[cell['id']] = cell

        right_child_cells = self._calculate_cells_recursive(right_child)
        for cell in right_child_cells:
@ -548,26 +556,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            if cell['ranges'][feature]['start'] is None:
                cell['ranges'][feature]['start'] = threshold
            cells.append(cell)
-            self.cells_by_id_[cell['id']] = cell
+            self._cells_by_id[cell['id']] = cell

        return cells

    def _calculate_cell_label(self, node):
-        label_hist = self.dt_.tree_.value[node][0]
-        return int(self.dt_.classes_[np.argmax(label_hist)])
+        label_hist = self._dt.tree_.value[node][0]
+        return int(self._dt.classes_[np.argmax(label_hist)])

    def _modify_cells(self):
        cells = []
-        features = self.categorical_data.columns
-        for cell in self.cells_:
+        features = self._categorical_data.columns
+        for cell in self.cells:
            new_cell = {'id': cell['id'], 'label': cell['label'], 'ranges': {}, 'categories': {}, 'hist': cell['hist'],
                        'representative': None}
            for feature in features:
-                if feature in self.oneHotVectorFeaturesToFeatures.keys():
+                if feature in self._one_hot_vector_features_to_features.keys():
                    # feature is categorical and should be mapped
-                    categorical_feature = self.oneHotVectorFeaturesToFeatures[feature]
+                    categorical_feature = self._one_hot_vector_features_to_features[feature]
                    if categorical_feature not in new_cell['categories'].keys():
-                        new_cell['categories'][categorical_feature] = self.categorical_values[
+                        new_cell['categories'][categorical_feature] = self._categorical_values[
                            categorical_feature].copy()
                    if feature in cell['ranges'].keys():
                        categorical_value = feature[len(categorical_feature) + 1:]
@ -584,11 +592,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                    else:
                        new_cell['ranges'][feature] = {'start': None, 'end': None}
            cells.append(new_cell)
-            self.cells_by_id_[new_cell['id']] = new_cell
-        self.cells_ = cells
+            self._cells_by_id[new_cell['id']] = new_cell
+        self.cells = cells

    def _calculate_level_cells(self, level):
-        if level < 0 or level > self.dt_.get_depth():
+        if level < 0 or level > self._dt.get_depth():
            raise TypeError("Illegal level %d' % level", level)

        if level > 0:
@ -597,13 +605,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            nodes = self._get_nodes_level(level)
            if nodes:
                for node in nodes:
-                    if self.dt_.tree_.feature[node] == -2:  # leaf node
-                        new_cell = self.cells_by_id_[node]
+                    if self._dt.tree_.feature[node] == -2:  # leaf node
+                        new_cell = self._cells_by_id[node]
                    else:
-                        left_child = self.dt_.tree_.children_left[node]
-                        right_child = self.dt_.tree_.children_right[node]
-                        left_cell = self.cells_by_id_[left_child]
-                        right_cell = self.cells_by_id_[right_child]
+                        left_child = self._dt.tree_.children_left[node]
+                        right_child = self._dt.tree_.children_right[node]
+                        left_cell = self._cells_by_id[left_child]
+                        right_cell = self._cells_by_id[right_child]
                        new_cell = {'id': int(node), 'ranges': {}, 'categories': {}, 'untouched': [],
                                    'label': None, 'representative': None}
                        for feature in left_cell['ranges'].keys():
@ -620,28 +628,28 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                        self._calculate_level_cell_label(left_cell, right_cell, new_cell)
                    new_cells.append(new_cell)
                    new_cells_by_id[new_cell['id']] = new_cell
-                self.cells_ = new_cells
-                self.cells_by_id_ = new_cells_by_id
+                self.cells = new_cells
+                self._cells_by_id = new_cells_by_id
            # else: nothing to do, stay with previous cells

    def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
        new_cell['hist'] = [x + y for x, y in
                            zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
-        new_cell['label'] = int(self.dt_.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1
+        new_cell['label'] = int(self._dt.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1

    def _get_nodes_level(self, level):
        # level = distance from lowest leaf
-        node_depth = np.zeros(shape=self.dt_.tree_.node_count, dtype=np.int64)
-        is_leaves = np.zeros(shape=self.dt_.tree_.node_count, dtype=bool)
+        node_depth = np.zeros(shape=self._dt.tree_.node_count, dtype=np.int64)
+        is_leaves = np.zeros(shape=self._dt.tree_.node_count, dtype=bool)
        stack = [(0, -1)]  # seed is the root node id and its parent depth
        while len(stack) > 0:
            node_id, parent_depth = stack.pop()
            # depth = distance from root
            node_depth[node_id] = parent_depth + 1

-            if self.dt_.tree_.children_left[node_id] != self.dt_.tree_.children_right[node_id]:
-                stack.append((self.dt_.tree_.children_left[node_id], parent_depth + 1))
-                stack.append((self.dt_.tree_.children_right[node_id], parent_depth + 1))
+            if self._dt.tree_.children_left[node_id] != self._dt.tree_.children_right[node_id]:
+                stack.append((self._dt.tree_.children_left[node_id], parent_depth + 1))
+                stack.append((self._dt.tree_.children_right[node_id], parent_depth + 1))
            else:
                is_leaves[node_id] = True

@ -660,7 +668,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        # if there is no categorical data prepared data is original data
        nodeIds = self._find_sample_nodes(prepared_data, level_nodes)
        labels_df = pd.DataFrame(labelFeature, columns=['label'])
-        for cell in self.cells_:
+        for cell in self.cells:
            cell['representative'] = {}
            # get all rows in cell
            indexes = [i for i, x in enumerate(nodeIds) if x == cell['id']]
@ -695,14 +703,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                cell['representative'][feature] = row[feature]

    def _find_sample_nodes(self, samples, nodes):
-        paths = self.dt_.decision_path(samples).toarray()
+        paths = self._dt.decision_path(samples).toarray()
        nodeSet = set(nodes)
        return [(list(set([i for i, v in enumerate(p) if v == 1]) & nodeSet))[0] for p in paths]

    def _generalize(self, original_data, prepared_data, level_nodes, cells, cells_by_id):
        # prepared data include one hot encoded categorical data + QI
        representatives = pd.DataFrame(columns=self._features)  # empty except for columns
-        generalized = pd.DataFrame(prepared_data, columns=self.categorical_data.columns, copy=True)
+        generalized = pd.DataFrame(prepared_data, columns=self._categorical_data.columns, copy=True)
        original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
        mapping_to_cells = self._map_to_cells(generalized, level_nodes, cells_by_id)
        # iterate over cells (leaves in decision tree)
@ -755,7 +763,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                                              current_accuracy)
        if feature is None:
            return None
-        GeneralizeToRepresentative._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
+        GeneralizeToRepresentative._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
        return feature

    def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy):
@ -763,7 +771,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        # if there is no categorical data prepared data is original data
        # We want to remove features with low iLoss (NCP) and high accuracy gain
        # (after removing them)
-        ranges = self.generalizations_['ranges']
+        ranges = self._generalizations['ranges']
        range_counts = self._find_range_count(original_data, ranges)
        total = prepared_data.size
        range_min = sys.float_info.max
@ -772,15 +780,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        category_counts = self._find_categories_count(original_data, categories)

        for feature in ranges.keys():
-            if feature not in self.generalizations_['untouched']:
+            if feature not in self._generalizations['untouched']:
                feature_ncp = self._calc_ncp_numeric(ranges[feature],
                                                     range_counts[feature],
                                                     feature_data[feature],
                                                     total)
                if feature_ncp > 0:
                    # divide by accuracy gain
-                    new_cells = copy.deepcopy(self.cells_)
-                    cells_by_id = copy.deepcopy(self.cells_by_id_)
+                    new_cells = copy.deepcopy(self.cells)
+                    cells_by_id = copy.deepcopy(self._cells_by_id)
                    GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
                    generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
                    accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
@ -802,8 +810,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                                                         total)
                if feature_ncp > 0:
                    # divide by accuracy loss
-                    new_cells = copy.deepcopy(self.cells_)
-                    cells_by_id = copy.deepcopy(self.cells_by_id_)
+                    new_cells = copy.deepcopy(self.cells)
+                    cells_by_id = copy.deepcopy(self._cells_by_id)
                    GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
                    generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
                    accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
@ -821,12 +829,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        return remove_feature

    def _calculate_generalizations(self):
-        self.generalizations_ = {'ranges': GeneralizeToRepresentative._calculate_ranges(self.cells_),
-                                 'categories': GeneralizeToRepresentative._calculate_categories(self.cells_),
-                                 'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells_)}
+        self._generalizations = {'ranges': GeneralizeToRepresentative._calculate_ranges(self.cells),
+                                 'categories': GeneralizeToRepresentative._calculate_categories(self.cells),
+                                 'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)}

    def _find_range_count(self, samples, ranges):
-        samples_df = pd.DataFrame(samples, columns=self.categorical_data.columns)
+        samples_df = pd.DataFrame(samples, columns=self._categorical_data.columns)
        range_counts = {}
        last_value = None
        for r in ranges.keys():
--- a/apt/utils/dataset_utils.py
+++ b/apt/utils/dataset_utils.py
@ -6,7 +6,7 @@ from os import path, mkdir
 from six.moves.urllib.request import urlretrieve


-def get_iris_dataset(test_set: float = 0.3):
+def get_iris_dataset_np(test_set: float = 0.3):
    """
    Loads the Iris dataset from scikit-learn.

@ -29,7 +29,7 @@ def _load_iris(test_set_size: float = 0.3):
    return (x_train, y_train), (x_test, y_test)


-def get_diabetes_dataset(test_set: float = 0.3):
+def get_diabetes_dataset_np(test_set: float = 0.3):
    """
    Loads the Diabetes dataset from scikit-learn.

@ -52,7 +52,7 @@ def _load_diabetes(test_set_size: float = 0.3):
    return (x_train, y_train), (x_test, y_test)


-def get_german_credit_dataset(test_set: float = 0.3):
+def get_german_credit_dataset_pd(test_set: float = 0.3):
    """
    Loads the UCI German credit dataset from `tests/datasets/german` or downloads it from
    https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/ if necessary.
@ -122,11 +122,16 @@ def _modify_german_dataset(data):
            return 1
        else:
            raise Exception('Bad value')
+
+    def modify_label(value):
+        return value - 1
+
    data['Foreign_worker'] = data['Foreign_worker'].apply(modify_Foreign_worker)
    data['Telephone'] = data['Telephone'].apply(modify_Telephone)
+    data['label'] = data['label'].apply(modify_label)


-def get_adult_dataset():
+def get_adult_dataset_pd():
    """
    Loads the UCI Adult dataset from `tests/datasets/adult` or downloads it from
    https://archive.ics.uci.edu/ml/machine-learning-databases/adult/ if necessary.
@ -228,7 +233,7 @@ def _modify_adult_dataset(data):
    return data.drop(['fnlwgt', 'education'], axis=1)


-def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
+def get_nursery_dataset_pd(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
    """
    Loads the UCI Nursery dataset from `tests/datasets/nursery` or downloads it from
    https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/ if necessary.
--- a/apt/utils/datasets/datasets.py
+++ b/apt/utils/datasets/datasets.py
@ -5,7 +5,7 @@ Implementation of utility classes for dataset handling
 """

 from abc import ABCMeta, abstractmethod
-from typing import Callable, Collection, Any, Union, List, Optional
+from typing import Callable, Collection, Any, Union, List, Optional, Type

 import tarfile
 import os
@ -19,9 +19,9 @@ from torch import Tensor
 logger = logging.getLogger(__name__)


-INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor]
+INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, pd.Series, List, Tensor]
 OUTPUT_DATA_ARRAY_TYPE = np.ndarray
-DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
+DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame, pd.Series]


 class Dataset(metaclass=ABCMeta):
@ -323,7 +323,7 @@ class DatasetFactory:
        :return: a Callable that returns the registered dataset class
        """

-        def inner_wrapper(wrapped_class: Dataset) -> Any:
+        def inner_wrapper(wrapped_class: Type[Dataset]) -> Any:
            if name in cls.registry:
                logger.warning('Dataset %s already exists. Will replace it', name)
            cls.registry[name] = wrapped_class
@ -414,14 +414,18 @@ class Data:
        """
        Get test set samples

-        :return: test samples
+        :return: test samples, or None if no test data provided
        """
+        if self.test is None:
+            return None
        return self.test.get_samples()

    def get_test_labels(self) -> Collection[Any]:
        """
        Get test set labels

-        :return: test labels
+        :return: test labels, or None if no test data provided
        """
+        if self.test is None:
+            return None
        return self.test.get_labels()
--- a/apt/utils/models/init.py
+++ b/apt/utils/models/init.py
@ -1,2 +1,3 @@
-from apt.utils.models.model import Model, ModelOutputType
+from apt.utils.models.model import Model, BlackboxClassifier, ModelOutputType, ScoringMethod
 from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor
+from apt.utils.models.keras_model import KerasClassifier
--- a/apt/utils/models/keras_model.py
+++ b/apt/utils/models/keras_model.py
@ -0,0 +1,149 @@
+from typing import Optional
+
+import numpy as np
+from sklearn.preprocessing import OneHotEncoder
+
+import tensorflow as tf
+from tensorflow import keras
+tf.compat.v1.disable_eager_execution()
+
+from apt.utils.models import Model, ModelOutputType, ScoringMethod
+from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
+
+from art.utils import check_and_transform_label_format
+from art.estimators.classification.keras import KerasClassifier as ArtKerasClassifier
+# from art.estimators.regression.keras import KerasRegressor as ArtKerasRegressor
+
+
+class KerasModel(Model):
+    """
+    Wrapper class for keras models.
+    """
+
+
+
+class KerasClassifier(KerasModel):
+    """
+    Wrapper class for keras classification models.
+
+    :param model: The original keras model object.
+    :type model: `keras.models.Model`
+    :param output_type: The type of output the model yields (vector/label only)
+    :type output_type: `ModelOutputType`
+    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
+                             Set to True if the model is only available via query (API) access, i.e.,
+                             only the outputs of the model are exposed, and False if the model internals
+                             are also available. Default is True.
+    :type black_box_access: boolean, optional
+    :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
+                              unlimited queries to the model API or whether there is a limit to the number of
+                              queries that can be submitted. Default is True.
+    :type unlimited_queries: boolean, optional
+    """
+    def __init__(self, model: keras.models.Model, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
+                 unlimited_queries: Optional[bool] = True, **kwargs):
+        super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
+        logits = False
+        if output_type == ModelOutputType.CLASSIFIER_LOGITS:
+            logits = True
+        self._art_model = ArtKerasClassifier(model, use_logits=logits)
+
+    def fit(self, train_data: Dataset, **kwargs) -> None:
+        """
+        Fit the model using the training data.
+
+        :param train_data: Training data. Labels are expected to either be one-hot encoded or a 1D-array of categorical
+                           labels (consecutive integers starting at 0).
+        :type train_data: `Dataset`
+        :return: None
+        """
+        y_encoded = check_and_transform_label_format(train_data.get_labels())
+        self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs)
+
+    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
+        """
+        Perform predictions using the model for input `x`.
+
+        :param x: Input samples.
+        :type x: `Dataset`
+        :return: Predictions from the model as numpy array (class probabilities, if supported).
+        """
+        return self._art_model.predict(x.get_samples(), **kwargs)
+
+    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs):
+        """
+        Score the model using test data.
+
+        :param test_data: Test data.
+        :type test_data: `Dataset`
+        :param scoring_method: The method for scoring predictions. Default is ACCURACY.
+        :type scoring_method: `ScoringMethod`, optional
+        :return: the score as float (between 0 and 1)
+        """
+        y = check_and_transform_label_format(test_data.get_labels(), self._art_model.nb_classes)
+        predicted = self.predict(test_data)
+        if scoring_method == ScoringMethod.ACCURACY:
+            return np.count_nonzero(np.argmax(y, axis=1) == np.argmax(predicted, axis=1)) / predicted.shape[0]
+        else:
+            raise NotImplementedError
+
+
+# class KerasRegressor(KerasModel):
+#     """
+#     Wrapper class for keras regression models.
+#
+#     :param model: The original keras model object.
+#     :type model: `keras.models.Model`
+#     :param black_box_access: Boolean describing the type of deployment of the model (when in production).
+#                              Set to True if the model is only available via query (API) access, i.e.,
+#                              only the outputs of the model are exposed, and False if the model internals
+#                              are also available. Default is True.
+#     :type black_box_access: boolean, optional
+#     :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
+#                               unlimited queries to the model API or whether there is a limit to the number of
+#                               queries that can be submitted. Default is True.
+#     :type unlimited_queries: boolean, optional
+#     """
+#     def __init__(self, model: keras.models.Model, black_box_access: Optional[bool] = True,
+#                  unlimited_queries: Optional[bool] = True, **kwargs):
+#         super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs)
+#         self._art_model = ArtKerasRegressor(model)
+#
+#     def fit(self, train_data: Dataset, **kwargs) -> None:
+#         """
+#         Fit the model using the training data.
+#
+#         :param train_data: Training data.
+#         :type train_data: `Dataset`
+#         :return: None
+#         """
+#         self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)
+#
+#     def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
+#         """
+#         Perform predictions using the model for input `x`.
+#
+#         :param x: Input samples.
+#         :type x: `Dataset`
+#         :return: Predictions from the model as numpy array.
+#         """
+#         return self._art_model.predict(x.get_samples(), **kwargs)
+#
+#     def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.MEAN_SQUARED_ERROR,
+#               **kwargs):
+#         """
+#         Score the model using test data.
+#
+#         :param test_data: Test data.
+#         :type test_data: `Dataset`
+#         :param scoring_method: The method for scoring predictions. Default is MEAN_SQUARED_ERROR.
+#         :type scoring_method: `ScoringMethod`, optional
+#         :return: the score as float
+#         """
+#         y = check_and_transform_label_format(test_data.get_labels(), self._art_model.nb_classes)
+#         predicted = self.predict(test_data)
+#         if scoring_method == ScoringMethod.MEAN_SQUARED_ERROR:
+#             mse = keras.losses.MeanSquaredError(reduction=keras.losses.Reduction.SUM)
+#             return mse(y, predicted).numpy()
+#         else:
+#             raise NotImplementedError('Only MEAN_SQUARED_ERROR supported as scoring method')
--- a/apt/utils/models/model.py
+++ b/apt/utils/models/model.py
@ -1,16 +1,25 @@
 from abc import ABCMeta, abstractmethod
 from typing import Any, Optional
 from enum import Enum, auto
+import numpy as np

-from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
+from apt.utils.datasets import Dataset, Data, OUTPUT_DATA_ARRAY_TYPE
+from art.estimators.classification import BlackBoxClassifier
+from art.utils import check_and_transform_label_format


 class ModelOutputType(Enum):
-    CLASSIFIER_VECTOR = auto()  # probabilities or logits
+    CLASSIFIER_PROBABILITIES = auto()  # vector of probabilities
+    CLASSIFIER_LOGITS = auto()  # vector of logits
    CLASSIFIER_SCALAR = auto()  # label only
    REGRESSOR_SCALAR = auto()  # value


+class ScoringMethod(Enum):
+    ACCURACY = auto()  # number of correct predictions divided by the number of samples
+    MEAN_SQUARED_ERROR = auto()  # mean squared error between the predictions and true labels
+
+
 class Model(metaclass=ABCMeta):
    """
    Abstract base class for ML model wrappers.
@ -54,7 +63,7 @@ class Model(metaclass=ABCMeta):
        Perform predictions using the model for input `x`.

        :param x: Input samples.
-        :type x: `np.ndarray` or `pandas.DataFrame`
+        :type x: `Dataset`
        :return: Predictions from the model as numpy array.
        """
        raise NotImplementedError
@ -107,3 +116,87 @@ class Model(metaclass=ABCMeta):
        :return: True if a user can perform unlimited queries to the model API, otherwise False.
        """
        return self._unlimited_queries
+
+    def get_nb_classes(self, y: OUTPUT_DATA_ARRAY_TYPE) -> int:
+        """
+        Get the number of classes from an array of labels
+
+        :param y: the labels
+        :type y: numpy array
+        :return: the number of classes as integer
+        """
+        if len(y.shape) == 1:
+            return len(np.unique(y))
+        else:
+            return y.shape[1]
+
+
+class BlackboxClassifier(Model):
+    """
+    Wrapper for black-box ML classification models.
+
+    :param model: The training and/or test data along with the model's predictions for the data. Assumes that the data
+                  is represented as numpy arrays. Labels are expected to either be one-hot encoded or
+                  a 1D-array of categorical labels (consecutive integers starting at 0).
+    :type model: `Data` object
+    :param output_type: The type of output the model yields (vector/label only for classifiers,
+                        value for regressors)
+    :type output_type: `ModelOutputType`
+    :param black_box_access: Boolean describing the type of deployment of the model (when in production).
+                             Always assumed to be True for this wrapper.
+    :type black_box_access: boolean, optional
+    :param unlimited_queries: Boolean indicating whether a user can perform unlimited queries to the model API.
+                              Always assumed to be False for this wrapper.
+    :type unlimited_queries: boolean, optional
+    """
+
+    def __init__(self, model: Data, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
+                 unlimited_queries: Optional[bool] = True, **kwargs):
+        super().__init__(model, output_type, black_box_access=True, unlimited_queries=False, **kwargs)
+        x = model.get_train_samples()
+        y = model.get_train_labels()
+        self.nb_classes = self.get_nb_classes(y)
+        y = check_and_transform_label_format(y, nb_classes=self.nb_classes)
+
+        if model.get_test_samples() is not None and type(x) == np.ndarray:
+            x = np.vstack((x, model.get_test_samples()))
+
+        if model.get_test_labels() is not None and type(y) == np.ndarray:
+            y = np.vstack((y, check_and_transform_label_format(model.get_test_labels(), nb_classes=self.nb_classes)))
+
+        predict_fn = (x, y)
+        self._art_model = BlackBoxClassifier(predict_fn, x.shape[1:], self.nb_classes, fuzzy_float_compare=True)
+
+    def fit(self, train_data: Dataset, **kwargs) -> None:
+        """
+        A blackbox model cannot be fit.
+        """
+        raise NotImplementedError
+
+    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
+        """
+        Get predictions from the model for input `x`. `x` must be a subset of the data provided in the `model` data in
+        `__init__()`.
+
+        :param x: Input samples.
+        :type x: `Dataset`
+        :return: Predictions from the model as numpy array.
+        """
+        return self._art_model.predict(x.get_samples())
+
+    def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs):
+        """
+        Score the model using test data.
+
+        :param test_data: Test data.
+        :type test_data: `Dataset`
+        :param scoring_method: The method for scoring predictions. Default is ACCURACY.
+        :type scoring_method: `ScoringMethod`, optional
+        :return: the score as float (for classifiers, between 0 and 1)
+        """
+        predicted = self._art_model.predict(test_data.get_samples())
+        y = check_and_transform_label_format(test_data.get_labels(), nb_classes=self.nb_classes)
+        if scoring_method == ScoringMethod.ACCURACY:
+            return np.count_nonzero(np.argmax(y, axis=1) == np.argmax(predicted, axis=1)) / predicted.shape[0]
+        else:
+            raise NotImplementedError
--- a/apt/utils/models/sklearn_model.py
+++ b/apt/utils/models/sklearn_model.py
@ -1,7 +1,5 @@
 from typing import Optional

-import numpy as np
-
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.base import BaseEstimator

@ -10,6 +8,7 @@ from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE

 from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier
 from art.estimators.regression.scikitlearn import ScikitlearnRegressor
+from art.utils import check_and_transform_label_format


 class SklearnModel(Model):
@ -54,12 +53,14 @@ class SklearnClassifier(SklearnModel):
        """
        Fit the model using the training data.

-        :param train_data: Training data.
+        :param train_data: Training data. Labels are expected to either be one-hot encoded or a 1D-array of categorical
+                           labels (consecutive integers starting at 0).
        :type train_data: `Dataset`
        :return: None
        """
-        encoder = OneHotEncoder(sparse=False)
-        y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1))
+        y = train_data.get_labels()
+        self.nb_classes = self.get_nb_classes(y)
+        y_encoded = check_and_transform_label_format(y, nb_classes=self.nb_classes)
        self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs)

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
@ -70,7 +71,7 @@ class SklearnClassifier(SklearnModel):
        :type x: `Dataset`
        :return: Predictions from the model as numpy array (class probabilities, if supported).
        """
-        return self._art_model.predict(x, **kwargs)
+        return self._art_model.predict(x.get_samples(), **kwargs)


 class SklearnRegressor(SklearnModel):
@ -112,4 +113,4 @@ class SklearnRegressor(SklearnModel):
        :type x: `Dataset`
        :return: Predictions from the model as numpy array.
        """
-        return self._art_model.predict(x, **kwargs)
+        return self._art_model.predict(x.get_samples(), **kwargs)
--- a/notebooks/attribute_inference_anonymization_nursery.ipynb
+++ b/notebooks/attribute_inference_anonymization_nursery.ipynb
@ -29,15 +29,198 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "data": {
-      "text/plain": "           parents     has_nurs        form children     housing     finance  \\\n8450   pretentious    very_crit      foster        1   less_conv  convenient   \n12147   great_pret    very_crit    complete        1    critical      inconv   \n2780         usual     critical    complete        4   less_conv  convenient   \n11924   great_pret     critical      foster        1    critical  convenient   \n59           usual       proper    complete        2  convenient  convenient   \n...            ...          ...         ...      ...         ...         ...   \n5193   pretentious  less_proper    complete        1  convenient      inconv   \n1375         usual  less_proper  incomplete        2   less_conv  convenient   \n10318   great_pret  less_proper      foster        4  convenient  convenient   \n6396   pretentious     improper   completed        3   less_conv  convenient   \n485          usual       proper  incomplete        1    critical      inconv   \n\n       social       health  \n8450        1    not_recom  \n12147       1  recommended  \n2780        1    not_recom  \n11924       1    not_recom  \n59          0    not_recom  \n...       ...          ...  \n5193        0  recommended  \n1375        1     priority  \n10318       0     priority  \n6396        1  recommended  \n485         1    not_recom  \n\n[10366 rows x 8 columns]",
-      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>parents</th>\n      <th>has_nurs</th>\n      <th>form</th>\n      <th>children</th>\n      <th>housing</th>\n      <th>finance</th>\n      <th>social</th>\n      <th>health</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>8450</th>\n      <td>pretentious</td>\n      <td>very_crit</td>\n      <td>foster</td>\n      <td>1</td>\n      <td>less_conv</td>\n      <td>convenient</td>\n      <td>1</td>\n      <td>not_recom</td>\n    </tr>\n    <tr>\n      <th>12147</th>\n      <td>great_pret</td>\n      <td>very_crit</td>\n      <td>complete</td>\n      <td>1</td>\n      <td>critical</td>\n      <td>inconv</td>\n      <td>1</td>\n      <td>recommended</td>\n    </tr>\n    <tr>\n      <th>2780</th>\n      <td>usual</td>\n      <td>critical</td>\n      <td>complete</td>\n      <td>4</td>\n      <td>less_conv</td>\n      <td>convenient</td>\n      <td>1</td>\n      <td>not_recom</td>\n    </tr>\n    <tr>\n      <th>11924</th>\n      <td>great_pret</td>\n      <td>critical</td>\n      <td>foster</td>\n      <td>1</td>\n      <td>critical</td>\n      <td>convenient</td>\n      <td>1</td>\n      <td>not_recom</td>\n    </tr>\n    <tr>\n      <th>59</th>\n      <td>usual</td>\n      <td>proper</td>\n      <td>complete</td>\n      <td>2</td>\n      <td>convenient</td>\n      <td>convenient</td>\n      <td>0</td>\n      <td>not_recom</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>5193</th>\n      
<td>pretentious</td>\n      <td>less_proper</td>\n      <td>complete</td>\n      <td>1</td>\n      <td>convenient</td>\n      <td>inconv</td>\n      <td>0</td>\n      <td>recommended</td>\n    </tr>\n    <tr>\n      <th>1375</th>\n      <td>usual</td>\n      <td>less_proper</td>\n      <td>incomplete</td>\n      <td>2</td>\n      <td>less_conv</td>\n      <td>convenient</td>\n      <td>1</td>\n      <td>priority</td>\n    </tr>\n    <tr>\n      <th>10318</th>\n      <td>great_pret</td>\n      <td>less_proper</td>\n      <td>foster</td>\n      <td>4</td>\n      <td>convenient</td>\n      <td>convenient</td>\n      <td>0</td>\n      <td>priority</td>\n    </tr>\n    <tr>\n      <th>6396</th>\n      <td>pretentious</td>\n      <td>improper</td>\n      <td>completed</td>\n      <td>3</td>\n      <td>less_conv</td>\n      <td>convenient</td>\n      <td>1</td>\n      <td>recommended</td>\n    </tr>\n    <tr>\n      <th>485</th>\n      <td>usual</td>\n      <td>proper</td>\n      <td>incomplete</td>\n      <td>1</td>\n      <td>critical</td>\n      <td>inconv</td>\n      <td>1</td>\n      <td>not_recom</td>\n    </tr>\n  </tbody>\n</table>\n<p>10366 rows × 8 columns</p>\n</div>"
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>parents</th>\n",
+       "      <th>has_nurs</th>\n",
+       "      <th>form</th>\n",
+       "      <th>children</th>\n",
+       "      <th>housing</th>\n",
+       "      <th>finance</th>\n",
+       "      <th>social</th>\n",
+       "      <th>health</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>8450</th>\n",
+       "      <td>pretentious</td>\n",
+       "      <td>very_crit</td>\n",
+       "      <td>foster</td>\n",
+       "      <td>1</td>\n",
+       "      <td>less_conv</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>1</td>\n",
+       "      <td>not_recom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12147</th>\n",
+       "      <td>great_pret</td>\n",
+       "      <td>very_crit</td>\n",
+       "      <td>complete</td>\n",
+       "      <td>1</td>\n",
+       "      <td>critical</td>\n",
+       "      <td>inconv</td>\n",
+       "      <td>1</td>\n",
+       "      <td>recommended</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2780</th>\n",
+       "      <td>usual</td>\n",
+       "      <td>critical</td>\n",
+       "      <td>complete</td>\n",
+       "      <td>4</td>\n",
+       "      <td>less_conv</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>1</td>\n",
+       "      <td>not_recom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11924</th>\n",
+       "      <td>great_pret</td>\n",
+       "      <td>critical</td>\n",
+       "      <td>foster</td>\n",
+       "      <td>1</td>\n",
+       "      <td>critical</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>1</td>\n",
+       "      <td>not_recom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>59</th>\n",
+       "      <td>usual</td>\n",
+       "      <td>proper</td>\n",
+       "      <td>complete</td>\n",
+       "      <td>2</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>0</td>\n",
+       "      <td>not_recom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5193</th>\n",
+       "      <td>pretentious</td>\n",
+       "      <td>less_proper</td>\n",
+       "      <td>complete</td>\n",
+       "      <td>1</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>inconv</td>\n",
+       "      <td>0</td>\n",
+       "      <td>recommended</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1375</th>\n",
+       "      <td>usual</td>\n",
+       "      <td>less_proper</td>\n",
+       "      <td>incomplete</td>\n",
+       "      <td>2</td>\n",
+       "      <td>less_conv</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>1</td>\n",
+       "      <td>priority</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10318</th>\n",
+       "      <td>great_pret</td>\n",
+       "      <td>less_proper</td>\n",
+       "      <td>foster</td>\n",
+       "      <td>4</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>0</td>\n",
+       "      <td>priority</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6396</th>\n",
+       "      <td>pretentious</td>\n",
+       "      <td>improper</td>\n",
+       "      <td>completed</td>\n",
+       "      <td>3</td>\n",
+       "      <td>less_conv</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>1</td>\n",
+       "      <td>recommended</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>485</th>\n",
+       "      <td>usual</td>\n",
+       "      <td>proper</td>\n",
+       "      <td>incomplete</td>\n",
+       "      <td>1</td>\n",
+       "      <td>critical</td>\n",
+       "      <td>inconv</td>\n",
+       "      <td>1</td>\n",
+       "      <td>not_recom</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>10366 rows × 8 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           parents     has_nurs        form children     housing     finance  \\\n",
+       "8450   pretentious    very_crit      foster        1   less_conv  convenient   \n",
+       "12147   great_pret    very_crit    complete        1    critical      inconv   \n",
+       "2780         usual     critical    complete        4   less_conv  convenient   \n",
+       "11924   great_pret     critical      foster        1    critical  convenient   \n",
+       "59           usual       proper    complete        2  convenient  convenient   \n",
+       "...            ...          ...         ...      ...         ...         ...   \n",
+       "5193   pretentious  less_proper    complete        1  convenient      inconv   \n",
+       "1375         usual  less_proper  incomplete        2   less_conv  convenient   \n",
+       "10318   great_pret  less_proper      foster        4  convenient  convenient   \n",
+       "6396   pretentious     improper   completed        3   less_conv  convenient   \n",
+       "485          usual       proper  incomplete        1    critical      inconv   \n",
+       "\n",
+       "       social       health  \n",
+       "8450        1    not_recom  \n",
+       "12147       1  recommended  \n",
+       "2780        1    not_recom  \n",
+       "11924       1    not_recom  \n",
+       "59          0    not_recom  \n",
+       "...       ...          ...  \n",
+       "5193        0  recommended  \n",
+       "1375        1     priority  \n",
+       "10318       0     priority  \n",
+       "6396        1  recommended  \n",
+       "485         1    not_recom  \n",
+       "\n",
+       "[10366 rows x 8 columns]"
+      ]
     },
-     "execution_count": 1,
+     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -47,9 +230,9 @@
    "import sys\n",
    "sys.path.insert(0, os.path.abspath('..'))\n",
    "\n",
-    "from apt.utils.dataset_utils import get_nursery_dataset\n",
+    "from apt.utils.dataset_utils import get_nursery_dataset_pd\n",
    "\n",
-    "(x_train, y_train), (x_test, y_test) = get_nursery_dataset(transform_social=True)\n",
+    "(x_train, y_train), (x_test, y_test) = get_nursery_dataset_pd(transform_social=True)\n",
    "\n",
    "x_train"
   ]
@ -63,7 +246,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
@ -78,11 +261,25 @@
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from sklearn.pipeline import Pipeline\n",
    "\n",
-    "x_train_str = x_train.astype(str)\n",
-    "train_encoded = OneHotEncoder(sparse=False).fit_transform(x_train_str)\n",
-    "x_test_str = x_test.astype(str)\n",
-    "test_encoded = OneHotEncoder(sparse=False).fit_transform(x_test_str)\n",
+    "numeric_features = ['social']\n",
+    "categorical_features = ['children', 'parents', 'has_nurs', 'form', 'housing', 'finance', 'health']\n",
+    "numeric_transformer = Pipeline(\n",
+    "    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n",
+    ")\n",
+    "categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n",
+    "preprocessor = ColumnTransformer(\n",
+    "    transformers=[\n",
+    "        (\"num\", numeric_transformer, numeric_features),\n",
+    "        (\"cat\", categorical_transformer, categorical_features),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "train_encoded = preprocessor.fit_transform(x_train)\n",
+    "test_encoded = preprocessor.transform(x_test)\n",
    "    \n",
    "model = DecisionTreeClassifier()\n",
    "model.fit(train_encoded, y_train)\n",
@ -104,14 +301,15 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from art.attacks.inference.attribute_inference import AttributeInferenceBlackBox\n",
    "\n",
-    "attack_feature = 20\n",
+    "# column index of 'social' after preprocessing: the ColumnTransformer emits its numeric ('num') columns first\n",
+    "attack_feature = 0\n",
    "\n",
    "# training data without attacked feature\n",
    "x_train_for_attack = np.delete(train_encoded, attack_feature, 1)\n",
@ -140,14 +338,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "1.0\n"
+      "0.6000385876905268\n"
     ]
    }
   ],
@ -155,7 +353,7 @@
    "# get inferred values\n",
    "values=[0, 1]\n",
    "\n",
-    "inferred_train_bb = bb_attack.infer(x_train_for_attack[attack_train_size:], x_train_predictions[attack_train_size:], values=values)\n",
+    "inferred_train_bb = bb_attack.infer(x_train_for_attack[attack_train_size:], pred=x_train_predictions[attack_train_size:], values=values)\n",
    "# check accuracy\n",
    "train_acc = np.sum(inferred_train_bb == np.around(x_train_feature[attack_train_size:], decimals=8).reshape(1,-1)) / len(inferred_train_bb)\n",
    "print(train_acc)"
@ -165,7 +363,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "This means that for 64% of the training set, the attacked feature is inferred correctly using this attack."
+    "This means that for 60% of the training set, the attacked feature is inferred correctly using this attack."
   ]
  },
  {
@ -178,14 +376,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "0.5122515917422342\n"
+      "0.6980513216284006\n"
     ]
    }
   ],
@ -225,15 +423,198 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "data": {
-      "text/plain": "           parents     has_nurs        form children     housing     finance  \\\n0      pretentious    very_crit      foster        1   less_conv  convenient   \n1       great_pret    very_crit    complete        1    critical      inconv   \n2            usual     critical    complete        4   less_conv  convenient   \n3       great_pret     critical      foster        1    critical  convenient   \n4            usual       proper    complete        2  convenient  convenient   \n...            ...          ...         ...      ...         ...         ...   \n10361  pretentious  less_proper    complete        1  convenient      inconv   \n10362        usual  less_proper  incomplete        2   less_conv  convenient   \n10363   great_pret  less_proper      foster        4  convenient  convenient   \n10364  pretentious     improper   completed        3   less_conv  convenient   \n10365        usual       proper  incomplete        1    critical  convenient   \n\n      social       health  \n0          0    not_recom  \n1          1  recommended  \n2          0    not_recom  \n3          0    not_recom  \n4          0    not_recom  \n...      ...          ...  \n10361      0  recommended  \n10362      1     priority  \n10363      0     priority  \n10364      1  recommended  \n10365      0    not_recom  \n\n[10366 rows x 8 columns]",
-      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>parents</th>\n      <th>has_nurs</th>\n      <th>form</th>\n      <th>children</th>\n      <th>housing</th>\n      <th>finance</th>\n      <th>social</th>\n      <th>health</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>pretentious</td>\n      <td>very_crit</td>\n      <td>foster</td>\n      <td>1</td>\n      <td>less_conv</td>\n      <td>convenient</td>\n      <td>0</td>\n      <td>not_recom</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>great_pret</td>\n      <td>very_crit</td>\n      <td>complete</td>\n      <td>1</td>\n      <td>critical</td>\n      <td>inconv</td>\n      <td>1</td>\n      <td>recommended</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>usual</td>\n      <td>critical</td>\n      <td>complete</td>\n      <td>4</td>\n      <td>less_conv</td>\n      <td>convenient</td>\n      <td>0</td>\n      <td>not_recom</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>great_pret</td>\n      <td>critical</td>\n      <td>foster</td>\n      <td>1</td>\n      <td>critical</td>\n      <td>convenient</td>\n      <td>0</td>\n      <td>not_recom</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>usual</td>\n      <td>proper</td>\n      <td>complete</td>\n      <td>2</td>\n      <td>convenient</td>\n      <td>convenient</td>\n      <td>0</td>\n      <td>not_recom</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>10361</th>\n      
<td>pretentious</td>\n      <td>less_proper</td>\n      <td>complete</td>\n      <td>1</td>\n      <td>convenient</td>\n      <td>inconv</td>\n      <td>0</td>\n      <td>recommended</td>\n    </tr>\n    <tr>\n      <th>10362</th>\n      <td>usual</td>\n      <td>less_proper</td>\n      <td>incomplete</td>\n      <td>2</td>\n      <td>less_conv</td>\n      <td>convenient</td>\n      <td>1</td>\n      <td>priority</td>\n    </tr>\n    <tr>\n      <th>10363</th>\n      <td>great_pret</td>\n      <td>less_proper</td>\n      <td>foster</td>\n      <td>4</td>\n      <td>convenient</td>\n      <td>convenient</td>\n      <td>0</td>\n      <td>priority</td>\n    </tr>\n    <tr>\n      <th>10364</th>\n      <td>pretentious</td>\n      <td>improper</td>\n      <td>completed</td>\n      <td>3</td>\n      <td>less_conv</td>\n      <td>convenient</td>\n      <td>1</td>\n      <td>recommended</td>\n    </tr>\n    <tr>\n      <th>10365</th>\n      <td>usual</td>\n      <td>proper</td>\n      <td>incomplete</td>\n      <td>1</td>\n      <td>critical</td>\n      <td>convenient</td>\n      <td>0</td>\n      <td>not_recom</td>\n    </tr>\n  </tbody>\n</table>\n<p>10366 rows × 8 columns</p>\n</div>"
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>parents</th>\n",
+       "      <th>has_nurs</th>\n",
+       "      <th>form</th>\n",
+       "      <th>children</th>\n",
+       "      <th>housing</th>\n",
+       "      <th>finance</th>\n",
+       "      <th>social</th>\n",
+       "      <th>health</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>pretentious</td>\n",
+       "      <td>very_crit</td>\n",
+       "      <td>foster</td>\n",
+       "      <td>1</td>\n",
+       "      <td>less_conv</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>0</td>\n",
+       "      <td>not_recom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>great_pret</td>\n",
+       "      <td>very_crit</td>\n",
+       "      <td>complete</td>\n",
+       "      <td>1</td>\n",
+       "      <td>critical</td>\n",
+       "      <td>inconv</td>\n",
+       "      <td>1</td>\n",
+       "      <td>recommended</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>usual</td>\n",
+       "      <td>critical</td>\n",
+       "      <td>complete</td>\n",
+       "      <td>4</td>\n",
+       "      <td>less_conv</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>0</td>\n",
+       "      <td>not_recom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>great_pret</td>\n",
+       "      <td>critical</td>\n",
+       "      <td>foster</td>\n",
+       "      <td>1</td>\n",
+       "      <td>critical</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>0</td>\n",
+       "      <td>not_recom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>usual</td>\n",
+       "      <td>proper</td>\n",
+       "      <td>complete</td>\n",
+       "      <td>2</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>0</td>\n",
+       "      <td>not_recom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10361</th>\n",
+       "      <td>pretentious</td>\n",
+       "      <td>less_proper</td>\n",
+       "      <td>complete</td>\n",
+       "      <td>1</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>0</td>\n",
+       "      <td>recommended</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10362</th>\n",
+       "      <td>usual</td>\n",
+       "      <td>less_proper</td>\n",
+       "      <td>incomplete</td>\n",
+       "      <td>2</td>\n",
+       "      <td>less_conv</td>\n",
+       "      <td>inconv</td>\n",
+       "      <td>0</td>\n",
+       "      <td>priority</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10363</th>\n",
+       "      <td>great_pret</td>\n",
+       "      <td>less_proper</td>\n",
+       "      <td>foster</td>\n",
+       "      <td>4</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>0</td>\n",
+       "      <td>priority</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10364</th>\n",
+       "      <td>pretentious</td>\n",
+       "      <td>improper</td>\n",
+       "      <td>completed</td>\n",
+       "      <td>3</td>\n",
+       "      <td>less_conv</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>0</td>\n",
+       "      <td>recommended</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10365</th>\n",
+       "      <td>usual</td>\n",
+       "      <td>proper</td>\n",
+       "      <td>incomplete</td>\n",
+       "      <td>1</td>\n",
+       "      <td>critical</td>\n",
+       "      <td>convenient</td>\n",
+       "      <td>0</td>\n",
+       "      <td>not_recom</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>10366 rows × 8 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "           parents     has_nurs        form children     housing     finance  \\\n",
+       "0      pretentious    very_crit      foster        1   less_conv  convenient   \n",
+       "1       great_pret    very_crit    complete        1    critical      inconv   \n",
+       "2            usual     critical    complete        4   less_conv  convenient   \n",
+       "3       great_pret     critical      foster        1    critical  convenient   \n",
+       "4            usual       proper    complete        2  convenient  convenient   \n",
+       "...            ...          ...         ...      ...         ...         ...   \n",
+       "10361  pretentious  less_proper    complete        1  convenient  convenient   \n",
+       "10362        usual  less_proper  incomplete        2   less_conv      inconv   \n",
+       "10363   great_pret  less_proper      foster        4  convenient  convenient   \n",
+       "10364  pretentious     improper   completed        3   less_conv  convenient   \n",
+       "10365        usual       proper  incomplete        1    critical  convenient   \n",
+       "\n",
+       "      social       health  \n",
+       "0          0    not_recom  \n",
+       "1          1  recommended  \n",
+       "2          0    not_recom  \n",
+       "3          0    not_recom  \n",
+       "4          0    not_recom  \n",
+       "...      ...          ...  \n",
+       "10361      0  recommended  \n",
+       "10362      0     priority  \n",
+       "10363      0     priority  \n",
+       "10364      0  recommended  \n",
+       "10365      0    not_recom  \n",
+       "\n",
+       "[10366 rows x 8 columns]"
+      ]
     },
-     "execution_count": 6,
+     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -244,24 +625,24 @@
    "\n",
    "features = x_train.columns\n",
    "QI = [\"finance\", \"social\", \"health\"]\n",
-    "categorical_features = [\"parents\", \"has_nurs\", \"form\", \"housing\", \"finance\", \"health\", 'children']\n",
-    "QI_indexes = [i for i, v in enumerate(features) if v in QI]\n",
-    "categorical_features_indexes = [i for i, v in enumerate(features) if v in categorical_features]\n",
-    "anonymizer = Anonymize(100, QI_indexes, categorical_features=categorical_features_indexes)\n",
+    "\n",
+    "anonymizer = Anonymize(100, QI, categorical_features=categorical_features)\n",
    "anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
-    "anon\n"
+    "anon"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "data": {
-      "text/plain": "7585"
+      "text/plain": [
+       "7585"
+      ]
     },
-     "execution_count": 7,
+     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -273,14 +654,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 128,
   "metadata": {},
   "outputs": [
    {
     "data": {
-      "text/plain": "5766"
+      "text/plain": [
+       "3001"
+      ]
     },
-     "execution_count": 8,
+     "execution_count": 128,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -299,20 +682,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Anonymized model accuracy:  0.9976851851851852\n"
+      "Anonymized model accuracy:  0.9054783950617284\n"
     ]
    }
   ],
   "source": [
-    "anon_str = anon.astype(str)\n",
-    "anon_encoded = OneHotEncoder(sparse=False).fit_transform(anon_str)\n",
+    "anon_encoded = preprocessor.fit_transform(anon)\n",
+    "test_encoded = preprocessor.transform(x_test)\n",
    "\n",
    "anon_model = DecisionTreeClassifier()\n",
    "anon_model.fit(anon_encoded, y_train)\n",
@ -332,18 +715,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "1.0\n"
+      "0.5813235577850666\n"
     ]
    }
   ],
   "source": [
+    "# training data without attacked feature\n",
+    "x_train_for_attack = np.delete(train_encoded, attack_feature, 1)\n",
+    "# only attacked feature\n",
+    "x_train_feature = train_encoded[:, attack_feature].copy().reshape(-1, 1)\n",
+    "\n",
    "anon_bb_attack = AttributeInferenceBlackBox(anon_art_classifier, attack_feature=attack_feature)\n",
    "\n",
    "# get original model's predictions\n",
@ -353,7 +741,7 @@
    "anon_bb_attack.fit(train_encoded[:attack_train_size])\n",
    "\n",
    "# get inferred values\n",
-    "inferred_train_anon_bb = anon_bb_attack.infer(x_train_for_attack[attack_train_size:], anon_x_train_predictions[attack_train_size:], values=values)\n",
+    "inferred_train_anon_bb = anon_bb_attack.infer(x_train_for_attack[attack_train_size:], pred=anon_x_train_predictions[attack_train_size:], values=values)\n",
    "# check accuracy\n",
    "train_acc = np.sum(inferred_train_anon_bb == np.around(x_train_feature[attack_train_size:], decimals=8).reshape(1,-1)) / len(inferred_train_anon_bb)\n",
    "print(train_acc)"
@ -368,14 +756,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "0.5245996527107852\n"
+      "0.6857032606598495\n"
     ]
    }
   ],
@ -399,15 +787,15 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "(0.49415432579890883, 0.48976438779451525)\n",
-      "(0.49415432579890883, 0.48976438779451525)\n"
+      "(0.3353658536585366, 0.22540983606557377)\n",
+      "(0.3354908306364617, 0.18208430913348947)\n"
     ]
    }
   ],
@ -444,15 +832,15 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "(1.0, 0.019204655674102813)\n",
-      "(0.9829787234042553, 0.04481086323957323)\n"
+      "(0.6457357075913777, 0.2002324905550712)\n",
+      "(0.6384266263237519, 0.12263876780005813)\n"
     ]
    }
   ],
@ -483,24 +871,26 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
-    "anonymizer2 = Anonymize(1000, QI_indexes, categorical_features=categorical_features_indexes)\n",
+    "anonymizer2 = Anonymize(1000, QI, categorical_features=categorical_features)\n",
    "anon2 = anonymizer2.anonymize(ArrayDataset(x_train, x_train_predictions))"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "data": {
-      "text/plain": "4226"
+      "text/plain": [
+       "1727"
+      ]
     },
-     "execution_count": 15,
+     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -519,20 +909,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Anonymized model accuracy:  0.9930555555555556\n"
+      "Anonymized model accuracy:  0.8981481481481481\n"
     ]
    }
   ],
   "source": [
-    "anon2_str = anon2.astype(str)\n",
-    "anon2_encoded = OneHotEncoder(sparse=False).fit_transform(anon2_str)\n",
+    "anon2_encoded = preprocessor.fit_transform(anon2)\n",
+    "test_encoded = preprocessor.transform(x_test)\n",
    "\n",
    "anon2_model = DecisionTreeClassifier()\n",
    "anon2_model.fit(anon2_encoded, y_train)\n",
@ -552,18 +942,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 137,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "1.0\n"
+      "0.546015820953116\n"
     ]
    }
   ],
   "source": [
+    "# training data without attacked feature\n",
+    "x_train_for_attack = np.delete(train_encoded, attack_feature, 1)\n",
+    "# only attacked feature\n",
+    "x_train_feature = train_encoded[:, attack_feature].copy().reshape(-1, 1)\n",
+    "\n",
    "anon2_bb_attack = AttributeInferenceBlackBox(anon2_art_classifier, attack_feature=attack_feature)\n",
    "\n",
    "# get original model's predictions\n",
@ -573,7 +968,7 @@
    "anon2_bb_attack.fit(train_encoded[:attack_train_size])\n",
    "\n",
    "# get inferred values\n",
-    "inferred_train_anon2_bb = anon2_bb_attack.infer(x_train_for_attack[attack_train_size:], anon2_x_train_predictions[attack_train_size:], values=values)\n",
+    "inferred_train_anon2_bb = anon2_bb_attack.infer(x_train_for_attack[attack_train_size:], pred=anon2_x_train_predictions[attack_train_size:], values=values)\n",
    "# check accuracy\n",
    "train_acc = np.sum(inferred_train_anon2_bb == np.around(x_train_feature[attack_train_size:], decimals=8).reshape(1,-1)) / len(inferred_train_anon2_bb)\n",
    "print(train_acc)"
@ -588,14 +983,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "0.515820953115956\n"
+      "0.6680493922438742\n"
     ]
    }
   ],
@ -612,17 +1007,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "(0.49415432579890883, 0.48976438779451525)\n",
-      "(0.49415432579890883, 0.48976438779451525)\n",
-      "(1.0, 0.019204655674102813)\n",
-      "(1.0, 0.026382153249272552)\n"
+      "(0.3353658536585366, 0.22540983606557377)\n",
+      "(0.32242990654205606, 0.16159250585480095)\n",
+      "(0.6457357075913777, 0.2002324905550712)\n",
+      "(1, 0.0)\n"
     ]
    }
   ],
@ -655,26 +1050,27 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 140,
   "metadata": {},
   "outputs": [],
   "source": [
    "QI2 = [\"parents\", \"has_nurs\", \"form\", \"children\", \"housing\", \"finance\", \"social\", \"health\"]\n",
-    "QI2_indexes = [i for i, v in enumerate(features) if v in QI2]\n",
-    "anonymizer3 = Anonymize(100, QI2_indexes, categorical_features=categorical_features_indexes)\n",
+    "anonymizer3 = Anonymize(100, QI2, categorical_features=categorical_features)\n",
    "anon3 = anonymizer3.anonymize(ArrayDataset(x_train, x_train_predictions))"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "data": {
-      "text/plain": "39"
+      "text/plain": [
+       "39"
+      ]
     },
-     "execution_count": 21,
+     "execution_count": 141,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -686,22 +1082,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 142,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Anonymized model accuracy:  0.751929012345679\n",
-      "BB attack accuracy:  1.0\n",
-      "WB attack accuracy:  0.5187150299054601\n"
+      "Anonymized model accuracy:  0.7600308641975309\n",
+      "BB attack accuracy:  0.5330889446266641\n",
+      "WB attack accuracy:  0.6680493922438742\n"
     ]
    }
   ],
   "source": [
-    "anon3_str = anon3.astype(str)\n",
-    "anon3_encoded = OneHotEncoder(sparse=False).fit_transform(anon3_str)\n",
+    "anon3_encoded = preprocessor.fit_transform(anon3)\n",
+    "test_encoded = preprocessor.transform(x_test)\n",
    "\n",
    "anon3_model = DecisionTreeClassifier()\n",
    "anon3_model.fit(anon3_encoded, y_train)\n",
@ -710,6 +1106,11 @@
    "\n",
    "print('Anonymized model accuracy: ', anon3_model.score(test_encoded, y_test))\n",
    "\n",
+    "# training data without attacked feature\n",
+    "x_train_for_attack = np.delete(train_encoded, attack_feature, 1)\n",
+    "# only attacked feature\n",
+    "x_train_feature = train_encoded[:, attack_feature].copy().reshape(-1, 1)\n",
+    "\n",
    "anon3_bb_attack = AttributeInferenceBlackBox(anon3_art_classifier, attack_feature=attack_feature)\n",
    "\n",
    "# get original model's predictions\n",
@ -719,7 +1120,7 @@
    "anon3_bb_attack.fit(train_encoded[:attack_train_size])\n",
    "\n",
    "# get inferred values\n",
-    "inferred_train_anon3_bb = anon3_bb_attack.infer(x_train_for_attack[attack_train_size:], anon3_x_train_predictions[attack_train_size:], values=values)\n",
+    "inferred_train_anon3_bb = anon3_bb_attack.infer(x_train_for_attack[attack_train_size:], pred=anon3_x_train_predictions[attack_train_size:], values=values)\n",
    "# check accuracy\n",
    "train_acc = np.sum(inferred_train_anon3_bb == np.around(x_train_feature[attack_train_size:], decimals=8).reshape(1,-1)) / len(inferred_train_anon2_bb)\n",
    "print('BB attack accuracy: ', train_acc)\n",
@ -736,17 +1137,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "(0.49415432579890883, 0.48976438779451525)\n",
-      "(0.49415432579890883, 0.48976438779451525)\n",
-      "(1.0, 0.019204655674102813)\n",
-      "(1.0, 0.032201745877788554)\n"
+      "(0.3353658536585366, 0.22540983606557377)\n",
+      "(0.344644750795334, 0.19028103044496486)\n",
+      "(0.6457357075913777, 0.2002324905550712)\n",
+      "(1, 0.0)\n"
     ]
    }
   ],
@ -793,4 +1194,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 2
-}
+}
--- a/notebooks/minimization_adult.ipynb
+++ b/notebooks/minimization_adult.ipynb
@ -27,7 +27,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
@ -42,18 +42,6 @@
      " [2.2000e+01 9.0000e+00 0.0000e+00 0.0000e+00 2.0000e+01]\n",
      " [5.2000e+01 9.0000e+00 1.5024e+04 0.0000e+00 4.0000e+01]]\n"
     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:22: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
-      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
-      "  y_train = y_train.astype(np.int)\n",
-      "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
-      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
-      "  y_test = y_test.astype(np.int)\n"
-     ]
    }
   ],
   "source": [
@ -96,24 +84,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Base model accuracy:  0.8183158282660771\n"
+      "Base model accuracy:  0.8190528837295007\n"
     ]
    }
   ],
   "source": [
+    "import os\n",
+    "import sys\n",
+    "sys.path.insert(0, os.path.abspath('..'))\n",
+    "\n",
    "from apt.utils.datasets import ArrayDataset\n",
    "from apt.utils.models import SklearnClassifier, ModelOutputType\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "base_est = DecisionTreeClassifier()\n",
-    "model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)\n",
+    "model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)\n",
    "model.fit(ArrayDataset(x_train, y_train))\n",
    "\n",
    "print('Base model accuracy: ', model.score(ArrayDataset(x_test, y_test)))"
@ -129,34 +121,30 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
+      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.920665\n",
      "Improving accuracy\n",
-      "feature to remove: 2\n",
-      "Removed feature: 2, new relative accuracy: 0.935261\n",
-      "feature to remove: 4\n",
-      "Removed feature: 4, new relative accuracy: 0.946776\n",
-      "feature to remove: 0\n",
-      "Removed feature: 0, new relative accuracy: 0.972876\n",
      "feature to remove: 1\n",
-      "Removed feature: 1, new relative accuracy: 0.992835\n",
+      "Removed feature: 1, new relative accuracy: 0.920026\n",
+      "feature to remove: 0\n",
+      "Removed feature: 0, new relative accuracy: 0.938580\n",
+      "feature to remove: 4\n",
+      "Removed feature: 4, new relative accuracy: 0.987204\n",
+      "feature to remove: 2\n",
+      "Removed feature: 2, new relative accuracy: 0.992962\n",
      "feature to remove: 3\n",
      "Removed feature: 3, new relative accuracy: 1.000000\n",
-      "Accuracy on minimized data:  0.8231229847996315\n"
+      "Accuracy on minimized data:  0.8165771297006907\n"
     ]
    }
   ],
   "source": [
-    "import sys\n",
-    "import os\n",
-    "sys.path.insert(0, os.path.abspath('..'))\n",
-    "\n",
    "from apt.minimization import GeneralizeToRepresentative\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
@ -169,7 +157,7 @@
    "# Don't forget to leave a hold-out set for final validation!\n",
    "X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
    "                                                                test_size = 0.4, random_state = 38)\n",
-    "x_train_predictions = model.predict(X_generalizer_train)\n",
+    "x_train_predictions = model.predict(ArrayDataset(X_generalizer_train))\n",
    "if x_train_predictions.shape[1] > 1:\n",
    "    x_train_predictions = np.argmax(x_train_predictions, axis=1)\n",
    "minimizer.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
@ -187,14 +175,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "{'ranges': {}, 'categories': {}, 'untouched': ['4', '1', '3', '0', '2']}\n"
+      "{'ranges': {}, 'categories': {}, 'untouched': ['2', '4', '3', '1', '0']}\n"
     ]
    }
   ],
@ -214,25 +202,25 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
+      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.920665\n",
      "Improving accuracy\n",
-      "feature to remove: 2\n",
-      "Removed feature: 2, new relative accuracy: 0.935261\n",
-      "feature to remove: 4\n",
-      "Removed feature: 4, new relative accuracy: 0.946776\n",
-      "feature to remove: 0\n",
-      "Removed feature: 0, new relative accuracy: 0.972876\n",
      "feature to remove: 1\n",
-      "Removed feature: 1, new relative accuracy: 0.992835\n",
-      "Accuracy on minimized data:  0.8192845079072624\n",
-      "{'ranges': {'3': [569.0, 782.0, 870.0, 870.5, 938.0, 1016.5, 1311.5, 1457.0, 1494.5, 1596.0, 1629.5, 1684.0, 1805.0, 1859.0, 1867.5, 1881.5, 1938.0, 1978.5, 2119.0, 2210.0, 2218.0, 2244.5, 2298.5, 2443.5]}, 'categories': {}, 'untouched': ['2', '1', '0', '4']}\n"
+      "Removed feature: 1, new relative accuracy: 0.920026\n",
+      "feature to remove: 0\n",
+      "Removed feature: 0, new relative accuracy: 0.938580\n",
+      "feature to remove: 4\n",
+      "Removed feature: 4, new relative accuracy: 0.987204\n",
+      "feature to remove: 2\n",
+      "Removed feature: 2, new relative accuracy: 0.992962\n",
+      "Accuracy on minimized data:  0.8100537221795856\n",
+      "{'ranges': {'3': [704.0, 782.0, 870.0, 951.0, 1588.0, 1647.5, 1684.0, 1805.0, 1923.0, 2168.5]}, 'categories': {}, 'untouched': ['2', '4', '1', '0']}\n"
     ]
    }
   ],
@ -276,4 +264,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 2
-}
+}
--- a/notebooks/minimization_diabetes_reg.ipynb
+++ b/notebooks/minimization_diabetes_reg.ipynb
@ -14,31 +14,33 @@
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "In this tutorial we will show how to perform data minimization for regression ML models using the minimization module.\n",
    "\n",
    "We will show how to apply data minimization to different trained regression models."
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "markdown",
-   "source": [
-    "## Load data\n",
-    "QI parameter determines which features will be minimized."
-   ],
   "metadata": {
-    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
-   }
+   },
+   "source": [
+    "## Load data\n",
+    "QI parameter determines which features will be minimized."
+   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 7,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_diabetes\n",
@ -49,27 +51,24 @@
    "\n",
    "features = ['age', 'sex', 'bmi', 'bp',\n",
    "                's1', 's2', 's3', 's4', 's5', 's6']\n",
-    "QI = [0, 2, 5, 8, 9]"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   }
+    "QI = ['age', 'bmi', 's2', 's5', 's6']"
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "## Train DecisionTreeRegressor model"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 8,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [
    {
     "name": "stdout",
@ -86,27 +85,24 @@
    "model1 = DecisionTreeRegressor(random_state=10, min_samples_split=2)\n",
    "model1.fit(X_train, y_train)\n",
    "print('Base model accuracy (R2 score): ', model1.score(X_test, y_test))"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "## Run minimization\n",
    "We will try to run minimization with only a subset of the features."
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 9,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [
    {
     "name": "stdout",
@ -121,14 +117,14 @@
      "feature to remove: bmi\n",
      "Removed feature: bmi, new relative accuracy: 0.718978\n",
      "Accuracy on minimized data:  0.11604533946025941\n",
-      "generalizations:  {'ranges': {'age': [-0.07090024650096893, -0.043656209483742714, -0.041839939542114735, -0.03639113181270659, -0.01459590089507401, -0.012779632292222232, -0.009147093165665865, -0.0036982858437113464, 0.03989217430353165, 0.039892176166176796, 0.05623859912157059, 0.06713621318340302], 's2': [-0.0550188384950161, -0.0285577941685915, -0.024643437936902046, -0.02135537937283516, -0.013683241792023182, -0.006480826530605555, 0.009176596067845821, 0.023111702874302864, 0.02420772146433592, 0.02655633445829153, 0.039082273840904236]}, 'categories': {}, 'untouched': ['s3', 'bmi', 's6', 'bp', 's4', 's5', 'sex', 's1']}\n"
+      "generalizations:  {'ranges': {'age': [-0.07090024650096893, -0.043656209483742714, -0.041839939542114735, -0.03639113181270659, -0.01459590089507401, -0.012779632292222232, -0.009147093165665865, -0.0036982858437113464, 0.03989217430353165, 0.039892176166176796, 0.05623859912157059, 0.06713621318340302], 's2': [-0.0550188384950161, -0.0285577941685915, -0.024643437936902046, -0.02135537937283516, -0.013683241792023182, -0.006480826530605555, 0.009176596067845821, 0.023111702874302864, 0.02420772146433592, 0.02655633445829153, 0.039082273840904236]}, 'categories': {}, 'untouched': ['s3', 's6', 's5', 'bp', 'bmi', 's4', 's1', 'sex']}\n"
     ]
    }
   ],
   "source": [
    "# note that is_regression param is True\n",
    "\n",
-    "minimizer1 = GeneralizeToRepresentative(model1, target_accuracy=0.7, features=features, is_regression=True,\n",
+    "minimizer1 = GeneralizeToRepresentative(model1, target_accuracy=0.7, is_regression=True,\n",
    "                                    features_to_minimize=QI)\n",
    "\n",
    "# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
@ -139,32 +135,40 @@
    "                                                                test_size = 0.4, random_state = 38)\n",
    "\n",
    "x_train_predictions1 = model1.predict(X_generalizer_train1)\n",
-    "minimizer1.fit(X_generalizer_train1, x_train_predictions1)\n",
-    "transformed1 = minimizer1.transform(x_test1)\n",
+    "minimizer1.fit(X_generalizer_train1, x_train_predictions1, features_names=features)\n",
+    "transformed1 = minimizer1.transform(x_test1, features_names=features)\n",
    "print('Accuracy on minimized data: ', model1.score(transformed1, y_test1))\n",
    "print('generalizations: ',minimizer1.generalizations_)#%% md"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   }
+   ]
  },
  {
   "cell_type": "markdown",
-   "source": [
-    "## Train linear regression model"
-   ],
   "metadata": {
-    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
-   }
+   },
+   "source": [
+    "## Train linear regression model"
+   ]
  },
  {
   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Base model accuracy (R2 score):  0.5080618258593723\n"
+     ]
+    }
+   ],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "from apt.minimization import GeneralizeToRepresentative\n",
@ -172,49 +176,42 @@
    "model2 = LinearRegression()\n",
    "model2.fit(X_train, y_train)\n",
    "print('Base model accuracy (R2 score): ', model2.score(X_test, y_test))"
-   ],
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run minimization\n",
+    "We will try to run minimization with only a subset of the features."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
   "metadata": {
-    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   },
-   "execution_count": null,
-   "outputs": []
-  },
-  {
-   "cell_type": "markdown",
-   "source": [
-    "## Run minimization\n",
-    "We will try to run minimization with only a subset of the features."
-   ],
-   "metadata": {
-    "collapsed": false
-   }
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 58,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.225782\n",
+      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.355377\n",
      "Improving accuracy\n",
-      "feature to remove: age\n",
-      "Removed feature: age, new relative accuracy: 0.223565\n",
      "feature to remove: s2\n",
-      "Removed feature: s2, new relative accuracy: 0.759788\n",
-      "Accuracy on minimized data:  0.4414329261774286\n",
-      "generalizations:  {'ranges': {'bmi': [-0.0660245232284069, -0.06171327643096447, -0.048779530450701714, -0.036923596635460854, -0.022912041284143925, -0.015906263142824173, -0.009978296235203743, 0.007266696775332093, 0.022356065921485424, 0.028822937980294228, 0.04499012045562267, 0.053073709830641747, 0.10103634744882584], 's5': [-0.08940735459327698, -0.07823517918586731, -0.07310866191983223, -0.07022909820079803, -0.06740894541144371, -0.06558558344841003, -0.041897499933838844, -0.04049498960375786, -0.03781316243112087, -0.033939776942133904, -0.03263746201992035, -0.02538660168647766, -0.023219254799187183, -0.017585186287760735, -0.016525186598300934, -0.008522996446117759, 0.0015758189256303012, 0.012934560421854258, 0.014069339726120234, 0.015929921995848417, 0.01947084255516529, 0.028651678003370762, 0.03358383011072874, 0.03639278281480074, 0.041416410356760025, 0.06386702693998814], 's6': [-0.07356456853449345, -0.052854035049676895, -0.048711927607655525, -0.0383566590026021, -0.02800139266764745, -0.021788232028484344, -0.007290858076885343, -0.007290857844054699, 0.017561784014105797, 0.02377494378015399, 0.02791705122217536, 0.02998810407007113, 0.054840744473040104]}, 'categories': {}, 'untouched': ['s2', 's3', 'bp', 's4', 'age', 'sex', 's1']}\n"
+      "Removed feature: s2, new relative accuracy: 0.773233\n",
+      "Accuracy on minimized data:  0.3945625296515525\n",
+      "generalizations:  {'ranges': {'age': [-0.06181889958679676, -0.027309785597026348, -0.012779631884768605, -0.0036982858437113464, -0.001882016658782959, 0.0035667913034558296, 0.01991321425884962, 0.021729483967646956, 0.02717829099856317, 0.04534098319709301, 0.05805486813187599], 'bmi': [-0.0660245232284069, -0.06171327643096447, -0.048779530450701714, -0.036923596635460854, -0.022912041284143925, -0.015906263142824173, -0.009978296235203743, 0.007266696775332093, 0.022356065921485424, 0.028822937980294228, 0.04499012045562267, 0.04876246117055416, 0.053073709830641747, 0.10103634744882584], 's5': [-0.08940735459327698, -0.07823517918586731, -0.07310866191983223, -0.07022909820079803, -0.06740894541144371, -0.06558558344841003, -0.041897499933838844, -0.03781316243112087, -0.033939776942133904, -0.03263746201992035, -0.02538660168647766, -0.023219254799187183, -0.017585186287760735, -0.016525186598300934, -0.008522996446117759, -0.0048803192912600935, 0.0002040128456428647, 0.0015758189256303012, 0.008132445393130183, 0.012934560421854258, 0.014069339726120234, 0.015929921995848417, 0.01947084255516529, 0.028651678003370762, 0.03358383011072874, 0.03639278281480074, 0.041416410356760025], 's6': [-0.07356456853449345, -0.052854035049676895, -0.048711927607655525, -0.044569820165634155, -0.0383566590026021, -0.021788232028484344, -0.017646125052124262, -0.013504017610102892, 0.02377494378015399, 0.06519601307809353, 0.08383549377322197]}, 'categories': {}, 'untouched': ['s3', 's2', 'bp', 's4', 's1', 'sex']}\n"
     ]
    }
   ],
   "source": [
    "# note that is_regression param is True\n",
    "\n",
-    "minimizer2 = GeneralizeToRepresentative(model2, target_accuracy=0.7, features=features, is_regression=True,\n",
+    "minimizer2 = GeneralizeToRepresentative(model2, target_accuracy=0.7, is_regression=True,\n",
    "                                    features_to_minimize=QI)\n",
    "\n",
    "# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
@ -225,17 +222,11 @@
    "                                                                test_size = 0.4, random_state = 38)\n",
    "\n",
    "x_train_predictions2 = model2.predict(X_generalizer_train2)\n",
-    "minimizer2.fit(X_generalizer_train2, x_train_predictions2)\n",
-    "transformed2 = minimizer2.transform(x_test2)\n",
+    "minimizer2.fit(X_generalizer_train2, x_train_predictions2, features_names=features)\n",
+    "transformed2 = minimizer2.transform(x_test2, features_names=features)\n",
    "print('Accuracy on minimized data: ', model2.score(transformed2, y_test2))\n",
    "print('generalizations: ',minimizer2.generalizations_)"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   }
+   ]
  }
 ],
 "metadata": {
@ -247,16 +238,16 @@
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
-    "version": 2
+    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.6"
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 0
-}
+ "nbformat_minor": 1
+}
--- a/notebooks/minimization_german_credit.ipynb
+++ b/notebooks/minimization_german_credit.ipynb
@ -2,37 +2,36 @@
 "cells": [
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "# Applying data minimization with categorical data and only a subset of the features to a trained ML model"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n",
    "\n",
    "This will be demonstrated using the German Credit dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data)."
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "## Load data\n",
    "QI parameter determines which features will be minimized."
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [
    {
     "name": "stdout",
@ -108,9 +107,13 @@
    }
   ],
   "source": [
-    "from apt.utils import get_german_credit_dataset\n",
+    "import os\n",
+    "import sys\n",
+    "sys.path.insert(0, os.path.abspath('..'))\n",
    "\n",
-    "(x_train, y_train), (x_test, y_test) = get_german_credit_dataset()\n",
+    "from apt.utils.dataset_utils import get_german_credit_dataset_pd\n",
+    "\n",
+    "(x_train, y_train), (x_test, y_test) = get_german_credit_dataset_pd()\n",
    "features = [\"Existing_checking_account\", \"Duration_in_month\", \"Credit_history\", \"Purpose\", \"Credit_amount\",\n",
    "                \"Savings_account\", \"Present_employment_since\", \"Installment_rate\", \"Personal_status_sex\", \"debtors\",\n",
    "                \"Present_residence\", \"Property\", \"Age\", \"Other_installment_plans\", \"Housing\",\n",
@ -123,33 +126,30 @@
    "      \"Housing\", \"Job\"]\n",
    "\n",
    "print(x_train)"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "## Train decision tree model\n",
    "We use OneHotEncoder to handle categorical features."
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Base model accuracy:  0.7033333333333334\n"
+      "Base model accuracy:  0.6933333333333334\n"
     ]
    }
   ],
@ -176,50 +176,47 @@
    "\n",
    "encoded_test = preprocessor.transform(x_test)\n",
    "print('Base model accuracy: ', model.score(encoded_test, y_test))"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "## Run minimization\n",
    "We will try to run minimization with categorical features and only a subset of the features with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)."
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n",
+      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.805556\n",
      "Improving accuracy\n",
-      "feature to remove: Property\n",
-      "Removed feature: Property, new relative accuracy: 0.819444\n",
-      "feature to remove: Other_installment_plans\n",
-      "Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n",
-      "feature to remove: Job\n",
-      "Removed feature: Job, new relative accuracy: 0.833333\n",
-      "feature to remove: Housing\n",
-      "Removed feature: Housing, new relative accuracy: 0.833333\n",
-      "feature to remove: Purpose\n",
-      "Removed feature: Purpose, new relative accuracy: 0.916667\n",
      "feature to remove: Credit_history\n",
-      "Removed feature: Credit_history, new relative accuracy: 0.930556\n",
-      "feature to remove: debtors\n",
-      "Removed feature: debtors, new relative accuracy: 0.944444\n",
+      "Removed feature: Credit_history, new relative accuracy: 0.819444\n",
+      "feature to remove: Other_installment_plans\n",
+      "Removed feature: Other_installment_plans, new relative accuracy: 0.847222\n",
      "feature to remove: Duration_in_month\n",
-      "Removed feature: Duration_in_month, new relative accuracy: 1.000000\n",
+      "Removed feature: Duration_in_month, new relative accuracy: 0.847222\n",
+      "feature to remove: Property\n",
+      "Removed feature: Property, new relative accuracy: 0.847222\n",
+      "feature to remove: Housing\n",
+      "Removed feature: Housing, new relative accuracy: 0.847222\n",
+      "feature to remove: Purpose\n",
+      "Removed feature: Purpose, new relative accuracy: 0.986111\n",
+      "feature to remove: debtors\n",
+      "Removed feature: debtors, new relative accuracy: 0.986111\n",
+      "feature to remove: Job\n",
+      "Removed feature: Job, new relative accuracy: 1.000000\n",
      "Accuracy on minimized data:  0.6666666666666666\n"
     ]
    }
@ -233,7 +230,7 @@
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# default target_accuracy is 0.998\n",
-    "minimizer = GeneralizeToRepresentative(model, features=features,\n",
+    "minimizer = GeneralizeToRepresentative(model, \n",
    "                                     categorical_features=categorical_features, features_to_minimize=QI)\n",
    "\n",
    "# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
@ -248,117 +245,103 @@
    "y_test.reset_index(drop=True, inplace=True)\n",
    "encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n",
    "x_train_predictions = model.predict(encoded_generalizer_train)\n",
-    "minimizer.fit(X_generalizer_train, x_train_predictions)\n",
-    "transformed = minimizer.transform(x_test)\n",
+    "minimizer.fit(X_generalizer_train, x_train_predictions, features_names=features)\n",
+    "transformed = minimizer.transform(x_test, features_names=features)\n",
    "\n",
    "encoded_transformed = preprocessor.transform(transformed)\n",
    "print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test))"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "#### Let's see what features were generalized"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "{'ranges': {}, 'categories': {}, 'untouched': ['Purpose', 'Present_residence', 'Credit_history', 'Telephone', 'Job', 'Housing', 'Installment_rate', 'Number_of_existing_credits', 'Foreign_worker', 'Existing_checking_account', 'Other_installment_plans', 'N_people_being_liable_provide_maintenance', 'Property', 'Savings_account', 'Present_employment_since', 'Personal_status_sex', 'Duration_in_month', 'debtors', 'Credit_amount', 'Age']}\n"
+      "{'ranges': {}, 'categories': {}, 'untouched': ['Foreign_worker', 'Other_installment_plans', 'Existing_checking_account', 'Purpose', 'debtors', 'Housing', 'N_people_being_liable_provide_maintenance', 'Present_employment_since', 'Installment_rate', 'Credit_history', 'Property', 'Present_residence', 'Age', 'Credit_amount', 'Duration_in_month', 'Job', 'Personal_status_sex', 'Number_of_existing_credits', 'Savings_account', 'Telephone']}\n"
     ]
    }
   ],
   "source": [
    "generalizations = minimizer.generalizations\n",
    "print(generalizations)"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
    "We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
    "\n",
    "Let's change to a slightly lower target accuracy."
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.791667\n",
+      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.805556\n",
      "Improving accuracy\n",
-      "feature to remove: Property\n",
-      "Removed feature: Property, new relative accuracy: 0.819444\n",
-      "feature to remove: Other_installment_plans\n",
-      "Removed feature: Other_installment_plans, new relative accuracy: 0.833333\n",
-      "feature to remove: Job\n",
-      "Removed feature: Job, new relative accuracy: 0.833333\n",
-      "feature to remove: Housing\n",
-      "Removed feature: Housing, new relative accuracy: 0.833333\n",
-      "feature to remove: Purpose\n",
-      "Removed feature: Purpose, new relative accuracy: 0.916667\n",
      "feature to remove: Credit_history\n",
-      "Removed feature: Credit_history, new relative accuracy: 0.930556\n",
-      "Accuracy on minimized data:  0.6416666666666667\n",
-      "{'ranges': {'Duration_in_month': [7.0, 8.5, 11.0, 13.0, 14.0, 18.0, 23.0, 25.5, 34.5, 47.5]}, 'categories': {'debtors': [['A101', 'A102'], ['A103']]}, 'untouched': ['Existing_checking_account', 'Savings_account', 'Present_employment_since', 'Property', 'Housing', 'Purpose', 'Personal_status_sex', 'Present_residence', 'Credit_history', 'Telephone', 'Installment_rate', 'Other_installment_plans', 'Number_of_existing_credits', 'Credit_amount', 'N_people_being_liable_provide_maintenance', 'Foreign_worker', 'Age', 'Job']}\n"
+      "Removed feature: Credit_history, new relative accuracy: 0.819444\n",
+      "feature to remove: Other_installment_plans\n",
+      "Removed feature: Other_installment_plans, new relative accuracy: 0.847222\n",
+      "feature to remove: Duration_in_month\n",
+      "Removed feature: Duration_in_month, new relative accuracy: 0.847222\n",
+      "feature to remove: Property\n",
+      "Removed feature: Property, new relative accuracy: 0.847222\n",
+      "feature to remove: Housing\n",
+      "Removed feature: Housing, new relative accuracy: 0.847222\n",
+      "feature to remove: Purpose\n",
+      "Removed feature: Purpose, new relative accuracy: 0.986111\n",
+      "Accuracy on minimized data:  0.6666666666666666\n",
+      "{'ranges': {}, 'categories': {'debtors': [['A103', 'A102'], ['A101']], 'Job': [['A173', 'A174'], ['A171'], ['A172']]}, 'untouched': ['Credit_amount', 'Duration_in_month', 'Credit_history', 'Foreign_worker', 'Housing', 'Other_installment_plans', 'Property', 'N_people_being_liable_provide_maintenance', 'Present_residence', 'Personal_status_sex', 'Telephone', 'Number_of_existing_credits', 'Present_employment_since', 'Existing_checking_account', 'Savings_account', 'Age', 'Purpose', 'Installment_rate']}\n"
     ]
    }
   ],
   "source": [
    "# We allow up to an 8% deviation in accuracy from the original model accuracy (target_accuracy=0.92)\n",
-    "minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.92, features=features,\n",
+    "minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.92, \n",
    "                                     categorical_features=categorical_features, features_to_minimize=QI)\n",
    "\n",
-    "minimizer2.fit(X_generalizer_train, x_train_predictions)\n",
-    "transformed2 = minimizer2.transform(x_test)\n",
+    "minimizer2.fit(X_generalizer_train, x_train_predictions, features_names=features)\n",
+    "transformed2 = minimizer2.transform(x_test, features_names=features)\n",
    "\n",
    "encoded_transformed2 = preprocessor.transform(transformed2)\n",
    "print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test))\n",
    "generalizations2 = minimizer2.generalizations\n",
    "print(generalizations2)"
-   ],
-   "metadata": {
-    "collapsed": false,
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   }
+   ]
  },
  {
   "cell_type": "markdown",
+   "metadata": {},
   "source": [
-    "This time we were able to generalize two features (Duration_in_month and debtors)."
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+    "This time we were able to generalize two features (debtors and Job)."
+   ]
  }
 ],
 "metadata": {
@ -370,16 +353,16 @@
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
-    "version": 2
+    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.6"
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 0
-}
+ "nbformat_minor": 1
+}
--- a/tests/test_anonymizer.py
+++ b/tests/test_anonymizer.py
@ -7,14 +7,14 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.preprocessing import OneHotEncoder

 from apt.anonymization import Anonymize
-from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
+from apt.utils.dataset_utils import get_iris_dataset_np, get_adult_dataset_pd, get_nursery_dataset_pd
 from sklearn.datasets import load_diabetes
 from sklearn.model_selection import train_test_split
-from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
+from apt.utils.datasets import ArrayDataset


 def test_anonymize_ndarray_iris():
-    (x_train, y_train), _ = get_iris_dataset()
+    (x_train, y_train), _ = get_iris_dataset_np()

    model = DecisionTreeClassifier()
    model.fit(x_train, y_train)
@ -31,11 +31,7 @@ def test_anonymize_ndarray_iris():


 def test_anonymize_pandas_adult():
-    (x_train, y_train), _ = get_adult_dataset()
-    encoded = OneHotEncoder().fit_transform(x_train)
-    model = DecisionTreeClassifier()
-    model.fit(encoded, y_train)
-    pred = model.predict(encoded)
+    (x_train, y_train), _ = get_adult_dataset_pd()

    k = 100
    features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
@ -68,8 +64,9 @@ def test_anonymize_pandas_adult():
    assert (anon.loc[:, QI].value_counts().min() >= k)
    np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))

+
 def test_anonymize_pandas_nursery():
-    (x_train, y_train), _ = get_nursery_dataset()
+    (x_train, y_train), _ = get_nursery_dataset_pd()
    x_train = x_train.astype(str)

    k = 100
@ -102,7 +99,6 @@ def test_anonymize_pandas_nursery():


 def test_regression():
-
    dataset = load_diabetes()
    x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)

@ -130,9 +126,9 @@ def test_errors():
    with pytest.raises(ValueError):
        Anonymize(2, None)
    anonymizer = Anonymize(10, [0, 2])
-    (x_train, y_train), (x_test, y_test) = get_iris_dataset()
+    (x_train, y_train), (x_test, y_test) = get_iris_dataset_np()
    with pytest.raises(ValueError):
        anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
-    (x_train, y_train), _ = get_adult_dataset()
+    (x_train, y_train), _ = get_adult_dataset_pd()
    with pytest.raises(ValueError):
        anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@ -9,11 +9,14 @@ from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder

+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Input
+
 from apt.minimization import GeneralizeToRepresentative
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
-from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset
-from apt.utils.datasets import ArrayDataset
-from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor
+from apt.utils.dataset_utils import get_iris_dataset_np, get_adult_dataset_pd, get_german_credit_dataset_pd
+from apt.utils.datasets import ArrayDataset, Data
+from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor, KerasClassifier, BlackboxClassifier


@pytest.fixture
@ -39,7 +42,7 @@ def test_minimizer_params(data):
    y = [1, 1, 0]
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(X, y))

    gen = GeneralizeToRepresentative(model, cells=cells)
@ -63,9 +66,10 @@ def test_minimizer_fit(data):
    y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(X, y))
-    predictions = model.predict(X)
+    ad = ArrayDataset(X)
+    predictions = model.predict(ad)
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

@ -73,26 +77,26 @@ def test_minimizer_fit(data):
    train_dataset = ArrayDataset(X, predictions, features_names=features)

    gen.fit(dataset=train_dataset)
-    transformed = gen.transform(dataset=ArrayDataset(X))
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']}
+    transformed = gen.transform(dataset=ad)
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']}

-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    modified_features = [f for f in features if
-                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
                             'ranges'].keys()]
    indexes = []
    for i in range(len(features)):
        if features[i] in modified_features:
            indexes.append(i)
    assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[indexes]) != (X[indexes])).any())

@ -131,9 +135,9 @@ def test_minimizer_fit_pandas(data):
    encoded = pd.DataFrame(encoded)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(encoded, y))
-    predictions = model.predict(encoded)
+    predictions = model.predict(ArrayDataset(encoded))
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

@ -144,21 +148,22 @@ def test_minimizer_fit_pandas(data):
    train_dataset = ArrayDataset(X, predictions)
    gen.fit(dataset=train_dataset)
    transformed = gen.transform(dataset=ArrayDataset(X))
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['ola', 'height', 'sex']}
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {'age': []}, 'categories': {'sex': [['f', 'm']], 'ola': [['aa', 'bb']]},
+                                'untouched': ['height']}

-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    modified_features = [f for f in features if
-                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
                             'ranges'].keys()]
    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[modified_features]).equals(X[modified_features])) == False)

@ -212,9 +217,9 @@ def test_minimizer_params_categorical(data):
    encoded = pd.DataFrame(encoded)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(encoded, y))
-    predictions = model.predict(encoded)
+    predictions = model.predict(ArrayDataset(encoded))
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)
    # Append classifier to preprocessing pipeline.
@ -244,35 +249,36 @@ def test_minimizer_fit_QI(data):
    QI = ['age', 'weight']
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(X, y))
-    predictions = model.predict(X)
+    ad = ArrayDataset(X)
+    predictions = model.predict(ad)
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

    gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
    train_dataset = ArrayDataset(X, predictions, features_names=features)
    gen.fit(dataset=train_dataset)
-    transformed = gen.transform(dataset=ArrayDataset(X))
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    transformed = gen.transform(dataset=ad)
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
    modified_features = [f for f in features if
-                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
                             'ranges'].keys()]
    indexes = []
    for i in range(len(features)):
        if features[i] in modified_features:
            indexes.append(i)
    assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[indexes]) != (X[indexes])).any())

@ -313,9 +319,9 @@ def test_minimizer_fit_pandas_QI(data):
    encoded = pd.DataFrame(encoded)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(encoded, y))
-    predictions = model.predict(encoded)
+    predictions = model.predict(ArrayDataset(encoded))
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

@ -326,72 +332,72 @@ def test_minimizer_fit_pandas_QI(data):
    train_dataset = ArrayDataset(X, predictions)
    gen.fit(dataset=train_dataset)
    transformed = gen.transform(dataset=ArrayDataset(X))
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
                                'untouched': ['height', 'sex']}

-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    # assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
    np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
    modified_features = [f for f in features if
-                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
                             'ranges'].keys()]
    # assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[modified_features]).equals(X[modified_features])) == False)


 def test_minimize_ndarray_iris():
    features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
-    (x_train, y_train), (x_test, y_test) = get_iris_dataset()
+    (x_train, y_train), (x_test, y_test) = get_iris_dataset_np()
    QI = ['sepal length (cm)', 'petal length (cm)']
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(x_train, y_train))
-    predictions = model.predict(x_train)
+    predictions = model.predict(ArrayDataset(x_train))
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

    gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features_to_minimize=QI)
    # gen.fit(dataset=ArrayDataset(x_train, predictions))
    transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {'sepal length (cm)': [], 'petal length (cm)': [2.449999988079071]},
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {'sepal length (cm)': [], 'petal length (cm)': [2.449999988079071]},
                                'categories': {}, 'untouched': ['petal width (cm)', 'sepal width (cm)']}

-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(x_train, [0, 2], axis=1)).all())

    modified_features = [f for f in features if
-                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
                             'ranges'].keys()]
    indexes = []
    for i in range(len(features)):
        if features[i] in modified_features:
            indexes.append(i)
    assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[indexes]) != (x_train[indexes])).any())


 def test_minimize_pandas_adult():
-    (x_train, y_train), (x_test, y_test) = get_adult_dataset()
+    (x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()
    x_train = x_train.head(1000)
    y_train = y_train.head(1000)

@ -420,9 +426,9 @@ def test_minimize_pandas_adult():
    encoded = pd.DataFrame(encoded)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(encoded, y_train))
-    predictions = model.predict(encoded)
+    predictions = model.predict(ArrayDataset(encoded))
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

@ -430,8 +436,8 @@ def test_minimize_pandas_adult():
                                     categorical_features=categorical_features, features_to_minimize=QI)
    gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
    transformed = gen.transform(dataset=ArrayDataset(x_train))
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {'age': [], 'education-num': []}, 'categories': {
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {'age': [], 'education-num': []}, 'categories': {
        'workclass': [['Self-emp-not-inc', 'Private', 'Federal-gov', 'Self-emp-inc', '?', 'Local-gov', 'State-gov']],
        'marital-status': [
            ['Divorced', 'Married-AF-spouse', 'Married-spouse-absent', 'Widowed', 'Separated', 'Married-civ-spouse',
@ -445,28 +451,28 @@ def test_minimize_pandas_adult():
            ['Euro_1', 'LatinAmerica', 'BritishCommonwealth', 'SouthAmerica', 'UnitedStates', 'China', 'Euro_2',
             'SE_Asia', 'Other', 'Unknown']]}, 'untouched': ['capital-loss', 'hours-per-week', 'capital-gain']}

-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    # assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
    np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))

    modified_features = [f for f in features if
-                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
                             'ranges'].keys()]
    # assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[modified_features]).equals(x_train[modified_features])) == False)


 def test_german_credit_pandas():
-    (x_train, y_train), (x_test, y_test) = get_german_credit_dataset()
+    (x_train, y_train), (x_test, y_test) = get_german_credit_dataset_pd()
    features = ["Existing_checking_account", "Duration_in_month", "Credit_history", "Purpose", "Credit_amount",
                "Savings_account", "Present_employment_since", "Installment_rate", "Personal_status_sex", "debtors",
                "Present_residence", "Property", "Age", "Other_installment_plans", "Housing",
@ -493,9 +499,9 @@ def test_german_credit_pandas():
    encoded = pd.DataFrame(encoded)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(encoded, y_train))
-    predictions = model.predict(encoded)
+    predictions = model.predict(ArrayDataset(encoded))
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

@ -503,8 +509,8 @@ def test_german_credit_pandas():
                                     categorical_features=categorical_features, features_to_minimize=QI)
    gen.fit(dataset=ArrayDataset(x_train, predictions))
    transformed = gen.transform(dataset=ArrayDataset(x_train))
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {'Duration_in_month': [31.5]},
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {'Duration_in_month': [31.5]},
                                'categories': {'Credit_history': [['A30', 'A32', 'A31', 'A34', 'A33']], 'Purpose': [
                                    ['A41', 'A46', 'A43', 'A40', 'A44', 'A410', 'A49', 'A45', 'A48', 'A42']],
                                               'debtors': [['A101', 'A102', 'A103']],
@ -518,22 +524,22 @@ def test_german_credit_pandas():
                                              'Age', 'Existing_checking_account', 'Credit_amount',
                                              'Present_employment_since']}

-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    # assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
    np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))

    modified_features = [f for f in features if
-                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
                             'ranges'].keys()]
    # assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[modified_features]).equals(x_train[modified_features])) == False)

@ -545,7 +551,7 @@ def test_regression():
    base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2)
    model = SklearnRegressor(base_est)
    model.fit(ArrayDataset(x_train, y_train))
-    predictions = model.predict(x_train)
+    predictions = model.predict(ArrayDataset(x_train))
    QI = ['age', 'bmi', 's2', 's5']
    features = ['age', 'sex', 'bmi', 'bp',
                's1', 's2', 's3', 's4', 's5', 's6']
@ -557,8 +563,8 @@ def test_regression():
    print('Base model accuracy (R2 score): ', model.score(ArrayDataset(x_test, y_test)))
    model.fit(ArrayDataset(transformed, y_train))
    print('Base model accuracy (R2 score) after anonymization: ', model.score(ArrayDataset(x_test, y_test)))
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {
        'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
                -0.04728874587453902, -0.04547247663140297, -0.04183994047343731, -0.027309784665703773,
                -0.023677248042076826, -0.020044708624482155, -0.01641217083670199, -0.001882016600575298,
@ -586,24 +592,24 @@ def test_regression():
               0.061315815430134535, 0.06272498145699501, 0.06460387445986271]}, 'categories': {},
        'untouched': ['s5', 's3', 'bp', 's1', 'sex', 's6', 's4']}

-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())

    modified_features = [f for f in features if
-                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
                             'ranges'].keys()]
    indexes = []
    for i in range(len(features)):
        if features[i] in modified_features:
            indexes.append(i)
    assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[indexes]) != (x_train[indexes])).any())

@ -626,34 +632,35 @@ def test_X_y(data):
    QI = [0, 2]
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(X, y))
-    predictions = model.predict(X)
+    ad = ArrayDataset(X)
+    predictions = model.predict(ad)
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

    gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
    gen.fit(X=X, y=predictions)
    transformed = gen.transform(X)
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {'0': [], '2': [67.5]}, 'categories': {}, 'untouched': ['1']}
-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {'0': [], '2': [67.5]}, 'categories': {}, 'untouched': ['1']}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
    modified_features = [f for f in features if
-                         str(f) in expexted_generalizations['categories'].keys() or str(f) in expexted_generalizations[
+                         str(f) in expected_generalizations['categories'].keys() or str(f) in expected_generalizations[
                             'ranges'].keys()]
    indexes = []
    for i in range(len(features)):
        if features[i] in modified_features:
            indexes.append(i)
    assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[indexes]) != (X[indexes])).any())

@ -676,34 +683,35 @@ def test_X_y_features_names(data):
    QI = ['age', 'weight']
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(X, y))
-    predictions = model.predict(X)
+    ad = ArrayDataset(X)
+    predictions = model.predict(ad)
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

    gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
    gen.fit(X=X, y=predictions, features_names=features)
    transformed = gen.transform(X=X, features_names=features)
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
    modified_features = [f for f in features if
-                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
                             'ranges'].keys()]
    indexes = []
    for i in range(len(features)):
        if features[i] in modified_features:
            indexes.append(i)
    assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[indexes]) != (X[indexes])).any())

@ -755,25 +763,25 @@ def test_BaseEstimator_classification(data):
    train_dataset = ArrayDataset(X, predictions)
    gen.fit(dataset=train_dataset)
    transformed = gen.transform(dataset=ArrayDataset(X))
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
                                'untouched': ['height', 'sex']}

-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    # assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
    np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
    modified_features = [f for f in features if
-                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
                             'ranges'].keys()]
    # assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[modified_features]).equals(X[modified_features])) == False)

@ -797,8 +805,8 @@ def test_BaseEstimator_regression():
    print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
    model.fit(transformed, y_train)
    print('Base model accuracy (R2 score) after minimization: ', model.score(x_test, y_test))
-    gener = gen.generalizations_
-    expexted_generalizations = {'ranges': {
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {
        'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
                -0.04728874587453902, -0.04547247663140297, -0.04183994047343731, -0.027309784665703773,
                -0.023677248042076826, -0.020044708624482155, -0.01641217083670199, -0.001882016600575298,
@ -826,23 +834,106 @@ def test_BaseEstimator_regression():
               0.061315815430134535, 0.06272498145699501, 0.06460387445986271]}, 'categories': {},
        'untouched': ['s5', 's3', 'bp', 's1', 'sex', 's6', 's4']}

-    for key in expexted_generalizations['ranges']:
-        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
-    for key in expexted_generalizations['categories']:
-        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
-    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())

    modified_features = [f for f in features if
-                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                         f in expected_generalizations['categories'].keys() or f in expected_generalizations[
                             'ranges'].keys()]
    indexes = []
    for i in range(len(features)):
        if features[i] in modified_features:
            indexes.append(i)
    assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
-    ncp = gen.ncp_
-    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        assert (((transformed[indexes]) != (x_train[indexes])).any())
+
+
+def test_keras_model():
+    """Run GeneralizeToRepresentative against a Keras classifier fitted on iris and check its output."""
+    (X, y), (x_test, y_test) = get_iris_dataset_np()
+
+    base_est = Sequential()
+    base_est.add(Input(shape=(4,)))
+    base_est.add(Dense(10, activation="relu"))
+    base_est.add(Dense(3, activation='softmax'))
+
+    base_est.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
+
+    model = KerasClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    model.fit(ArrayDataset(X, y))
+    ad = ArrayDataset(x_test)
+    predictions = model.predict(ad)
+    # Reduce multi-column model output to one label per sample (argmax over axis 1).
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)
+
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
+    test_dataset = ArrayDataset(x_test, predictions)
+    gen.fit(dataset=test_dataset)
+    transformed = gen.transform(dataset=ad)
+    gener = gen.generalizations
+
+    features = ['0', '1', '2', '3']
+    # Column positions of the features that appear in the learned generalizations.
+    indexes = [i for i, f in enumerate(features) if f in gener['categories'] or f in gener['ranges']]
+    # Every column outside `indexes` must come back from transform() unchanged.
+    assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_test, indexes, axis=1)).all())
+    ncp = gen.ncp
+    if len(gener['ranges'].keys()) > 0 or len(gener['categories'].keys()) > 0:
+        assert (ncp > 0)
+        # `indexes` holds column positions and `transformed` was produced from x_test,
+        # so the generalized columns are compared with the same columns of x_test.
+        assert ((transformed[:, indexes] != x_test[:, indexes]).any())
+
+
+def test_blackbox_model():
+    """Run GeneralizeToRepresentative on a BlackboxClassifier built from iris data and verify the outcome."""
+    (X, y), (x_test, y_test) = get_iris_dataset_np()
+    train_data = ArrayDataset(X, y)
+    test_data = ArrayDataset(x_test, y_test)
+    data = Data(train_data, test_data)
+
+    model = BlackboxClassifier(data, ModelOutputType.CLASSIFIER_PROBABILITIES)
+    ad = ArrayDataset(x_test)
+    predictions = model.predict(ad)
+    # Reduce multi-column model output to one label per sample (argmax over axis 1).
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)
+
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
+    train_dataset = ArrayDataset(x_test, predictions)
+    gen.fit(dataset=train_dataset)
+    transformed = gen.transform(dataset=ad)
+    gener = gen.generalizations
+    expected_generalizations = {'ranges': {'0': [], '1': [], '2': [4.849999904632568, 5.049999952316284],
+                                           '3': [0.7000000029802322, 1.600000023841858]},
+                                'categories': {},
+                                'untouched': []}
+
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+
+    features = ['0', '1', '2', '3']
+    # Column positions of the features that are expected to be generalized.
+    indexes = [i for i, f in enumerate(features)
+               if f in expected_generalizations['categories'] or f in expected_generalizations['ranges']]
+    # Every column outside `indexes` must come back from transform() unchanged.
+    assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_test, indexes, axis=1)).all())
+    ncp = gen.ncp
+    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
+        assert (ncp > 0)
+        # `indexes` holds column positions and `transformed` was produced from x_test,
+        # so the generalized columns are compared with the same columns of x_test.
+        assert ((transformed[:, indexes] != x_test[:, indexes]).any())
--- a/tests/test_model.py
+++ b/tests/test_model.py
@ -1,21 +1,24 @@
 import pytest

-from apt.utils.models import SklearnClassifier, SklearnRegressor, ModelOutputType
-from apt.utils.datasets import ArrayDataset
+from apt.utils.models import SklearnClassifier, SklearnRegressor, ModelOutputType, KerasClassifier, BlackboxClassifier
+from apt.utils.datasets import ArrayDataset, Data
 from apt.utils import dataset_utils

 from sklearn.tree import DecisionTreeRegressor
 from sklearn.ensemble import RandomForestClassifier

+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Input
+

 def test_sklearn_classifier():
-    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset()
+    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()
    underlying_model = RandomForestClassifier()
-    model = SklearnClassifier(underlying_model, ModelOutputType.CLASSIFIER_VECTOR)
+    model = SklearnClassifier(underlying_model, ModelOutputType.CLASSIFIER_PROBABILITIES)
    train = ArrayDataset(x_train, y_train)
    test = ArrayDataset(x_test, y_test)
    model.fit(train)
-    pred = model.predict(x_test)
+    pred = model.predict(test)
    assert(pred.shape[0] == x_test.shape[0])

    score = model.score(test)
@ -23,13 +26,50 @@ def test_sklearn_classifier():


 def test_sklearn_regressor():
-    (x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset()
+    (x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset_np()  # presumably numpy output; confirm
     underlying_model = DecisionTreeRegressor()
     model = SklearnRegressor(underlying_model)
     train = ArrayDataset(x_train, y_train)
     test = ArrayDataset(x_test, y_test)
     model.fit(train)
-    pred = model.predict(x_test)
+    pred = model.predict(test)  # predict() now receives the ArrayDataset wrapper, not the raw x_test array
     assert (pred.shape[0] == x_test.shape[0])
 
     score = model.score(test)
+
+
+def test_keras_classifier():
+    """KerasClassifier wrapper: fit on iris, then check the prediction count and the score bounds."""
+    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()
+
+    network = Sequential([
+        Input(shape=(4,)),
+        Dense(100, activation="relu"),
+        Dense(10, activation="relu"),
+        Dense(3, activation='softmax'),
+    ])
+    network.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
+    wrapped = KerasClassifier(network, ModelOutputType.CLASSIFIER_PROBABILITIES)
+
+    train_set = ArrayDataset(x_train, y_train)
+    test_set = ArrayDataset(x_test, y_test)
+    wrapped.fit(train_set)
+
+    predicted = wrapped.predict(test_set)
+    assert predicted.shape[0] == x_test.shape[0]
+    test_score = wrapped.score(test_set)
+    assert 0.0 <= test_score <= 1.0
+
+
+def test_blackbox_classifier():
+    """BlackboxClassifier wrapper: one prediction row per test sample and a score within [0, 1]."""
+    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset_np()
+    test_set = ArrayDataset(x_test, y_test)
+    # The wrapper is constructed from a Data(train, test) pair rather than from an estimator object.
+    blackbox = BlackboxClassifier(Data(ArrayDataset(x_train, y_train), test_set),
+                                  ModelOutputType.CLASSIFIER_PROBABILITIES)
+
+    predicted = blackbox.predict(test_set)
+    assert predicted.shape[0] == x_test.shape[0]
+    test_score = blackbox.score(test_set)
+    assert 0.0 <= test_score <= 1.0