Wrapper minimizer (#20)

* apply dataset wrapper on minimizer * apply changes on minimization notebook * add black_box_access and unlimited_queries params
2026-06-08 15:05:13 +02:00 · 2022-04-18 13:14:49 +03:00 · 2022-04-18 13:14:49 +03:00 · ac5d82aab6
commit ac5d82aab6
parent 6b04fd5564
6 changed files with 583 additions and 215 deletions
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@ -1,7 +1,7 @@
 """
 This module implements all classes needed to perform data minimization
 """
-from typing import Union
+from typing import Union, Optional
 import pandas as pd
 import numpy as np
 import copy
@ -16,37 +16,32 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.model_selection import train_test_split

+from apt.utils.datasets import ArrayDataset, Data, DATA_PANDAS_NUMPY_TYPE
+from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnClassifier
+

 class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
    """ A transformer that generalizes data to representative points.
-
    Learns data generalizations based on an original model's predictions
    and a target accuracy. Once the generalizations are learned, can
    receive one or more data records and transform them to representative
    points based on the learned generalization.
-
-    An alternative way to use the transformer is to supply ``cells`` and
-    ``features`` in init or set_params and those will be used to transform
+    An alternative way to use the transformer is to supply ``cells`` in
+    init or set_params and those will be used to transform
    data to representatives. In this case, fit must still be called but
    there is no need to supply it with ``X`` and ``y``, and there is no
    need to supply an existing ``estimator`` to init.
-
    In summary, either ``estimator`` and ``target_accuracy`` should be
-    supplied or ``cells`` and ``features`` should be supplied.
-
+    supplied or ``cells`` should be supplied.
    Parameters
    ----------
    estimator : estimator, optional
        The original model for which generalization is being performed.
        Should be pre-fitted.
-
    target_accuracy : float, optional
        The required accuracy when applying the base model to the
        generalized data. Accuracy is measured relative to the original
        accuracy of the model.
-
-    features : list of str, optional
-        The feature names, in the order that they appear in the data.
    categorical_features: list of str, optional
        The list of categorical features should only be supplied when
         passing data as a pandas dataframe.
@ -63,31 +58,33 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        The required method to train data set for minimizing. Default is
        to train the tree just on the features that are given as
        features_to_minimize.
-
    Attributes
    ----------
+    features_ : list of str
+        The feature names, in the order that they appear in the data.
    cells_ : list of object
        The cells used to generalize records, as learned when calling fit.
-
    ncp_ : float
        The NCP (information loss) score of the resulting generalization,
        as measured on the training data.
-
    generalizations_ : object
        The generalizations that were learned (actual feature ranges).
-
    Notes
    -----
-
-
    """

-    def __init__(self, estimator=None, target_accuracy=0.998, features=None,
-                 cells=None, categorical_features=None, features_to_minimize: Union[np.ndarray, list] = None
-                 , train_only_QI=True, is_regression=False):
-        self.estimator = estimator
+    def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: float = 0.998,
+                 cells: list = None, categorical_features: Union[np.ndarray, list] = None,
+                 features_to_minimize: Union[np.ndarray, list] = None, train_only_QI: bool = True,
+                 is_regression: bool = False):
+        # estimator may legitimately be None (only ``cells`` supplied); wrap only raw, non-Model estimators
+        if estimator is None or isinstance(estimator, Model):
+            self.estimator = estimator
+        elif is_regression:
+            self.estimator = SklearnRegressor(estimator)
+        else:
+            self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_VECTOR)
        self.target_accuracy = target_accuracy
-        self.features = features
        self.cells = cells
        self.categorical_features = []
        if categorical_features:
@ -98,13 +95,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM

    def get_params(self, deep=True):
        """Get parameters for this estimator.
-
        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and contained
            subobjects that are estimators.
-
        Returns
        -------
        params : mapping of string to any
@ -113,17 +108,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        ret = {}
        ret['target_accuracy'] = self.target_accuracy
        if deep:
-            ret['features'] = copy.deepcopy(self.features)
            ret['cells'] = copy.deepcopy(self.cells)
            ret['estimator'] = self.estimator
        else:
-            ret['features'] = copy.copy(self.features)
            ret['cells'] = copy.copy(self.cells)
        return ret

    def set_params(self, **params):
        """Set the parameters of this estimator.
-
        Returns
        -------
        self : object
@ -131,8 +123,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        """
        if 'target_accuracy' in params:
            self.target_accuracy = params['target_accuracy']
-        if 'features' in params:
-            self.features = params['features']
        if 'cells' in params:
            self.cells = params['cells']
        return self
@ -141,9 +131,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
    def generalizations(self):
        return self.generalizations_

-    def fit_transform(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None):
+    def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
+                      features_names: Optional = None, dataset: Optional[ArrayDataset] = None):
        """Learns the generalizations based on training data, and applies them to the data.
-
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -151,19 +141,23 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        y : array-like, shape (n_samples,), optional
            The target values. An array of int.
            This should contain the predictions of the original model on ``X``.
-
+        features_names : list of str, optional
+            The feature names, in the order that they appear in the data. Only used when X and y are provided.
+        dataset : ArrayDataset, optional
+            Data wrapper containing the training input samples and the original model's predictions on them.
+        Provide either X and y, or dataset, but not both.
        Returns
        -------
        X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
            The array containing the representative values to which each record in
            ``X`` is mapped.
        """
-        self.fit(X, y)
-        return self.transform(X)
+        self.fit(X, y, features_names, dataset=dataset)
+        return self.transform(X, features_names, dataset=dataset)

-    def fit(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None):
+    def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
+            features_names: Optional = None, dataset: ArrayDataset = None):
        """Learns the generalizations based on training data.
-
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -171,7 +165,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        y : array-like, shape (n_samples,), optional
            The target values. An array of int.
            This should contain the predictions of the original model on ``X``.
-
+        features_names : list of str, optional
+            The feature names, in the order that they appear in the data. Only used when X and y are provided.
+        dataset : ArrayDataset, optional
+            Data wrapper containing the training input samples and the original model's predictions on them.
+        Provide either X and y, or dataset, but not both.
        Returns
        -------
        X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
@ -180,26 +178,25 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        """

        # take into account that estimator, X, y, cells, features may be None
-        if X is not None:
-            if type(X) == np.ndarray:
-                self.is_numpy = True
-            else:
-                self.is_numpy = False
-
        if X is not None and y is not None:
-            if self.is_numpy:
-                X, y = check_X_y(X, y, accept_sparse=True)
-            self.n_features_ = X.shape[1]
-        elif self.features:
-            self.n_features_ = len(self.features)
+            if dataset is not None:
+                raise ValueError('Either X,y OR dataset need to be provided, not both')
+            else:
+                dataset = ArrayDataset(X, y, features_names)
+
+        if dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
+            self.n_features_ = dataset.get_samples().shape[1]
+
+        elif dataset and dataset.features_names:
+            self.n_features_ = len(dataset.features_names)
        else:
            self.n_features_ = 0

-        if self.features:
-            self._features = self.features
+        if dataset and dataset.features_names:
+            self._features = dataset.features_names
        # if features is None, use numbers instead of names
        elif self.n_features_ != 0:
-            self._features = [i for i in range(self.n_features_)]
+            self._features = [str(i) for i in range(self.n_features_)]
        else:
            self._features = None

@ -211,27 +208,24 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM

        # Going to fit
        # (currently not dealing with option to fit with only X and y and no estimator)
-        if self.estimator and X is not None and y is not None:
+        if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
+            x = pd.DataFrame(dataset.get_samples(), columns=self._features)
+            if not self.features_to_minimize:
+                self.features_to_minimize = self._features
+            self.features_to_minimize = [str(i) for i in self.features_to_minimize]
+            if not all(elem in self._features for elem in self.features_to_minimize):
+                raise ValueError('features to minimize should be a subset of features names')
+            x_QI = x.loc[:, self.features_to_minimize]

-            if self.is_numpy:
-                if not self.features_to_minimize:
-                    self.features_to_minimize = [i for i in range(len(self._features))]
-                x_QI = X[:, self.features_to_minimize]
-                self.features_to_minimize = [self._features[i] for i in self.features_to_minimize]
-                X = pd.DataFrame(X, columns=self._features)
-            else:
-                if not self.features_to_minimize:
-                    self.features_to_minimize = self._features
-                x_QI = X.loc[:, self.features_to_minimize]
-            x_QI = pd.DataFrame(x_QI, columns=self.features_to_minimize)
            # divide dataset into train and test
-            used_data = X
+            used_data = x
            if self.train_only_QI:
                used_data = x_QI
            if self.is_regression:
-                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=14)
+                X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4, random_state=14)
            else:
-                X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.4, random_state=18)
+                X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), stratify=dataset.get_labels(), test_size=0.4,
+                                                                    random_state=18)

            X_train_QI = X_train.loc[:, self.features_to_minimize]
            X_test_QI = X_test.loc[:, self.features_to_minimize]
@ -245,7 +239,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            for feature in self._features:
                if feature not in feature_data.keys():
                    fd = {}
-                    values = list(X.loc[:, feature])
+                    values = list(x.loc[:, feature])
                    if feature not in self.categorical_features:
                        fd['min'] = min(values)
                        fd['max'] = max(values)
@ -258,7 +252,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            categorical_features = [f for f in self._features if f in self.categorical_features and
                                    f in self.features_to_minimize]

-
            numeric_transformer = Pipeline(
                steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
            )
@ -287,7 +280,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                    ("cat", categorical_transformer, self.categorical_features),
                ]
            )
-            preprocessor.fit(X)
+            preprocessor.fit(x)
            x_prepared = preprocessor.transform(X_train)
            if self.train_only_QI:
                x_prepared = preprocessor_QI_features.transform(X_train_QI)
@ -299,7 +292,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                self.dt_ = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1)
            else:
                self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2,
-                                              min_samples_leaf=1)
+                                                  min_samples_leaf=1)
            self.dt_.fit(x_prepared, y_train)
            self._modify_categorical_features(used_data)

@ -328,7 +321,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)

            # check accuracy
-            accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
+            accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
            print('Initial accuracy of model on generalized data, relative to original model predictions '
                  '(base generalization derived from tree, before improvements): %f' % accuracy)

@ -348,7 +341,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                        self._calculate_generalizations()
                        generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_,
                                                       self.cells_by_id_)
-                        accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
+                        accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
                        # if accuracy passed threshold roll back to previous iteration generalizations
                        if accuracy < self.target_accuracy:
                            self.cells_ = cells_previous_iter
@ -374,7 +367,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM

                    self._calculate_generalizations()
                    generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
-                    accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
+                    accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
                    print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))

            # self.cells_ currently holds the chosen generalization based on target accuracy
@ -385,14 +378,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        # Return the transformer
        return self

-    def transform(self, X: Union[np.ndarray, pd.DataFrame]):
+    def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None):
        """ Transforms data records to representative points.
-
        Parameters
        ----------
        X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe,
         may contain both numeric and categorical data.
            The input samples.
+        features_names : list of str, optional
+            The feature names, in the order that they appear in the data. Only used when X is provided.
+        dataset : ArrayDataset, optional
+            Data wrapper containing the input samples. Provide either X or dataset, but not both.
        Returns
        -------
        X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
@ -404,26 +400,30 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        msg = 'This %(name)s instance is not initialized yet. ' \
              'Call ‘fit’ or ‘set_params’ with ' \
              'appropriate arguments before using this method.'
-        check_is_fitted(self, ['cells', 'features'], msg=msg)
+        check_is_fitted(self, ['cells'], msg=msg)

-        if type(X) == np.ndarray:
-            # Input validation
-            X = check_array(X, accept_sparse=True)
-            self.is_numpy = True
-            X = pd.DataFrame(X, columns=self._features)
-        else:
-            self.is_numpy = False
+        if (X is None) == (dataset is None):
+            # covers both misuse cases: both supplied, or neither (previously reported as "not both")
+            raise ValueError('Exactly one of X OR dataset needs to be provided')
+        if dataset is None:
+            dataset = ArrayDataset(X, features_names=features_names)
+        if dataset.get_samples() is None:
+            # without samples there is nothing to build ``x`` from; fail clearly instead of a NameError
+            raise ValueError('The supplied dataset does not contain any samples')
+        if dataset.features_names:
+            self._features = dataset.features_names
+        x = pd.DataFrame(dataset.get_samples(), columns=self._features)

-        if X.shape[1] != self.n_features_ and self.n_features_ != 0:
+        if x.shape[1] != self.n_features_ and self.n_features_ != 0:
            raise ValueError('Shape of input is different from what was seen'
                             'in `fit`')

        if not self._features:
-            self._features = [i for i in range(X.shape[1])]
+            self._features = [i for i in range(x.shape[1])]

        representatives = pd.DataFrame(columns=self._features)  # only columns
-        generalized = pd.DataFrame(X, columns=self._features, copy=True)  # original data
-        mapped = np.zeros(X.shape[0])  # to mark records we already mapped
+        generalized = pd.DataFrame(x, columns=self._features, copy=True)  # original data
+        mapped = np.zeros(x.shape[0])  # to mark records we already mapped

        # iterate over cells (leaves in decision tree)
        for i in range(len(self.cells_)):
@ -442,7 +442,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                    representatives = representatives.drop(feature, axis=1)

            # get the indexes of all records that map to this cell
-            indexes = self._get_record_indexes_for_cell(X, self.cells_[i], mapped)
+            indexes = self._get_record_indexes_for_cell(x, self.cells_[i], mapped)

            # replace the values in the representative columns with the representative
            # values (leaves others untouched)
@ -453,9 +453,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                    replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
                replace.index = indexes
                generalized.loc[indexes, representatives.columns] = replace
-        if self.is_numpy:
-            return generalized.to_numpy()
-        return generalized
+        if dataset and dataset.is_pandas:
+            return generalized
+        elif isinstance(X, pd.DataFrame):
+            return generalized
+        return generalized.to_numpy()

    def _get_record_indexes_for_cell(self, X, cell, mapped):
        indexes = []
@ -639,7 +641,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            # else: nothing to do, stay with previous cells

    def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
-        new_cell['hist'] = [x + y for x, y in zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
+        new_cell['hist'] = [x + y for x, y in
+                            zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
        new_cell['label'] = int(self.dt_.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1

    def _get_nodes_level(self, level):
@ -796,8 +799,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                    cells_by_id = copy.deepcopy(self.cells_by_id_)
                    GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
                    generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
-                    accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized),
-                                                         labels) - current_accuracy
+                    accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
+                                                                      labels)) - current_accuracy
                    if accuracy_gain < 0:
                        accuracy_gain = 0
                    if accuracy_gain != 0:
@ -819,8 +822,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                    cells_by_id = copy.deepcopy(self.cells_by_id_)
                    GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
                    generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
-                    accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized),
-                                                         labels) - current_accuracy
+                    accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
+                                                                      labels)) - current_accuracy

                    if accuracy_gain < 0:
                        accuracy_gain = 0
--- a/apt/utils/datasets/datasets.py
+++ b/apt/utils/datasets/datasets.py
@ -180,7 +180,7 @@ class ArrayDataset(Dataset):
        if self.is_pandas:
            if features_names and not np.array_equal(features_names, x.columns):
                raise ValueError("The supplied features are not the same as in the data features")
-            self.features_names = x.columns
+            self.features_names = x.columns.to_list()

        if y is not None and len(self._x) != len(self._y):
            raise ValueError('Non equivalent lengths of x and y')
--- a/apt/utils/models/model.py
+++ b/apt/utils/models/model.py
@ -1,5 +1,5 @@
 from abc import ABCMeta, abstractmethod
-from typing import Any
+from typing import Any, Optional
 from enum import Enum, auto

 from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
@ -8,7 +8,7 @@ from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
 class ModelOutputType(Enum):
    CLASSIFIER_VECTOR = auto()  # probabilities or logits
    CLASSIFIER_SCALAR = auto()  # label only
-    REGRESSOR_SCALAR = auto()   # value
+    REGRESSOR_SCALAR = auto()  # value


 class Model(metaclass=ABCMeta):
@ -16,16 +16,26 @@ class Model(metaclass=ABCMeta):
    Abstract base class for ML model wrappers.
    """

-    def __init__(self, model: Any, output_type: ModelOutputType, **kwargs):
+    def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
+                 unlimited_queries: Optional[bool] = True, **kwargs):
        """
        Initialize a `Model` wrapper object.

        :param model: The original model object (of the underlying ML framework)
        :param output_type: The type of output the model yields (vector/label only for classifiers,
                            value for regressors)
+        :param black_box_access: Boolean describing the type of deployment of the model (when in production).
+                                 Set to True if the model is only available via query (API) access, i.e.,
+                                 only the outputs of the model are exposed, and False if the model internals
+                                 are also available. Optional, Default is True.
+        :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
+                                  unlimited queries to the model API or whether there is a limit to the number of
+                                  queries that can be submitted. Optional, Default is True.
        """
        self._model = model
        self._output_type = output_type
+        self._black_box_access = black_box_access
+        self._unlimited_queries = unlimited_queries

    @abstractmethod
    def fit(self, train_data: Dataset, **kwargs) -> None:
@ -48,6 +58,16 @@ class Model(metaclass=ABCMeta):
        """
        raise NotImplementedError

+    @abstractmethod
+    def score(self, test_data: Dataset, **kwargs):
+        """
+        Score the model using test data. Concrete wrappers must override this method.
+
+        :param test_data: Test data.
+        :type test_data: `Dataset`
+        """
+        raise NotImplementedError
+
    @property
    def model(self) -> Any:
        """
@ -65,3 +85,25 @@ class Model(metaclass=ABCMeta):
        :return: The model's output type.
        """
        return self._output_type
+
+    @property
+    def black_box_access(self) -> Any:
+        """
+        Whether the model is only available via query (API) access.
+
+        :return: The ``black_box_access`` value given at construction: True if only the outputs of the
+                 model are exposed, False if the model internals are also available.
+        """
+        # read-only view of the flag stored by ``__init__``
+        return self._black_box_access
+
+    @property
+    def unlimited_queries(self) -> Any:
+        """
+        Whether unlimited queries to the model API are allowed (relevant when black_box_access is True).
+
+        :return: The ``unlimited_queries`` value given at construction: True if a user can perform
+                 unlimited queries, False if the number of queries that can be submitted is limited.
+        """
+        # read-only view of the flag stored by ``__init__``
+        return self._unlimited_queries
--- a/apt/utils/models/sklearn_model.py
+++ b/apt/utils/models/sklearn_model.py
@ -1,3 +1,5 @@
+from typing import Optional
+
 import numpy as np

 from sklearn.preprocessing import OneHotEncoder
@ -28,13 +30,23 @@ class SklearnClassifier(SklearnModel):
    """
    Wrapper class for scikitlearn classification models.
    """
-    def __init__(self, model: BaseEstimator, output_type: ModelOutputType, **kwargs):
+    def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
+                 unlimited_queries: Optional[bool] = True, **kwargs):
        """
        Initialize a `SklearnClassifier` wrapper object.

-        :param model: The original sklearn model object
+        :param model: The original sklearn model object.
+        :param output_type: The type of output the model yields (vector/label only for classifiers,
+                            value for regressors)
+        :param black_box_access: Boolean describing the type of deployment of the model (when in production).
+                                 Set to True if the model is only available via query (API) access, i.e.,
+                                 only the outputs of the model are exposed, and False if the model internals
+                                 are also available. Optional, Default is True.
+        :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
+                                  unlimited queries to the model API or whether there is a limit to the number of
+                                  queries that can be submitted. Optional, Default is True.
        """
-        super().__init__(model, output_type, **kwargs)
+        super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
        self._art_model = ArtSklearnClassifier(model)

    def fit(self, train_data: Dataset, **kwargs) -> None:
@ -63,13 +75,21 @@ class SklearnRegressor(SklearnModel):
    """
    Wrapper class for scikitlearn regression models.
    """
-    def __init__(self, model: BaseEstimator, **kwargs):
+    def __init__(self, model: BaseEstimator, black_box_access: Optional[bool] = True,
+                 unlimited_queries: Optional[bool] = True, **kwargs):
        """
        Initialize a `SklearnRegressor` wrapper object.

-        :param model: The original sklearn model object
+        :param model: The original sklearn model object.
+        :param black_box_access: Boolean describing the type of deployment of the model (when in production).
+                                 Set to True if the model is only available via query (API) access, i.e.,
+                                 only the outputs of the model are exposed, and False if the model internals
+                                 are also available. Optional, Default is True.
+        :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
+                                  unlimited queries to the model API or whether there is a limit to the number of
+                                  queries that can be submitted. Optional, Default is True.
        """
-        super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, **kwargs)
+        super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs)
        self._art_model = ScikitlearnRegressor(model)

    def fit(self, train_data: Dataset, **kwargs) -> None:
--- a/notebooks/minimization_adult.ipynb
+++ b/notebooks/minimization_adult.ipynb
@ -27,7 +27,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
@ -42,6 +42,18 @@
      " [2.2000e+01 9.0000e+00 0.0000e+00 0.0000e+00 2.0000e+01]\n",
      " [5.2000e+01 9.0000e+00 1.5024e+04 0.0000e+00 4.0000e+01]]\n"
     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:22: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+      "  y_train = y_train.astype(np.int)\n",
+      "/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+      "  y_test = y_test.astype(np.int)\n"
+     ]
    }
   ],
   "source": [
@ -84,24 +96,27 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Base model accuracy:  0.8189914624408821\n"
+      "Base model accuracy:  0.8183158282660771\n"
     ]
    }
   ],
   "source": [
+    "from apt.utils.datasets import ArrayDataset\n",
+    "from apt.utils.models import SklearnClassifier, ModelOutputType\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
-    "model = DecisionTreeClassifier()\n",
-    "model.fit(x_train, y_train)\n",
+    "base_est = DecisionTreeClassifier()\n",
+    "model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)\n",
+    "model.fit(ArrayDataset(x_train, y_train))\n",
    "\n",
-    "print('Base model accuracy: ', model.score(x_test, y_test))"
+    "print('Base model accuracy: ', model.score(ArrayDataset(x_test, y_test)))"
   ]
  },
  {
@ -114,26 +129,26 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
+      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
      "Improving accuracy\n",
-      "feature to remove: 0\n",
-      "Removed feature: 0, new relative accuracy: 0.939867\n",
-      "feature to remove: 4\n",
-      "Removed feature: 4, new relative accuracy: 0.967247\n",
      "feature to remove: 2\n",
-      "Removed feature: 2, new relative accuracy: 0.972620\n",
+      "Removed feature: 2, new relative accuracy: 0.935261\n",
+      "feature to remove: 4\n",
+      "Removed feature: 4, new relative accuracy: 0.946776\n",
+      "feature to remove: 0\n",
+      "Removed feature: 0, new relative accuracy: 0.972876\n",
      "feature to remove: 1\n",
-      "Removed feature: 1, new relative accuracy: 0.992323\n",
+      "Removed feature: 1, new relative accuracy: 0.992835\n",
      "feature to remove: 3\n",
      "Removed feature: 3, new relative accuracy: 1.000000\n",
-      "Accuracy on minimized data:  0.8237371411024106\n"
+      "Accuracy on minimized data:  0.8231229847996315\n"
     ]
    }
   ],
@ -155,10 +170,12 @@
    "X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
    "                                                                test_size = 0.4, random_state = 38)\n",
    "x_train_predictions = model.predict(X_generalizer_train)\n",
-    "minimizer.fit(X_generalizer_train, x_train_predictions)\n",
-    "transformed = minimizer.transform(x_test)\n",
+    "if x_train_predictions.shape[1] > 1:\n",
+    "    x_train_predictions = np.argmax(x_train_predictions, axis=1)\n",
+    "minimizer.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
+    "transformed = minimizer.transform(dataset=ArrayDataset(x_test))\n",
    "\n",
-    "print('Accuracy on minimized data: ', model.score(transformed, y_test))"
+    "print('Accuracy on minimized data: ', model.score(ArrayDataset(transformed, y_test)))"
   ]
  },
  {
@ -170,14 +187,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "{'ranges': {}, 'untouched': [0, 1, 2, 3, 4]}\n"
+      "{'ranges': {}, 'categories': {}, 'untouched': ['4', '1', '3', '0', '2']}\n"
     ]
    }
   ],
@ -197,25 +214,25 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
+      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
      "Improving accuracy\n",
-      "feature to remove: 0\n",
-      "Removed feature: 0, new relative accuracy: 0.939867\n",
-      "feature to remove: 4\n",
-      "Removed feature: 4, new relative accuracy: 0.967247\n",
      "feature to remove: 2\n",
-      "Removed feature: 2, new relative accuracy: 0.972620\n",
+      "Removed feature: 2, new relative accuracy: 0.935261\n",
+      "feature to remove: 4\n",
+      "Removed feature: 4, new relative accuracy: 0.946776\n",
+      "feature to remove: 0\n",
+      "Removed feature: 0, new relative accuracy: 0.972876\n",
      "feature to remove: 1\n",
-      "Removed feature: 1, new relative accuracy: 0.992323\n",
-      "Accuracy on minimized data:  0.820205742361431\n",
-      "{'ranges': {3: [546.0, 704.0, 705.5, 742.5, 782.0, 834.0, 870.0, 1446.5, 1538.5, 1612.5, 1699.0, 1744.0, 1801.0, 1814.0, 1846.0, 1881.5, 1978.5, 2248.0, 2298.5, 2537.5]}, 'untouched': [0, 1, 2, 4]}\n"
+      "Removed feature: 1, new relative accuracy: 0.992835\n",
+      "Accuracy on minimized data:  0.8192845079072624\n",
+      "{'ranges': {'3': [569.0, 782.0, 870.0, 870.5, 938.0, 1016.5, 1311.5, 1457.0, 1494.5, 1596.0, 1629.5, 1684.0, 1805.0, 1859.0, 1867.5, 1881.5, 1938.0, 1978.5, 2119.0, 2210.0, 2218.0, 2244.5, 2298.5, 2443.5]}, 'categories': {}, 'untouched': ['2', '1', '0', '4']}\n"
     ]
    }
   ],
@ -223,9 +240,9 @@
    "# We allow a 1% deviation in accuracy from the original model accuracy\n",
    "minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.99)\n",
    "\n",
-    "minimizer2.fit(X_generalizer_train, x_train_predictions)\n",
-    "transformed2 = minimizer2.transform(x_test)\n",
-    "print('Accuracy on minimized data: ', model.score(transformed2, y_test))\n",
+    "minimizer2.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
+    "transformed2 = minimizer2.transform(dataset=ArrayDataset(x_test))\n",
+    "print('Accuracy on minimized data: ', model.score(test_data=ArrayDataset(transformed2, y_test)))\n",
    "generalizations2 = minimizer2.generalizations\n",
    "print(generalizations2)"
   ]
@ -259,4 +276,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 2
-}
+}
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@ -5,14 +5,15 @@ from sklearn.compose import ColumnTransformer

 from sklearn.datasets import load_boston, load_diabetes
 from sklearn.impute import SimpleImputer
-from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.preprocessing import OneHotEncoder

 from apt.minimization import GeneralizeToRepresentative
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset
+from apt.utils.datasets import ArrayDataset
+from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor


@pytest.fixture
@ -38,11 +39,12 @@ def test_minimizer_params(data):
    y = [1, 1, 0]
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    base_est.fit(X, y)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model.fit(ArrayDataset(X, y))

-    gen = GeneralizeToRepresentative(base_est, features=features, cells=cells)
+    gen = GeneralizeToRepresentative(model, cells=cells)
    gen.fit()
-    transformed = gen.transform(X)
+    transformed = gen.transform(dataset=ArrayDataset(X, features_names=features))


 def test_minimizer_fit(data):
@ -58,15 +60,20 @@ def test_minimizer_fit(data):
                  [69, 175],
                  [24, 181],
                  [18, 190]])
-    y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
+    y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    base_est.fit(X, y)
-    predictions = base_est.predict(X)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model.fit(ArrayDataset(X, y))
+    predictions = model.predict(X)
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)

-    gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5)
-    gen.fit(X, predictions)
-    transformed = gen.transform(X)
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
+    train_dataset = ArrayDataset(X, predictions, features_names=features)
+
+    gen.fit(dataset=train_dataset)
+    transformed = gen.transform(dataset=ArrayDataset(X))
    gener = gen.generalizations_
    expexted_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']}

@ -103,7 +110,7 @@ def test_minimizer_fit_pandas(data):
         [69, 175, 'm', 'aa'],
         [24, 181, 'm', 'bb'],
         [18, 190, 'm', 'bb']]
-    y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
+    y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    X = pd.DataFrame(X, columns=features)

    numeric_features = ["age", "height"]
@ -121,16 +128,22 @@ def test_minimizer_fit_pandas(data):
        ]
    )
    encoded = preprocessor.fit_transform(X)
+    encoded = pd.DataFrame(encoded)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    base_est.fit(encoded, y)
-    predictions = base_est.predict(encoded)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model.fit(ArrayDataset(encoded, y))
+    predictions = model.predict(encoded)
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)
+
    # Build the minimizer around the model that was fitted on the encoded data;
    # the minimizer itself is fitted below on the raw (un-encoded) DataFrame X.
-    gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
                                     categorical_features=categorical_features)
-    gen.fit(X, predictions)
-    transformed = gen.transform(X)
+    train_dataset = ArrayDataset(X, predictions)
+    gen.fit(dataset=train_dataset)
+    transformed = gen.transform(dataset=ArrayDataset(X))
    gener = gen.generalizations_
    expexted_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['ola', 'height', 'sex']}

@ -143,7 +156,7 @@ def test_minimizer_fit_pandas(data):
    modified_features = [f for f in features if
                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
                             'ranges'].keys()]
-    assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
+    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
    ncp = gen.ncp_
    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
@ -179,7 +192,7 @@ def test_minimizer_params_categorical(data):
         [24, 181, 'm'],
         [18, 190, 'm']]

-    y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
+    y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    X = pd.DataFrame(X, columns=features)
    numeric_features = ["age", "height"]
    numeric_transformer = Pipeline(
@ -196,16 +209,21 @@ def test_minimizer_params_categorical(data):
        ]
    )
    encoded = preprocessor.fit_transform(X)
+    encoded = pd.DataFrame(encoded)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    base_est.fit(encoded, y)
-    predictions = base_est.predict(encoded)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model.fit(ArrayDataset(encoded, y))
+    predictions = model.predict(encoded)
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)
    # Build the minimizer around the model that was fitted on the encoded data;
    # the minimizer itself is fitted below on the raw (un-encoded) DataFrame X.
-    gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
                                     categorical_features=categorical_features, cells=cells)
-    gen.fit(X, predictions)
-    transformed = gen.transform(X)
+    train_dataset = ArrayDataset(X, predictions)
+    gen.fit(dataset=train_dataset)
+    transformed = gen.transform(dataset=ArrayDataset(X))


 def test_minimizer_fit_QI(data):
@ -222,16 +240,20 @@ def test_minimizer_fit_QI(data):
                  [24, 181, 95],
                  [18, 190, 102]])
    print(X)
-    y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
-    QI = [0, 2]
+    y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+    QI = ['age', 'weight']
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    base_est.fit(X, y)
-    predictions = base_est.predict(X)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model.fit(ArrayDataset(X, y))
+    predictions = model.predict(X)
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)

-    gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5, features_to_minimize=QI)
-    gen.fit(X, predictions)
-    transformed = gen.transform(X)
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
+    train_dataset = ArrayDataset(X, predictions, features_names=features)
+    gen.fit(dataset=train_dataset)
+    transformed = gen.transform(dataset=ArrayDataset(X))
    gener = gen.generalizations_
    expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
    for key in expexted_generalizations['ranges']:
@ -240,7 +262,7 @@ def test_minimizer_fit_QI(data):
        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
-    assert ((np.delete(transformed, QI, axis=1) == np.delete(X, QI, axis=1)).all())
+    assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
    modified_features = [f for f in features if
                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
                             'ranges'].keys()]
@ -269,7 +291,7 @@ def test_minimizer_fit_pandas_QI(data):
         [24, 181, 49, 'm', 'bb'],
         [18, 190, 69, 'm', 'bb']]

-    y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
+    y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    X = pd.DataFrame(X, columns=features)
    QI = ['age', 'weight', 'ola']

@ -288,16 +310,22 @@ def test_minimizer_fit_pandas_QI(data):
        ]
    )
    encoded = preprocessor.fit_transform(X)
+    encoded = pd.DataFrame(encoded)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    base_est.fit(encoded, y)
-    predictions = base_est.predict(encoded)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model.fit(ArrayDataset(encoded, y))
+    predictions = model.predict(encoded)
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)
+
    # Build the minimizer around the model that was fitted on the encoded data;
    # the minimizer itself is fitted below on the raw (un-encoded) DataFrame X.
-    gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
                                     categorical_features=categorical_features, features_to_minimize=QI)
-    gen.fit(X, predictions)
-    transformed = gen.transform(X)
+    train_dataset = ArrayDataset(X, predictions)
+    gen.fit(dataset=train_dataset)
+    transformed = gen.transform(dataset=ArrayDataset(X))
    gener = gen.generalizations_
    expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
                                'untouched': ['height', 'sex']}
@ -308,12 +336,13 @@ def test_minimizer_fit_pandas_QI(data):
        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
-    assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
-
+    # assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
+    np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
    modified_features = [f for f in features if
                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
                             'ranges'].keys()]
-    assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
+    # assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
+    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
    ncp = gen.ncp_
    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
@ -322,16 +351,19 @@ def test_minimizer_fit_pandas_QI(data):

 def test_minimize_ndarray_iris():
    features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
-    (x_train, y_train), _ = get_iris_dataset()
-    QI = [0, 2]
-    model = DecisionTreeClassifier(random_state=0, min_samples_split=2,
-                                   min_samples_leaf=1)
-    model.fit(x_train, y_train)
-    pred = model.predict(x_train)
+    (x_train, y_train), (x_test, y_test) = get_iris_dataset()
+    QI = ['sepal length (cm)', 'petal length (cm)']
+    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                      min_samples_leaf=1)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model.fit(ArrayDataset(x_train, y_train))
+    predictions = model.predict(x_train)
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)

-    gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features=features, features_to_minimize=QI)
-    gen.fit(x_train, pred)
-    transformed = gen.transform(x_train)
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features_to_minimize=QI)
+    # gen.fit(dataset=ArrayDataset(x_train, predictions))
+    transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
    gener = gen.generalizations_
    expexted_generalizations = {'ranges': {'sepal length (cm)': [], 'petal length (cm)': [2.449999988079071]},
                                'categories': {}, 'untouched': ['petal width (cm)', 'sepal width (cm)']}
@ -342,7 +374,7 @@ def test_minimize_ndarray_iris():
        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
-    assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
+    assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(x_train, [0, 2], axis=1)).all())

    modified_features = [f for f in features if
                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
@ -359,12 +391,13 @@ def test_minimize_ndarray_iris():


 def test_minimize_pandas_adult():
-    (x_train, y_train), _ = get_adult_dataset()
+    (x_train, y_train), (x_test, y_test) = get_adult_dataset()
    x_train = x_train.head(1000)
    y_train = y_train.head(1000)

    features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
+    x_train = pd.DataFrame(x_train, columns=features)

    categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
                            'hours-per-week', 'native-country']
@ -384,15 +417,19 @@ def test_minimize_pandas_adult():
        ]
    )
    encoded = preprocessor.fit_transform(x_train)
+    encoded = pd.DataFrame(encoded)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    base_est.fit(encoded, y_train)
-    predictions = base_est.predict(encoded)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model.fit(ArrayDataset(encoded, y_train))
+    predictions = model.predict(encoded)
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)

-    gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features,
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
                                     categorical_features=categorical_features, features_to_minimize=QI)
-    gen.fit(x_train, predictions)
-    transformed = gen.transform(x_train)
+    gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
+    transformed = gen.transform(dataset=ArrayDataset(x_train))
    gener = gen.generalizations_
    expexted_generalizations = {'ranges': {'age': [], 'education-num': []}, 'categories': {
        'workclass': [['Self-emp-not-inc', 'Private', 'Federal-gov', 'Self-emp-inc', '?', 'Local-gov', 'State-gov']],
@ -414,12 +451,14 @@ def test_minimize_pandas_adult():
        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
-    assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
+    # assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
+    np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))

    modified_features = [f for f in features if
                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
                             'ranges'].keys()]
-    assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
+    # assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
+    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
    ncp = gen.ncp_
    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
@ -451,15 +490,19 @@ def test_german_credit_pandas():
        ]
    )
    encoded = preprocessor.fit_transform(x_train)
+    encoded = pd.DataFrame(encoded)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
-    base_est.fit(encoded, y_train)
-    predictions = base_est.predict(encoded)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model.fit(ArrayDataset(encoded, y_train))
+    predictions = model.predict(encoded)
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)

-    gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features,
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
                                     categorical_features=categorical_features, features_to_minimize=QI)
-    gen.fit(x_train, predictions)
-    transformed = gen.transform(x_train)
+    gen.fit(dataset=ArrayDataset(x_train, predictions))
+    transformed = gen.transform(dataset=ArrayDataset(x_train))
    gener = gen.generalizations_
    expexted_generalizations = {'ranges': {'Duration_in_month': [31.5]},
                                'categories': {'Credit_history': [['A30', 'A32', 'A31', 'A34', 'A33']], 'Purpose': [
@ -481,12 +524,14 @@ def test_german_credit_pandas():
        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
-    assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
+    # assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
+    np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))

    modified_features = [f for f in features if
                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
                             'ranges'].keys()]
-    assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
+    # assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
+    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
    ncp = gen.ncp_
    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
@ -497,17 +542,258 @@ def test_regression():
    dataset = load_diabetes()
    x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)

-    model = DecisionTreeRegressor(random_state=10, min_samples_split=2)
-    model.fit(x_train, y_train)
-    pred = model.predict(x_train)
-    QI = [0, 2, 5, 8]
+    base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2)
+    model = SklearnRegressor(base_est)
+    model.fit(ArrayDataset(x_train, y_train))
+    predictions = model.predict(x_train)
+    QI = ['age', 'bmi', 's2', 's5']
    features = ['age', 'sex', 'bmi', 'bp',
                's1', 's2', 's3', 's4', 's5', 's6']

-    gen = GeneralizeToRepresentative(model, target_accuracy=0.7, features=features, is_regression=True,
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
                                     features_to_minimize=QI)
-    gen.fit(x_train, pred)
-    transformed = gen.transform(x_train)
+    gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
+    transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
+    print('Base model accuracy (R2 score): ', model.score(ArrayDataset(x_test, y_test)))
+    model.fit(ArrayDataset(transformed, y_train))
+    print('Base model accuracy (R2 score) after anonymization: ', model.score(ArrayDataset(x_test, y_test)))
+    gener = gen.generalizations_
+    expexted_generalizations = {'ranges': {
+        'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
+                -0.04728874587453902, -0.04547247663140297, -0.04183994047343731, -0.027309784665703773,
+                -0.023677248042076826, -0.020044708624482155, -0.01641217083670199, -0.001882016600575298,
+                0.0017505218856967986, 0.0035667913616634905, 0.007199329789727926, 0.010831868276000023,
+                0.02354575227946043, 0.030810829252004623, 0.03262709779664874, 0.03444336913526058,
+                0.03625963814556599, 0.03807590529322624, 0.03807590715587139, 0.047157252207398415,
+                0.06168740428984165, 0.0635036751627922, 0.06895248219370842, 0.07258502021431923, 0.07621755823493004,
+                0.1034616008400917],
+        'bmi': [-0.07626373693346977, -0.060635464265942574, -0.056863121688365936, -0.05578530766069889,
+                -0.054168591275811195, -0.042312657460570335, -0.0374625027179718, -0.03422906715422869,
+                -0.033690162003040314, -0.03261234890669584, -0.02614547684788704, -0.025067666545510292,
+                -0.022373135201632977, -0.016984074376523495, -0.01375063881278038, -0.007822672137990594,
+                -0.004589236050378531, 0.008344509289599955, 0.015889193629845977, 0.016967005096375942,
+                0.024511689320206642, 0.0272062208969146, 0.030978563241660595, 0.032595280557870865,
+                0.033673093654215336, 0.04391230642795563, 0.04552902653813362, 0.05469042807817459,
+                0.06977979838848114, 0.07301323488354683, 0.09349166229367256],
+        's2': [-0.1044962927699089, -0.08649025857448578, -0.07740895450115204, -0.07114598527550697,
+               -0.06378699466586113, -0.05971606448292732, -0.04437179118394852, -0.0398311372846365,
+               -0.03137612994760275, -0.022138250060379505, -0.018067320343106985, -0.017910746857523918,
+               -0.017910745926201344, -0.01618842873722315, -0.007576846517622471, -0.007263698382303119,
+               -0.0010007291566580534, 0.0010347360512241721, 0.006514834007248282, 0.00933317095041275,
+               0.012464655097573996, 0.019197346206055954, 0.020919663831591606, 0.02217225730419159,
+               0.032036433927714825, 0.036420512944459915, 0.04080459102988243, 0.04127431474626064,
+               0.04268348217010498, 0.04424922354519367, 0.04424922540783882, 0.056462014093995094, 0.05928034894168377,
+               0.061315815430134535, 0.06272498145699501, 0.06460387445986271]}, 'categories': {},
+        'untouched': ['s5', 's3', 'bp', 's1', 'sex', 's6', 's4']}
+
+    for key in expexted_generalizations['ranges']:
+        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expexted_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+    assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())
+
+    modified_features = [f for f in features if
+                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                             'ranges'].keys()]
+    indexes = []
+    for i in range(len(features)):
+        if features[i] in modified_features:
+            indexes.append(i)
+    assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
+    ncp = gen.ncp_
+    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+        assert (ncp > 0)
+        assert (((transformed[indexes]) != (x_train[indexes])).any())
+
+
+def test_X_y(data):
+    """Check minimization when data is passed as plain ``X``/``y`` arrays without feature names.
+
+    ``features_to_minimize`` is given as column positions, and the expected
+    generalizations are keyed by the stringified positions ('0', '1', '2').
+
+    :param data: not used in the body; presumably a pytest fixture defined
+                 elsewhere in this module (TODO confirm).
+    """
+    features = [0, 1, 2]
+    X = np.array([[23, 165, 70],
+                  [45, 158, 67],
+                  [56, 123, 65],
+                  [67, 154, 90],
+                  [45, 149, 67],
+                  [42, 166, 58],
+                  [73, 172, 68],
+                  [94, 168, 69],
+                  [69, 175, 80],
+                  [24, 181, 95],
+                  [18, 190, 102]])
+    y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+    QI = [0, 2]
+    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                      min_samples_leaf=1)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model.fit(ArrayDataset(X, y))
+    predictions = model.predict(X)
+    # A multi-column prediction is reduced to one label per row before fitting the minimizer.
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)
+
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
+    gen.fit(X=X, y=predictions)
+    transformed = gen.transform(X)
+    gener = gen.generalizations_
+    expected_generalizations = {'ranges': {'0': [], '2': [67.5]}, 'categories': {}, 'untouched': ['1']}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    # Columns outside the quasi-identifiers must come back unchanged.
+    assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
+    modified_features = [f for f in features if
+                         str(f) in expected_generalizations['categories'] or
+                         str(f) in expected_generalizations['ranges']]
+    # Column positions of the features that are expected to be generalized.
+    indexes = [i for i, f in enumerate(features) if f in modified_features]
+    assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
+    ncp = gen.ncp_
+    if expected_generalizations['ranges'] or expected_generalizations['categories']:
+        assert (ncp > 0)
+        # ``indexes`` holds column positions, so slice columns; ``transformed[indexes]``
+        # would pick rows and never look at the modified features specifically.
+        assert ((transformed[:, indexes] != X[:, indexes]).any())
+
+
+def test_X_y_features_names(data):
+    """Check minimization when data is passed as ``X``/``y`` arrays plus explicit ``features_names``.
+
+    ``features_to_minimize`` and the expected generalizations both refer to
+    features by the names supplied through ``features_names``.
+
+    :param data: not used in the body; presumably a pytest fixture defined
+                 elsewhere in this module (TODO confirm).
+    """
+    features = ['age', 'height', 'weight']
+    X = np.array([[23, 165, 70],
+                  [45, 158, 67],
+                  [56, 123, 65],
+                  [67, 154, 90],
+                  [45, 149, 67],
+                  [42, 166, 58],
+                  [73, 172, 68],
+                  [94, 168, 69],
+                  [69, 175, 80],
+                  [24, 181, 95],
+                  [18, 190, 102]])
+    y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+    QI = ['age', 'weight']
+    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                      min_samples_leaf=1)
+    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
+    model.fit(ArrayDataset(X, y))
+    predictions = model.predict(X)
+    # A multi-column prediction is reduced to one label per row before fitting the minimizer.
+    if predictions.shape[1] > 1:
+        predictions = np.argmax(predictions, axis=1)
+
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
+    gen.fit(X=X, y=predictions, features_names=features)
+    transformed = gen.transform(X=X, features_names=features)
+    gener = gen.generalizations_
+    expected_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    # Columns 0 and 2 are 'age' and 'weight'; everything else must come back unchanged.
+    assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
+    modified_features = [f for f in features if
+                         f in expected_generalizations['categories'] or
+                         f in expected_generalizations['ranges']]
+    # Column positions of the features that are expected to be generalized.
+    indexes = [i for i, f in enumerate(features) if f in modified_features]
+    assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
+    ncp = gen.ncp_
+    if expected_generalizations['ranges'] or expected_generalizations['categories']:
+        assert (ncp > 0)
+        # ``indexes`` holds column positions, so slice columns; ``transformed[indexes]``
+        # would pick rows and never look at the modified features specifically.
+        assert ((transformed[:, indexes] != X[:, indexes]).any())
+
+
+def test_BaseEstimator_classification(data):
+    """Check minimization when a bare scikit-learn classifier is passed as the estimator.
+
+    The decision tree is trained on a one-hot encoded copy of the data, while the
+    minimizer receives the raw pandas data wrapped in ``ArrayDataset`` together
+    with the list of categorical columns.
+
+    :param data: not used in the body; presumably a pytest fixture defined
+                 elsewhere in this module (TODO confirm).
+    """
+    features = ['age', 'height', 'weight', 'sex', 'ola']
+    X = [[23, 165, 65, 'f', 'aa'],
+         [45, 158, 76, 'f', 'aa'],
+         [56, 123, 78, 'f', 'bb'],
+         [67, 154, 87, 'm', 'aa'],
+         [45, 149, 45, 'f', 'bb'],
+         [42, 166, 76, 'm', 'bb'],
+         [73, 172, 85, 'm', 'bb'],
+         [94, 168, 92, 'f', 'aa'],
+         [69, 175, 95, 'm', 'aa'],
+         [24, 181, 49, 'm', 'bb'],
+         [18, 190, 69, 'm', 'bb']]
+
+    y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+    X = pd.DataFrame(X, columns=features)
+    QI = ['age', 'weight', 'ola']
+
+    numeric_features = ["age", "height", "weight"]
+    numeric_transformer = Pipeline(
+        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
+    )
+
+    categorical_features = ["sex", "ola"]
+    categorical_transformer = OneHotEncoder(handle_unknown="ignore")
+
+    preprocessor = ColumnTransformer(
+        transformers=[
+            ("num", numeric_transformer, numeric_features),
+            ("cat", categorical_transformer, categorical_features),
+        ]
+    )
+    encoded = preprocessor.fit_transform(X)
+    encoded = pd.DataFrame(encoded)
+    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                      min_samples_leaf=1)
+    model = base_est
+    model.fit(encoded, y)
+    predictions = model.predict(encoded)
+
+    # The plain sklearn estimator (not wrapped in an apt model class) is handed
+    # to the minimizer directly; the preprocessor is not attached to it.
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
+                                     categorical_features=categorical_features, features_to_minimize=QI)
+    train_dataset = ArrayDataset(X, predictions)
+    gen.fit(dataset=train_dataset)
+    transformed = gen.transform(dataset=ArrayDataset(X))
+    gener = gen.generalizations_
+    expected_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
+                                'untouched': ['height', 'sex']}
+
+    for key in expected_generalizations['ranges']:
+        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expected_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
+    # Columns outside the quasi-identifiers must come back unchanged (compared by value).
+    np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
+    modified_features = [f for f in features if
+                         f in expected_generalizations['categories'] or
+                         f in expected_generalizations['ranges']]
+    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
+    ncp = gen.ncp_
+    if expected_generalizations['ranges'] or expected_generalizations['categories']:
+        assert (ncp > 0)
+        # At least one generalized column has to differ from the input.
+        assert (not transformed[modified_features].equals(X[modified_features]))
+
+
+def test_BaseEstimator_regression():
+    dataset = load_diabetes()
+    x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)
+
+    base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2)
+    model = base_est
+    model.fit(x_train, y_train)
+    predictions = model.predict(x_train)
+    QI = ['age', 'bmi', 's2', 's5']
+    features = ['age', 'sex', 'bmi', 'bp',
+                's1', 's2', 's3', 's4', 's5', 's6']
+
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
+                                     features_to_minimize=QI)
+    gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
+    transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
    print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
    model.fit(transformed, y_train)
    print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test))
@ -546,7 +832,7 @@ def test_regression():
        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
-    assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
+    assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())

    modified_features = [f for f in features if
                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[