Consistent one-hot-encoding (#38)

* Reuse code between generalize and transform methods

* Option to get encoder from user

* Consistent encoding for decision tree and generalizations (separate from target model encoding)
This commit is contained in:
abigailgold 2022-05-22 18:02:33 +03:00 committed by GitHub
parent 7055d5ecf6
commit dfa684da6b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 153 additions and 128 deletions

View file

@ -11,7 +11,7 @@ from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.utils.validation import check_is_fitted
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
@ -47,12 +47,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:type cells: list of objects, optional
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot
encoded before using them to train the decision tree model).
:param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
features)
:type encoder: sklearn OrdinalEncoder or OneHotEncoder
:type categorical_features: list of strings, optional
:param features_to_minimize: The features to be minimized.
:type features_to_minimize: list of strings or int, optional
:param train_only_QI: Whether to train the tree just on the ``features_to_minimize`` or on all features. Default
is only on ``features_to_minimize``.
:type train_only_QI: boolean, optional
:param train_only_features_to_minimize: Whether to train the tree just on the ``features_to_minimize`` or on all
features. Default is only on ``features_to_minimize``.
:type train_only_features_to_minimize: boolean, optional
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
Default is False.
:type is_regression: boolean, optional
@ -60,7 +63,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: Optional[float] = 0.998,
cells: Optional[list] = None, categorical_features: Optional[Union[np.ndarray, list]] = None,
features_to_minimize: Optional[Union[np.ndarray, list]] = None, train_only_QI: Optional[bool] = True,
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
train_only_features_to_minimize: Optional[bool] = True,
is_regression: Optional[bool] = False):
self.estimator = estimator
@ -75,8 +80,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if categorical_features:
self.categorical_features = categorical_features
self.features_to_minimize = features_to_minimize
self.train_only_QI = train_only_QI
self.train_only_features_to_minimize = train_only_features_to_minimize
self.is_regression = is_regression
self.encoder = encoder
def get_params(self, deep=True):
"""
@ -89,9 +95,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
"""
ret = {}
ret['target_accuracy'] = self.target_accuracy
ret['categorical_features'] = self.categorical_features
ret['features_to_minimize'] = self.features_to_minimize
ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
ret['is_regression'] = self.is_regression
if deep:
ret['cells'] = copy.deepcopy(self.cells)
ret['estimator'] = self.estimator
ret['encoder'] = self.encoder
else:
ret['cells'] = copy.copy(self.cells)
return ret
@ -111,6 +122,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
"""
if 'target_accuracy' in params:
self.target_accuracy = params['target_accuracy']
if 'categorical_features' in params:
self.categorical_features = params['categorical_features']
if 'features_to_minimize' in params:
self.features_to_minimize = params['features_to_minimize']
if 'train_only_features_to_minimize' in params:
self.train_only_features_to_minimize = params['train_only_features_to_minimize']
if 'is_regression' in params:
self.is_regression = params['is_regression']
if 'cells' in params:
self.cells = params['cells']
return self
@ -208,7 +227,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# divide dataset into train and test
used_data = x
if self.train_only_QI:
if self.train_only_features_to_minimize:
used_data = x_QI
if self.is_regression:
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4, random_state=14)
@ -219,8 +238,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
X_train_QI = X_train.loc[:, self.features_to_minimize]
X_test_QI = X_test.loc[:, self.features_to_minimize]
used_X_train = X_train
if self.train_only_QI:
used_X_test = X_test
if self.train_only_features_to_minimize:
used_X_train = X_train_QI
used_X_test = X_test_QI
# collect feature data (such as min, max)
feature_data = {}
@ -236,46 +257,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
fd['range'] = len(np.unique(values))
feature_data[feature] = fd
# prepare data for DT
# preprocessor to fit data that have features not included in QI (to get accuracy)
numeric_features = [f for f in self._features if f not in self.categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, self.categorical_features),
]
)
preprocessor.fit(x)
if self.train_only_QI:
categorical_features = [f for f in self._features if f in self.categorical_features and
f in self.features_to_minimize]
# default encoder in case none provided
if self.encoder is None:
numeric_features = [f for f in self._features if f not in self.categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
numeric_features = [f for f in self._features if f not in self.categorical_features and
f in self.features_to_minimize]
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor_QI_features = ColumnTransformer(
self.encoder = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
("cat", categorical_transformer, self.categorical_features),
]
)
preprocessor_QI_features.fit(x_QI)
x_prepared = preprocessor_QI_features.transform(X_train_QI)
else:
x_prepared = preprocessor.transform(X_train)
self._preprocessor = preprocessor
self.encoder.fit(x)
self.cells = []
self._categorical_values = {}
@ -285,11 +280,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
else:
self._dt = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
# prepare data for DT
self._encode_categorical_features(used_data, save_mapping=True)
x_prepared = self._encode_categorical_features(used_X_train)
self._dt.fit(x_prepared, y_train)
self._modify_categorical_features(used_data)
x_prepared = pd.DataFrame(x_prepared, columns=self._categorical_data.columns)
x_prepared_test = self._encode_categorical_features(used_X_test)
self._calculate_cells()
self._modify_cells()
@ -303,19 +299,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# self._cells currently holds the generalization created from the tree leaves
self._calculate_generalizations()
# apply generalizations to test data
if self.train_only_QI:
x_prepared_test = preprocessor_QI_features.transform(X_test_QI)
else:
x_prepared_test = preprocessor.transform(X_test)
x_prepared_test = pd.DataFrame(x_prepared_test, index=X_test.index, columns=self._categorical_data.columns)
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
# check accuracy
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
print('Initial accuracy of model on generalized data, relative to original model predictions '
'(base generalization derived from tree, before improvements): %f' % accuracy)
@ -340,7 +327,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
# if accuracy passed threshold roll back to previous iteration generalizations
if accuracy < self.target_accuracy:
self.cells = cells_previous_iter
@ -364,7 +351,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self._cells currently holds the chosen generalization based on target accuracy
@ -416,38 +403,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if not self._features:
self._features = [i for i in range(x.shape[1])]
representatives = pd.DataFrame(columns=self._features) # only columns
generalized = pd.DataFrame(x, columns=self._features, copy=True) # original data
mapped = np.zeros(x.shape[0]) # to mark records we already mapped
# iterate over cells (leaves in decision tree)
all_indexes = []
for i in range(len(self.cells)):
# Copy the representatives from the cells into another data structure:
# iterate over features in test data
for feature in self._features:
# if feature has a representative value in the cell and should not
# be left untouched, take the representative value
if feature in self.cells[i]['representative'] and \
('untouched' not in self.cells[i]
or feature not in self.cells[i]['untouched']):
representatives.loc[i, feature] = self.cells[i]['representative'][feature]
# else, drop the feature (removes from representatives columns that
# do not have a representative value or should remain untouched)
elif feature in representatives.columns.tolist():
representatives = representatives.drop(feature, axis=1)
# get the indexes of all records that map to this cell
indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped)
all_indexes.append(indexes)
generalized = self._generalize_indexes(x, self.cells, all_indexes)
# replace the values in the representative columns with the representative
# values (leaves others untouched)
if indexes and not representatives.columns.empty:
if len(indexes) > 1:
replace = pd.concat([representatives.loc[i].to_frame().T] * len(indexes)).reset_index(drop=True)
else:
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
replace.index = indexes
generalized.loc[indexes, representatives.columns] = replace
if dataset and dataset.is_pandas:
return generalized
elif isinstance(X, pd.DataFrame):
@ -477,29 +439,36 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
mapped.itemset(i, 1)
return True
def _modify_categorical_features(self, X):
self._categorical_values = {}
self._one_hot_vector_features_to_features = {}
def _encode_categorical_features(self, X, save_mapping=False):
if save_mapping:
self._categorical_values = {}
self._one_hot_vector_features_to_features = {}
features_to_remove = []
used_features = self._features
if self.train_only_QI:
if self.train_only_features_to_minimize:
used_features = self.features_to_minimize
for feature in self.categorical_features:
if feature in used_features:
try:
all_values = X.loc[:, feature]
values = list(all_values.unique())
self._categorical_values[feature] = values
X[feature] = pd.Categorical(X.loc[:, feature], categories=values, ordered=False)
if save_mapping:
self._categorical_values[feature] = values
X[feature] = pd.Categorical(X.loc[:, feature], categories=self._categorical_values[feature],
ordered=False)
ohe = pd.get_dummies(X[feature], prefix=feature)
for one_hot_vector_feature in ohe.columns:
self._one_hot_vector_features_to_features[one_hot_vector_feature] = feature
if save_mapping:
for one_hot_vector_feature in ohe.columns:
self._one_hot_vector_features_to_features[one_hot_vector_feature] = feature
X = pd.concat([X, ohe], axis=1)
features_to_remove.append(feature)
except KeyError:
print("feature " + feature + "not found in training data")
self._categorical_data = X.drop(features_to_remove, axis=1)
new_data = X.drop(features_to_remove, axis=1)
if save_mapping:
self._encoded_features = new_data.columns
return new_data
def _cell_contains_numeric(self, f, range, x):
i = self._features.index(f)
@ -538,7 +507,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return [cell]
cells = []
feature = self._categorical_data.columns[feature_index]
feature = self._encoded_features[feature_index]
threshold = self._dt.tree_.threshold[node]
left_child = self._dt.tree_.children_left[node]
right_child = self._dt.tree_.children_right[node]
@ -569,7 +538,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def _modify_cells(self):
cells = []
features = self._categorical_data.columns
features = self._encoded_features
for cell in self.cells:
new_cell = {'id': cell['id'], 'label': cell['label'], 'ranges': {}, 'categories': {}, 'hist': cell['hist'],
'untouched': [], 'representative': None}
@ -711,11 +680,19 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return [(list(set([i for i, v in enumerate(p) if v == 1]) & nodeSet))[0] for p in paths]
def _generalize(self, original_data, prepared_data, level_nodes, cells, cells_by_id):
mapping_to_cells = self._map_to_cells(prepared_data, level_nodes, cells_by_id)
all_indexes = []
for i in range(len(cells)):
# get the indexes of all records that map to this cell
indexes = [j for j in mapping_to_cells if mapping_to_cells[j]['id'] == cells[i]['id']]
all_indexes.append(indexes)
return self._generalize_indexes(original_data, cells, all_indexes)
def _generalize_indexes(self, original_data, cells, all_indexes):
# prepared data include one hot encoded categorical data + QI
representatives = pd.DataFrame(columns=self._features) # empty except for columns
generalized = pd.DataFrame(prepared_data, columns=self._categorical_data.columns, copy=True)
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
mapping_to_cells = self._map_to_cells(generalized, level_nodes, cells_by_id)
# iterate over cells (leaves in decision tree)
for i in range(len(cells)):
# This code just copies the representatives from the cells into another data structure
@ -731,9 +708,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
elif feature in representatives.columns.tolist():
representatives = representatives.drop(feature, axis=1)
# get the indexes of all records that map to this cell
indexes = [j for j in mapping_to_cells if mapping_to_cells[j]['id'] == cells[i]['id']]
indexes = all_indexes[i]
# replaces the values in the representative columns with the representative values
# (leaves others untouched)
if indexes and not representatives.columns.empty:
@ -794,7 +769,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
@ -817,7 +792,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
labels)) - current_accuracy
if accuracy_gain < 0:
@ -838,7 +813,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._remove_categorical_untouched(self._generalizations)
def _find_range_count(self, samples, ranges):
samples_df = pd.DataFrame(samples, columns=self._categorical_data.columns)
samples_df = pd.DataFrame(samples, columns=self._encoded_features)
range_counts = {}
last_value = None
for r in ranges.keys():