Many fixes, some tests pass

Signed-off-by: abigailt <abigailt@il.ibm.com>
2026-06-02 14:45:13 +02:00 · 2023-05-29 19:13:35 +03:00 · 2023-05-29 19:13:35 +03:00 · cc4cba0d8e
commit cc4cba0d8e
parent 4541ee60a2
2 changed files with 200 additions and 58 deletions
--- a/apt/minimization/minimizer.py
+++ b/apt/minimization/minimizer.py
@ -95,7 +95,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        self.encoder = encoder
        self.generalize_using_transform = generalize_using_transform
        self._ncp = 0.0
-        self._feature_data = {}
+        self._feature_data = None
        self._categorical_values = {}
        self._dt = None
        self._features = None
@ -204,8 +204,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        :type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
        :param y: The target values. This should contain the predictions of the original model on ``X``.
        :type y: array-like, shape (n_samples,), optional
-        :param features_names: The feature names, in the order that they appear in the data. Can be provided when
+        :param features_names: The feature names, in the order that they appear in the data. Should be provided when
-                               passing the data as ``X`` and ``y``
+                               passing the data as ``X`` as a numpy array
        :type features_names: list of strings, optional
        :param dataset: Data wrapper containing the training input samples and the predictions of the original model
                        on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
@ -272,18 +272,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                used_X_test = X_test_QI
            # collect feature data (such as min, max)
-            self._feature_data = {}
+            self._feature_data = self._get_feature_data(x)
            for feature in self._features:
                if feature not in self._feature_data.keys():
                    fd = {}
                    values = list(x.loc[:, feature])
                    if feature not in self.categorical_features:
                        fd['min'] = min(values)
                        fd['max'] = max(values)
                        fd['range'] = max(values) - min(values)
                    else:
                        fd['range'] = len(np.unique(values))
                    self._feature_data[feature] = fd
            # default encoder in case none provided
            if self.encoder is None:
@ -386,7 +375,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
            # self._cells currently holds the chosen generalization based on target accuracy
            # calculate iLoss
-            self.calculate_ncp(X_test)
+            X_test_dataset = ArrayDataset(X_test, features_names=self._features)
            self.calculate_ncp(X_test_dataset)
        # Return the transformer
        return self
@ -397,8 +387,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        :param X: The training input samples.
        :type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
-        :param features_names: The feature names, in the order that they appear in the data. Can be provided when
+        :param features_names: The feature names, in the order that they appear in the data. Should be provided when
-                               passing the data as ``X`` and ``y``
+                               passing the data as ``X`` as a numpy array
        :type features_names: list of strings, optional
        :param dataset: Data wrapper containing the training input samples and the predictions of the original model
                        on the training data. Either ``X`` OR ``dataset`` need to be provided, not both.
@ -410,10 +400,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        if not self.generalize_using_transform:
            raise ValueError('transform method called even though generalize_using_transform parameter was False. This '
                             'can lead to inconsistent results.')
-        self.calculate_ncp(transformed, True)
+        transformed_dataset = ArrayDataset(transformed, features_names=self._features)
        self.calculate_ncp(transformed_dataset, True)
        return transformed
-    def calculate_ncp(self, samples: Optional[DATA_PANDAS_NUMPY_TYPE] = None, transformed: Optional[bool] = False):
+    def calculate_ncp(self, samples: Optional[ArrayDataset] = None, transformed: Optional[bool] = False):
        """
        Compute the NCP score of the generalization. Calculation is based on the value of the
        generalize_using_transform param.
@ -423,7 +414,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        :param samples: The input samples to compute the NCP score on. Ideally should be the data that will be
                        transformed (e.g., test/runtime data). If not samples supplied, will return the last NCP score
                        computed by the `fit` or `transform` method.
-        :type samples: {array-like, sparse matrix}, shape (n_samples, n_features), optional
+        :type samples: ArrayDataset, optional. feature_names should be set.
        :param transformed: Whether the supplied samples have already been transformed using the `transform` method.
                            Default is False.
        :type transformed: boolean, optional
@ -431,37 +422,50 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        """
        if samples is None:
            return self._ncp
-        elif self.generalize_using_transform:
+
        samples_pd = pd.DataFrame(samples.get_samples(), columns=samples.features_names)
        if self._features is None:
            self._features = samples.features_names
        if self._feature_data is None:
            self._feature_data = self._get_feature_data(samples_pd)
        if self.generalize_using_transform:
            if not transformed:
                # transform data
-                transformed_data = self._inner_transform(samples)
+                transformed_data = self._inner_transform(dataset=samples)  # can return numpy or pandas
                if not samples.is_pandas:
                    transformed_data = pd.DataFrame(transformed_data, columns=samples.features_names)
            else:
-                transformed_data = samples
+                transformed_data = samples_pd
-            #TODO
+            range_counts, category_counts = self._calculate_transformed_generalizations(transformed_data)
            generalizations = self._transformed_generalizations
        else:  # use generalizations
-            # suppressed features are already taken care of within _calc_ncp_numeric
+            generalizations = self.generalizations
-            ranges = self.generalizations['ranges']
+            range_counts = self._find_range_counts(samples_pd, generalizations['ranges'])
-            categories = self.generalizations['categories']
+            category_counts = self._find_categories_counts(samples_pd, generalizations['categories'])
            range_counts = self._find_range_count(samples, ranges)
            category_counts = self._find_categories_count(samples, categories)
-            total = samples.shape[0]
+        # suppressed features are already taken care of within _calc_ncp_numeric
-            total_ncp = 0
+        #TODO: check that this is the case for tramsformed as well
-            total_features = len(self.generalizations['untouched'])
+        ranges = generalizations['ranges']
-            for feature in ranges.keys():
+        categories = generalizations['categories']
-                feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature],
+
-                                                     self._feature_data[feature], total)
+        total = samples_pd.shape[0]
-                total_ncp = total_ncp + feature_ncp
+        total_ncp = 0
-                total_features += 1
+        total_features = len(generalizations['untouched'])
-            for feature in categories.keys():
+        for feature in ranges.keys():
-                feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature],
+            feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature],
-                                                         self._feature_data[feature],
+                                                 self._feature_data[feature], total)
-                                                         total)
+            total_ncp = total_ncp + feature_ncp
-                total_ncp = total_ncp + feature_ncp
+            total_features += 1
-                total_features += 1
+        for feature in categories.keys():
-            if total_features == 0:
+            feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature],
-                return 0
+                                                     self._feature_data[feature],
-            self._ncp = total_ncp / total_features
+                                                     total)
            total_ncp = total_ncp + feature_ncp
            total_features += 1
        if total_features == 0:
            return 0
        self._ncp = total_ncp / total_features
        return self._ncp
    def _inner_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
@ -480,7 +484,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        elif dataset is None:
            raise ValueError('Either X OR dataset need to be provided, not both')
        if dataset and dataset.features_names:
-            self._features = dataset.features_names
+            if self._features is None:
                self._features = dataset.features_names
        if dataset and dataset.get_samples() is not None:
            x = pd.DataFrame(dataset.get_samples(), columns=self._features)
@ -522,7 +527,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
    @staticmethod
    def _calc_ncp_numeric(feature_range, range_count, feature_data, total):
-        # if there are no ranges, feature is supressed and iLoss is 1
+        # if there are no ranges, feature is suppressed and iLoss is 1
        if not feature_range:
            return 1
        # range only contains the split values, need to add min and max value of feature
@ -533,6 +538,22 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
        return average_range_size / (feature_data['max'] - feature_data['min'])
    def _get_feature_data(self, x):
        feature_data = {}
        for feature in self._features:
            if feature not in feature_data.keys():
                fd = {}
                values = list(x.loc[:, feature])
                if feature not in self.categorical_features:
                    fd['min'] = min(values)
                    fd['max'] = max(values)
                    fd['range'] = max(values) - min(values)
                else:
                    fd['range'] = len(np.unique(values))
                feature_data[feature] = fd
        return feature_data
    def _get_record_indexes_for_cell(self, X, cell, mapped):
        indexes = []
        for index, row in X.iterrows():
@ -868,12 +889,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        # We want to remove features with low iLoss (NCP) and high accuracy gain
        # (after removing them)
        ranges = self._generalizations['ranges']
-        range_counts = self._find_range_count(original_data, ranges)
+        range_counts = self._find_range_counts(original_data, ranges)
        total = prepared_data.size
        range_min = sys.float_info.max
        remove_feature = None
        categories = self.generalizations['categories']
-        category_counts = self._find_categories_count(original_data, categories)
+        category_counts = self._find_categories_counts(original_data, categories)
        for feature in ranges.keys():
            if feature not in self._generalizations['untouched']:
@ -930,25 +951,109 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
                                 'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)}
        self._remove_categorical_untouched(self._generalizations)
-    def _find_range_count(self, samples, ranges):
+    def _calculate_transformed_generalizations(self, transformed):
-        samples_df = pd.DataFrame(samples, columns=self._features)
+        # transformed data should only consist of representative values from cells (when removing untouched features)
        ranges = {}
        categories = {}
        range_counts = {}
        category_counts = {}
        unique_records = transformed.value_counts().reset_index(name='count')
        representatives = unique_records.drop('count', axis=1)
        representative_counts = unique_records['count']  # needed to normalize ncp according to quantity
        index = 0
        for _, record in representatives.iterrows():
            # TODO: what if some cells are not present, we will not take their generalizations into account. We need to
            # "gain" ncp in this case...
            record_dict = self.pandas_record_to_dict(record)
            for cell in self.cells:
                representative = cell["representative"].copy()
                record_copy = record_dict.copy()
                if 'untouched' in cell:
                    for feature in cell['untouched']:
                        record_copy.pop(feature)
                        representative.pop(feature)
                if record_copy == representative:
                    # handle numerical features
                    for feature in [key for key in cell['ranges'].keys() if
                                    'untouched' not in cell or key not in cell['untouched']]:
                        if feature not in ranges.keys():
                            ranges[feature] = []
                        if cell['ranges'][feature]['start'] is not None:
                            ranges[feature].append(cell['ranges'][feature]['start'])
                        if cell['ranges'][feature]['end'] is not None:
                            ranges[feature].append(cell['ranges'][feature]['end'])
                        if feature in range_counts:
                            range_counts[feature].append(representative_counts[index])
                        else:
                            range_counts[feature] = [representative_counts[index]]
                    # handle categorical features
                    categorical_features_values = {}
                    for feature in [key for key in cell['categories'].keys() if
                                    'untouched' not in cell or key not in cell['untouched']]:
                        if feature not in categorical_features_values.keys():
                            categorical_features_values[feature] = []
                        for value in cell['categories'][feature]:
                            if value not in categorical_features_values[feature]:
                                categorical_features_values[feature].append(value)
                    for feature in categorical_features_values.keys():
                        partitions = []
                        values = categorical_features_values[feature]
                        assigned = []
                        for i in range(len(values)):
                            value1 = values[i]
                            if value1 in assigned:
                                continue
                            partition = [value1]
                            assigned.append(value1)
                            for j in range(len(values)):
                                if j <= i:
                                    continue
                                value2 = values[j]
                                if GeneralizeToRepresentative._are_inseparable(self.cells, feature, value1, value2):
                                    partition.append(value2)
                                    assigned.append(value2)
                            partitions.append(partition)
                        if feature in categories:
                            categories[feature].append(partitions)
                        else:
                            categories[feature] = [partitions]
                        if feature in category_counts:
                            category_counts[feature].append(representative_counts[index])
                        else:
                            category_counts[feature] = [representative_counts[index]]
                    break
            index += 1
        for feature in ranges.keys():
            ranges[feature] = list(set(ranges[feature]))
            ranges[feature].sort()
        self._transformed_generalizations = {
            'ranges': ranges,
            'categories': categories,
            'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)}
        self._remove_categorical_untouched(self._transformed_generalizations)
        return range_counts, category_counts
    def _find_range_counts(self, samples, ranges):
        range_counts = {}
        last_value = None
        for r in ranges.keys():
            range_counts[r] = []
            # if empty list, all samples should be counted
            if not ranges[r]:
-                range_counts[r].append(samples_df.shape[0])
+                range_counts[r].append(samples.shape[0])
            else:
                for value in ranges[r]:
-                    counter = [item for item in samples_df[r] if int(item) <= value]
+                    counter = [item for item in samples[r] if int(item) <= value]
                    range_counts[r].append(len(counter))
                    last_value = value
-                counter = [item for item in samples_df[r] if int(item) <= last_value]
+                counter = [item for item in samples[r] if int(item) <= last_value]
                range_counts[r].append(len(counter))
        return range_counts
-    def _find_categories_count(self, samples, categories):
+    def _find_categories_counts(self, samples, categories):
        category_counts = {}
        for c in categories.keys():
            category_counts[c] = []
@ -1054,3 +1159,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
        for feature in to_remove:
            del generalizations['categories'][feature]
    @staticmethod
    def pandas_record_to_dict(record):
        dict = {}
        for feature in record.index:
            dict[feature] = record[feature]
        return dict
--- a/tests/test_minimizer.py
+++ b/tests/test_minimizer.py
@ -70,13 +70,42 @@ def test_minimizer_params_not_transform(data):
                  [45, 158],
                  [18, 190]])
    y = [1, 1, 0]
    samples = ArrayDataset(X, y, features)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(X, y))
    gen = GeneralizeToRepresentative(model, cells=cells, generalize_using_transform=False)
-    gen.calculate_ncp(X)
+    gen.calculate_ncp(samples)
    ncp = gen.ncp
    assert (ncp > 0.0)
 def test_minimizer_params_not_transform_no_data(data):
    # Assume two features, age and height, and boolean label
    cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": None, "end": 170}}, "label": 0,
              'categories': {}, "representative": {"age": 26, "height": 149}},
             {"id": 2, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": None, "end": 170}}, "label": 1,
              'categories': {}, "representative": {"age": 58, "height": 163}},
             {"id": 3, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": 171, "end": None}}, "label": 0,
              'categories': {}, "representative": {"age": 31, "height": 184}},
             {"id": 4, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": 171, "end": None}}, "label": 1,
              'categories': {}, "representative": {"age": 45, "height": 176}}
             ]
    features = ['age', 'height']
    X = np.array([[23, 165],
                  [45, 158],
                  [18, 190]])
    y = [1, 1, 0]
    samples = ArrayDataset(X, y, features)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(X, y))
    gen = GeneralizeToRepresentative(model, cells=cells, generalize_using_transform=False)
    gen.calculate_ncp(samples)
    ncp = gen.ncp
    assert (ncp > 0.0)