Minimization fixes (#12)

* Fixes related to corner cases in calculating generalizations

* Fix print

* Fix corner cases in transform as well

* Improve prints + bug fixes in calculation of feature to remove

* Notebook demonstrating ai minimization
This commit is contained in:
abigailgold 2021-08-17 21:19:48 +03:00 committed by GitHub
parent d2591d7840
commit 43952e2332
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 326 additions and 47 deletions

View file

@ -117,6 +117,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.cells = params['cells']
return self
@property
def generalizations(self):
return self.generalizations_
def fit_transform(self, X=None, y=None):
"""Learns the generalizations based on training data, and applies them to the data.
@ -206,40 +210,45 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
nodes = self._get_nodes_level(0)
self._attach_cells_representatives(X_train, y_train, nodes)
# self.cells_ currently holds the generalization created from the tree leaves
self._calculate_generalizations()
# apply generalizations to test data
generalized = self._generalize(X_test, nodes, self.cells_, self.cells_by_id_)
# check accuracy
accuracy = self.estimator.score(generalized, y_test)
print('Initial accuracy is %f' % accuracy)
print('Initial accuracy of model on generalized data, relative to original model predictions '
'(base generalization derived from tree, before improvements): %f' % accuracy)
# if accuracy above threshold, improve generalization
if accuracy > self.target_accuracy:
print('Improving generalizations')
level = 1
while accuracy > self.target_accuracy:
nodes = self._get_nodes_level(level)
self._calculate_level_cells(level)
self._attach_cells_representatives(X_train, y_train, nodes)
self._calculate_generalizations()
generalized = self._generalize(X_test, nodes, self.cells_,
self.cells_by_id_)
accuracy = self.estimator.score(generalized, y_test)
print('Level: %d, accuracy: %f' % (level, accuracy))
print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
level+=1
# if accuracy below threshold, improve accuracy by removing features from generalization
if accuracy < self.target_accuracy:
print('Improving accuracy')
while accuracy < self.target_accuracy:
self._calculate_generalizations()
removed_feature = self._remove_feature_from_generalization(X_test,
nodes, y_test,
feature_data)
if not removed_feature:
feature_data, accuracy)
if removed_feature is None:
break
generalized = self._generalize(X_test, nodes, self.cells_,
self.cells_by_id_)
self._calculate_generalizations()
generalized = self._generalize(X_test, nodes, self.cells_, self.cells_by_id_)
accuracy = self.estimator.score(generalized, y_test)
print('Removed feature: %s, accuracy: %f' % (removed_feature, accuracy))
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self.cells_ currently holds the chosen generalization based on target accuracy
@ -304,12 +313,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# replace the values in the representative columns with the representative
# values (leaves others untouched)
if not representatives.columns.empty:
if indexes and not representatives.columns.empty:
if len(indexes) > 1:
replace = pd.concat([representatives.loc[i].to_frame().T]*len(indexes)).reset_index(drop=True)
replace.index = indexes
else:
replace = representatives.loc[i].to_frame().T
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
replace.index = indexes
generalized.loc[indexes, representatives.columns] = replace
return generalized.to_numpy()
@ -409,30 +418,31 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
new_cells = []
new_cells_by_id = {}
nodes = self._get_nodes_level(level)
for node in nodes:
if self.dt_.tree_.feature[node] == -2: # leaf node
new_cell = self.cells_by_id_[node]
else:
left_child = self.dt_.tree_.children_left[node]
right_child = self.dt_.tree_.children_right[node]
left_cell = self.cells_by_id_[left_child]
right_cell = self.cells_by_id_[right_child]
new_cell = {'id': int(node), 'ranges': {}, 'categories': {},
'label': None, 'representative': None}
for feature in left_cell['ranges'].keys():
new_cell['ranges'][feature] = {}
new_cell['ranges'][feature]['start'] = left_cell['ranges'][feature]['start']
new_cell['ranges'][feature]['end'] = right_cell['ranges'][feature]['start']
for feature in left_cell['categories'].keys():
new_cell['categories'][feature] = \
list(set(left_cell['categories'][feature]) |
set(right_cell['categories'][feature]))
self._calculate_level_cell_label(left_cell, right_cell, new_cell)
new_cells.append(new_cell)
new_cells_by_id[new_cell['id']] = new_cell
self.cells_ = new_cells
self.cells_by_id_ = new_cells_by_id
# else: nothing to do, stay with previous cells
if nodes:
for node in nodes:
if self.dt_.tree_.feature[node] == -2: # leaf node
new_cell = self.cells_by_id_[node]
else:
left_child = self.dt_.tree_.children_left[node]
right_child = self.dt_.tree_.children_right[node]
left_cell = self.cells_by_id_[left_child]
right_cell = self.cells_by_id_[right_child]
new_cell = {'id': int(node), 'ranges': {}, 'categories': {},
'label': None, 'representative': None}
for feature in left_cell['ranges'].keys():
new_cell['ranges'][feature] = {}
new_cell['ranges'][feature]['start'] = left_cell['ranges'][feature]['start']
new_cell['ranges'][feature]['end'] = right_cell['ranges'][feature]['start']
for feature in left_cell['categories'].keys():
new_cell['categories'][feature] = \
list(set(left_cell['categories'][feature]) |
set(right_cell['categories'][feature]))
self._calculate_level_cell_label(left_cell, right_cell, new_cell)
new_cells.append(new_cell)
new_cells_by_id[new_cell['id']] = new_cell
self.cells_ = new_cells
self.cells_by_id_ = new_cells_by_id
# else: nothing to do, stay with previous cells
def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
new_cell['hist'] = [x + y for x, y in zip(left_cell['hist'], right_cell['hist'])]
@ -445,6 +455,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
stack = [(0, -1)] # seed is the root node id and its parent depth
while len(stack) > 0:
node_id, parent_depth = stack.pop()
# depth = distance from root
node_depth[node_id] = parent_depth + 1
if self.dt_.tree_.children_left[node_id] != self.dt_.tree_.children_right[node_id]:
@ -453,10 +464,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
else:
is_leaves[node_id] = True
# depth of entire tree
max_depth = max(node_depth)
# depth of current level
depth = max_depth - level
# level is higher than root
if depth < 0:
return None
# return all nodes with depth == level or leaves higher than level
return [i for i, x in enumerate(node_depth) if x == depth or (x < depth and is_leaves[i])]
def _attach_cells_representatives(self, samples, labels, level_nodes):
@ -518,12 +533,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
indexes = [j for j in range(len(mapping_to_cells)) if mapping_to_cells[j]['id'] == cells[i]['id']]
# replaces the values in the representative columns with the representative values
# (leaves others untouched)
if not representatives.columns.empty:
if indexes and not representatives.columns.empty:
if len(indexes) > 1:
replace = pd.concat([representatives.loc[i].to_frame().T]*len(indexes)).reset_index(drop=True)
replace.index = indexes
else:
replace = representatives.loc[i].to_frame().T
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
replace.index = indexes
generalized.loc[indexes, representatives.columns] = replace
return generalized.to_numpy()
@ -539,14 +554,16 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
node_ids = self._find_sample_nodes(samples, nodes)
return [cells_by_id[nodeId] for nodeId in node_ids]
def _remove_feature_from_generalization(self, samples, nodes, labels, feature_data):
feature = self._get_feature_to_remove(samples, nodes, labels, feature_data)
if not feature:
def _remove_feature_from_generalization(self, samples, nodes, labels, feature_data, current_accuracy):
feature = self._get_feature_to_remove(samples, nodes, labels, feature_data, current_accuracy)
if feature is None:
return None
GeneralizeToRepresentative._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
# del self.generalizations_['ranges'][feature]
# self.generalizations_['untouched'].append(feature)
return feature
def _get_feature_to_remove(self, samples, nodes, labels, feature_data):
def _get_feature_to_remove(self, samples, nodes, labels, feature_data, current_accuracy):
# We want to remove features with low iLoss (NCP) and high accuracy gain
# (after removing them)
ranges = self.generalizations_['ranges']
@ -567,13 +584,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cells_by_id = copy.deepcopy(self.cells_by_id_)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(samples, nodes, new_cells, cells_by_id)
accuracy = self.estimator.score(generalized, labels)
feature_ncp = feature_ncp / accuracy
accuracy_gain = self.estimator.score(generalized, labels) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
feature_ncp = feature_ncp / accuracy_gain
if feature_ncp < range_min:
range_min = feature_ncp
remove_feature = feature
print('feature to remove: ' + (remove_feature if remove_feature else ''))
print('feature to remove: ' + (str(remove_feature) if remove_feature is not None else 'none'))
return remove_feature
def _calculate_generalizations(self):
@ -660,5 +681,3 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
del cell['categories'][feature]
cell['untouched'].append(feature)
cells_by_id[cell['id']] = cell.copy()

View file

@ -0,0 +1,260 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Applying data minimization to a trained ML model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this tutorial we will show how to perform data minimization for ML models using the minimization module. \n",
"\n",
"This will be demonstrated using the Adult dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/adult). \n",
"\n",
"We use only the numerical features in the dataset because this is what is currently supported by the module."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[3.9000e+01 1.3000e+01 2.1740e+03 0.0000e+00 4.0000e+01]\n",
" [5.0000e+01 1.3000e+01 0.0000e+00 0.0000e+00 1.3000e+01]\n",
" [3.8000e+01 9.0000e+00 0.0000e+00 0.0000e+00 4.0000e+01]\n",
" ...\n",
" [5.8000e+01 9.0000e+00 0.0000e+00 0.0000e+00 4.0000e+01]\n",
" [2.2000e+01 9.0000e+00 0.0000e+00 0.0000e+00 2.0000e+01]\n",
" [5.2000e+01 9.0000e+00 1.5024e+04 0.0000e+00 4.0000e+01]]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
"x_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
" usecols=(0, 4, 10, 11, 12), delimiter=\", \")\n",
"\n",
"y_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
" usecols=14, dtype=str, delimiter=\", \")\n",
"\n",
"\n",
"x_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
" usecols=(0, 4, 10, 11, 12), delimiter=\", \", skiprows=1)\n",
"\n",
"y_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
" usecols=14, dtype=str, delimiter=\", \", skiprows=1)\n",
"\n",
"# Trim trailing period \".\" from label\n",
"y_test = np.array([a[:-1] for a in y_test])\n",
"\n",
"y_train[y_train == '<=50K'] = 0\n",
"y_train[y_train == '>50K'] = 1\n",
"y_train = y_train.astype(np.int)\n",
"\n",
"y_test[y_test == '<=50K'] = 0\n",
"y_test[y_test == '>50K'] = 1\n",
"y_test = y_test.astype(np.int)\n",
"\n",
"print(x_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.8189914624408821\n"
]
}
],
"source": [
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"model = DecisionTreeClassifier()\n",
"model.fit(x_train, y_train)\n",
"\n",
"print('Base model accuracy: ', model.score(x_test, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run minimization\n",
"We will try to run minimization with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
"Improving accuracy\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.939867\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.967247\n",
"feature to remove: 2\n",
"Removed feature: 2, new relative accuracy: 0.972620\n",
"feature to remove: 1\n",
"Removed feature: 1, new relative accuracy: 0.992323\n",
"feature to remove: 3\n",
"Removed feature: 3, new relative accuracy: 1.000000\n",
"Accuracy on minimized data: 0.8237371411024106\n"
]
}
],
"source": [
"import sys\n",
"import os\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"\n",
"from apt.minimization import GeneralizeToRepresentative\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# default target_accuracy is 0.998\n",
"minimizer = GeneralizeToRepresentative(model)\n",
"# Can be done either on training or test data. Doing it with test data is better as the resulting accuracy on test\n",
"# data will be closer to the desired target accuracy (when working with training data it could result in a larger gap)\n",
"# Don't forget to leave a hold-out set for final validation!\n",
"X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
" test_size = 0.4, random_state = 38)\n",
"x_train_predictions = model.predict(X_generalizer_train)\n",
"minimizer.fit(X_generalizer_train, x_train_predictions)\n",
"transformed = minimizer.transform(x_test)\n",
"\n",
"print('Accuracy on minimized data: ', model.score(transformed, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Let's see what features were generalized"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'ranges': {}, 'untouched': [0, 1, 2, 3, 4]}\n"
]
}
],
"source": [
"generalizations = minimizer.generalizations\n",
"print(generalizations)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
"\n",
"Let's change to a slightly lower target accuracy."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
"Improving accuracy\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.939867\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.967247\n",
"feature to remove: 2\n",
"Removed feature: 2, new relative accuracy: 0.972620\n",
"feature to remove: 1\n",
"Removed feature: 1, new relative accuracy: 0.992323\n",
"Accuracy on minimized data: 0.820205742361431\n",
"{'ranges': {3: [546.0, 704.0, 705.5, 742.5, 782.0, 834.0, 870.0, 1446.5, 1538.5, 1612.5, 1699.0, 1744.0, 1801.0, 1814.0, 1846.0, 1881.5, 1978.5, 2248.0, 2298.5, 2537.5]}, 'untouched': [0, 1, 2, 4]}\n"
]
}
],
"source": [
"# We allow a 2.5% deviation in accuracy from the original model accuracy\n",
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.975)\n",
"\n",
"minimizer2.fit(X_generalizer_train, x_train_predictions)\n",
"transformed2 = minimizer2.transform(x_test)\n",
"print('Accuracy on minimized data: ', model.score(transformed2, y_test))\n",
"generalizations2 = minimizer2.generalizations\n",
"print(generalizations2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This time we were able to generalize one feature, feature number 3 (capital-loss)."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}