diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 4440e44..5ab5c76 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -117,6 +117,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM self.cells = params['cells'] return self + @property + def generalizations(self): + return self.generalizations_ + def fit_transform(self, X=None, y=None): """Learns the generalizations based on training data, and applies them to the data. @@ -206,40 +210,45 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM nodes = self._get_nodes_level(0) self._attach_cells_representatives(X_train, y_train, nodes) # self.cells_ currently holds the generalization created from the tree leaves + self._calculate_generalizations() # apply generalizations to test data generalized = self._generalize(X_test, nodes, self.cells_, self.cells_by_id_) # check accuracy accuracy = self.estimator.score(generalized, y_test) - print('Initial accuracy is %f' % accuracy) + print('Initial accuracy of model on generalized data, relative to original model predictions ' + '(base generalization derived from tree, before improvements): %f' % accuracy) # if accuracy above threshold, improve generalization if accuracy > self.target_accuracy: + print('Improving generalizations') level = 1 while accuracy > self.target_accuracy: nodes = self._get_nodes_level(level) self._calculate_level_cells(level) self._attach_cells_representatives(X_train, y_train, nodes) + self._calculate_generalizations() generalized = self._generalize(X_test, nodes, self.cells_, self.cells_by_id_) accuracy = self.estimator.score(generalized, y_test) - print('Level: %d, accuracy: %f' % (level, accuracy)) + print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy)) level+=1 # if accuracy below threshold, improve accuracy by removing features from generalization if accuracy < self.target_accuracy: + 
print('Improving accuracy') while accuracy < self.target_accuracy: - self._calculate_generalizations() removed_feature = self._remove_feature_from_generalization(X_test, nodes, y_test, - feature_data) - if not removed_feature: + feature_data, accuracy) + if removed_feature is None: break - generalized = self._generalize(X_test, nodes, self.cells_, - self.cells_by_id_) + + self._calculate_generalizations() + generalized = self._generalize(X_test, nodes, self.cells_, self.cells_by_id_) accuracy = self.estimator.score(generalized, y_test) - print('Removed feature: %s, accuracy: %f' % (removed_feature, accuracy)) + print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy)) # self.cells_ currently holds the chosen generalization based on target accuracy @@ -304,12 +313,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM # replace the values in the representative columns with the representative # values (leaves others untouched) - if not representatives.columns.empty: + if indexes and not representatives.columns.empty: if len(indexes) > 1: replace = pd.concat([representatives.loc[i].to_frame().T]*len(indexes)).reset_index(drop=True) - replace.index = indexes else: - replace = representatives.loc[i].to_frame().T + replace = representatives.loc[i].to_frame().T.reset_index(drop=True) + replace.index = indexes generalized.loc[indexes, representatives.columns] = replace return generalized.to_numpy() @@ -409,30 +418,31 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM new_cells = [] new_cells_by_id = {} nodes = self._get_nodes_level(level) - for node in nodes: - if self.dt_.tree_.feature[node] == -2: # leaf node - new_cell = self.cells_by_id_[node] - else: - left_child = self.dt_.tree_.children_left[node] - right_child = self.dt_.tree_.children_right[node] - left_cell = self.cells_by_id_[left_child] - right_cell = self.cells_by_id_[right_child] - new_cell = {'id': int(node), 
'ranges': {}, 'categories': {}, - 'label': None, 'representative': None} - for feature in left_cell['ranges'].keys(): - new_cell['ranges'][feature] = {} - new_cell['ranges'][feature]['start'] = left_cell['ranges'][feature]['start'] - new_cell['ranges'][feature]['end'] = right_cell['ranges'][feature]['start'] - for feature in left_cell['categories'].keys(): - new_cell['categories'][feature] = \ - list(set(left_cell['categories'][feature]) | - set(right_cell['categories'][feature])) - self._calculate_level_cell_label(left_cell, right_cell, new_cell) - new_cells.append(new_cell) - new_cells_by_id[new_cell['id']] = new_cell - self.cells_ = new_cells - self.cells_by_id_ = new_cells_by_id - # else: nothing to do, stay with previous cells + if nodes: + for node in nodes: + if self.dt_.tree_.feature[node] == -2: # leaf node + new_cell = self.cells_by_id_[node] + else: + left_child = self.dt_.tree_.children_left[node] + right_child = self.dt_.tree_.children_right[node] + left_cell = self.cells_by_id_[left_child] + right_cell = self.cells_by_id_[right_child] + new_cell = {'id': int(node), 'ranges': {}, 'categories': {}, + 'label': None, 'representative': None} + for feature in left_cell['ranges'].keys(): + new_cell['ranges'][feature] = {} + new_cell['ranges'][feature]['start'] = left_cell['ranges'][feature]['start'] + new_cell['ranges'][feature]['end'] = right_cell['ranges'][feature]['start'] + for feature in left_cell['categories'].keys(): + new_cell['categories'][feature] = \ + list(set(left_cell['categories'][feature]) | + set(right_cell['categories'][feature])) + self._calculate_level_cell_label(left_cell, right_cell, new_cell) + new_cells.append(new_cell) + new_cells_by_id[new_cell['id']] = new_cell + self.cells_ = new_cells + self.cells_by_id_ = new_cells_by_id + # else: nothing to do, stay with previous cells def _calculate_level_cell_label(self, left_cell, right_cell, new_cell): new_cell['hist'] = [x + y for x, y in zip(left_cell['hist'], right_cell['hist'])] @@ 
-445,6 +455,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM stack = [(0, -1)] # seed is the root node id and its parent depth while len(stack) > 0: node_id, parent_depth = stack.pop() + # depth = distance from root node_depth[node_id] = parent_depth + 1 if self.dt_.tree_.children_left[node_id] != self.dt_.tree_.children_right[node_id]: @@ -453,10 +464,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM else: is_leaves[node_id] = True + # depth of entire tree max_depth = max(node_depth) + # depth of current level depth = max_depth - level + # level is higher than root if depth < 0: return None + # return all nodes with depth == level or leaves higher than level return [i for i, x in enumerate(node_depth) if x == depth or (x < depth and is_leaves[i])] def _attach_cells_representatives(self, samples, labels, level_nodes): @@ -518,12 +533,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM indexes = [j for j in range(len(mapping_to_cells)) if mapping_to_cells[j]['id'] == cells[i]['id']] # replaces the values in the representative columns with the representative values # (leaves others untouched) - if not representatives.columns.empty: + if indexes and not representatives.columns.empty: if len(indexes) > 1: replace = pd.concat([representatives.loc[i].to_frame().T]*len(indexes)).reset_index(drop=True) - replace.index = indexes else: - replace = representatives.loc[i].to_frame().T + replace = representatives.loc[i].to_frame().T.reset_index(drop=True) + replace.index = indexes generalized.loc[indexes, representatives.columns] = replace return generalized.to_numpy() @@ -539,14 +554,16 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM node_ids = self._find_sample_nodes(samples, nodes) return [cells_by_id[nodeId] for nodeId in node_ids] - def _remove_feature_from_generalization(self, samples, nodes, labels, feature_data): - feature = 
self._get_feature_to_remove(samples, nodes, labels, feature_data) - if not feature: + def _remove_feature_from_generalization(self, samples, nodes, labels, feature_data, current_accuracy): + feature = self._get_feature_to_remove(samples, nodes, labels, feature_data, current_accuracy) + if feature is None: return None GeneralizeToRepresentative._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature) + # del self.generalizations_['ranges'][feature] + # self.generalizations_['untouched'].append(feature) return feature - def _get_feature_to_remove(self, samples, nodes, labels, feature_data): + def _get_feature_to_remove(self, samples, nodes, labels, feature_data, current_accuracy): # We want to remove features with low iLoss (NCP) and high accuracy gain # (after removing them) ranges = self.generalizations_['ranges'] @@ -567,13 +584,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM cells_by_id = copy.deepcopy(self.cells_by_id_) GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature) generalized = self._generalize(samples, nodes, new_cells, cells_by_id) - accuracy = self.estimator.score(generalized, labels) - feature_ncp = feature_ncp / accuracy + accuracy_gain = self.estimator.score(generalized, labels) - current_accuracy + if accuracy_gain < 0: + accuracy_gain = 0 + if accuracy_gain != 0: + feature_ncp = feature_ncp / accuracy_gain + if feature_ncp < range_min: range_min = feature_ncp remove_feature = feature - print('feature to remove: ' + (remove_feature if remove_feature else '')) + print('feature to remove: ' + (str(remove_feature) if remove_feature is not None else 'none')) return remove_feature def _calculate_generalizations(self): @@ -660,5 +681,3 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM del cell['categories'][feature] cell['untouched'].append(feature) cells_by_id[cell['id']] = cell.copy() - - diff --git a/notebooks/minimization_adult.ipynb 
b/notebooks/minimization_adult.ipynb new file mode 100644 index 0000000..bf965a6 --- /dev/null +++ b/notebooks/minimization_adult.ipynb @@ -0,0 +1,260 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Applying data minimization to a trained ML model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial we will show how to perform data minimization for ML models using the minimization module. \n", + "\n", + "This will be demonstrated using the Adult dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/adult). \n", + "\n", + "We use only the numerical features in the dataset because this is what is currently supported by the module." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[3.9000e+01 1.3000e+01 2.1740e+03 0.0000e+00 4.0000e+01]\n", + " [5.0000e+01 1.3000e+01 0.0000e+00 0.0000e+00 1.3000e+01]\n", + " [3.8000e+01 9.0000e+00 0.0000e+00 0.0000e+00 4.0000e+01]\n", + " ...\n", + " [5.8000e+01 9.0000e+00 0.0000e+00 0.0000e+00 4.0000e+01]\n", + " [2.2000e+01 9.0000e+00 0.0000e+00 0.0000e+00 2.0000e+01]\n", + " [5.2000e+01 9.0000e+00 1.5024e+04 0.0000e+00 4.0000e+01]]\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n", + "x_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n", + " usecols=(0, 4, 10, 11, 12), delimiter=\", \")\n", + "\n", + "y_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n", + " usecols=14, dtype=str, delimiter=\", \")\n", + "\n", + "\n", + "x_test = 
np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n", + " usecols=(0, 4, 10, 11, 12), delimiter=\", \", skiprows=1)\n", + "\n", + "y_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n", + " usecols=14, dtype=str, delimiter=\", \", skiprows=1)\n", + "\n", + "# Trim trailing period \".\" from label\n", + "y_test = np.array([a[:-1] for a in y_test])\n", + "\n", + "y_train[y_train == '<=50K'] = 0\n", + "y_train[y_train == '>50K'] = 1\n", + "y_train = y_train.astype(np.int)\n", + "\n", + "y_test[y_test == '<=50K'] = 0\n", + "y_test[y_test == '>50K'] = 1\n", + "y_test = y_test.astype(np.int)\n", + "\n", + "print(x_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train decision tree model" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base model accuracy: 0.8189914624408821\n" + ] + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "model = DecisionTreeClassifier()\n", + "model.fit(x_train, y_train)\n", + "\n", + "print('Base model accuracy: ', model.score(x_test, y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run minimization\n", + "We will try to run minimization with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n", + "Improving accuracy\n", + "feature to remove: 0\n", + "Removed feature: 0, new relative accuracy: 0.939867\n", + "feature to remove: 4\n", + "Removed feature: 4, new relative accuracy: 0.967247\n", + "feature to remove: 2\n", + "Removed feature: 2, new relative accuracy: 0.972620\n", + "feature to remove: 1\n", + "Removed feature: 1, new relative accuracy: 0.992323\n", + "feature to remove: 3\n", + "Removed feature: 3, new relative accuracy: 1.000000\n", + "Accuracy on minimized data: 0.8237371411024106\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath('..'))\n", + "\n", + "from apt.minimization import GeneralizeToRepresentative\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# default target_accuracy is 0.998\n", + "minimizer = GeneralizeToRepresentative(model)\n", + "# Can be done either on training or test data. 
Doing it with test data is better as the resulting accuracy on test\n", + "# data will be closer to the desired target accuracy (when working with training data it could result in a larger gap)\n", + "# Don't forget to leave a hold-out set for final validation!\n", + "X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n", + " test_size = 0.4, random_state = 38)\n", + "x_train_predictions = model.predict(X_generalizer_train)\n", + "minimizer.fit(X_generalizer_train, x_train_predictions)\n", + "transformed = minimizer.transform(x_test)\n", + "\n", + "print('Accuracy on minimized data: ', model.score(transformed, y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Let's see what features were generalized" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'ranges': {}, 'untouched': [0, 1, 2, 3, 4]}\n" + ] + } + ], + "source": [ + "generalizations = minimizer.generalizations\n", + "print(generalizations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n", + "\n", + "Let's change to a slightly lower target accuracy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n", + "Improving accuracy\n", + "feature to remove: 0\n", + "Removed feature: 0, new relative accuracy: 0.939867\n", + "feature to remove: 4\n", + "Removed feature: 4, new relative accuracy: 0.967247\n", + "feature to remove: 2\n", + "Removed feature: 2, new relative accuracy: 0.972620\n", + "feature to remove: 1\n", + "Removed feature: 1, new relative accuracy: 0.992323\n", + "Accuracy on minimized data: 0.820205742361431\n", + "{'ranges': {3: [546.0, 704.0, 705.5, 742.5, 782.0, 834.0, 870.0, 1446.5, 1538.5, 1612.5, 1699.0, 1744.0, 1801.0, 1814.0, 1846.0, 1881.5, 1978.5, 2248.0, 2298.5, 2537.5]}, 'untouched': [0, 1, 2, 4]}\n" + ] + } + ], + "source": [ + "# We allow a 2.5% deviation in accuracy from the original model accuracy\n", + "minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.975)\n", + "\n", + "minimizer2.fit(X_generalizer_train, x_train_predictions)\n", + "transformed2 = minimizer2.transform(x_test)\n", + "print('Accuracy on minimized data: ', model.score(transformed2, y_test))\n", + "generalizations2 = minimizer2.generalizations\n", + "print(generalizations2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time we were able to generalize one feature, feature number 3 (capital-loss)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}