Minimization fixes (#12)

* Fixes related to corner cases in calculating generalizations

* Fix print

* Fix corner cases in transform as well

* Improve prints + bug fixes in calculation of feature to remove

* Notebook demonstrating ai minimization
This commit is contained in:
abigailgold 2021-08-17 21:19:48 +03:00 committed by GitHub
parent d2591d7840
commit 43952e2332
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 326 additions and 47 deletions

View file

@ -117,6 +117,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.cells = params['cells']
return self
@property
def generalizations(self):
return self.generalizations_
def fit_transform(self, X=None, y=None):
"""Learns the generalizations based on training data, and applies them to the data.
@ -206,40 +210,45 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
nodes = self._get_nodes_level(0)
self._attach_cells_representatives(X_train, y_train, nodes)
# self.cells_ currently holds the generalization created from the tree leaves
self._calculate_generalizations()
# apply generalizations to test data
generalized = self._generalize(X_test, nodes, self.cells_, self.cells_by_id_)
# check accuracy
accuracy = self.estimator.score(generalized, y_test)
print('Initial accuracy is %f' % accuracy)
print('Initial accuracy of model on generalized data, relative to original model predictions '
'(base generalization derived from tree, before improvements): %f' % accuracy)
# if accuracy above threshold, improve generalization
if accuracy > self.target_accuracy:
print('Improving generalizations')
level = 1
while accuracy > self.target_accuracy:
nodes = self._get_nodes_level(level)
self._calculate_level_cells(level)
self._attach_cells_representatives(X_train, y_train, nodes)
self._calculate_generalizations()
generalized = self._generalize(X_test, nodes, self.cells_,
self.cells_by_id_)
accuracy = self.estimator.score(generalized, y_test)
print('Level: %d, accuracy: %f' % (level, accuracy))
print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
level+=1
# if accuracy below threshold, improve accuracy by removing features from generalization
if accuracy < self.target_accuracy:
print('Improving accuracy')
while accuracy < self.target_accuracy:
self._calculate_generalizations()
removed_feature = self._remove_feature_from_generalization(X_test,
nodes, y_test,
feature_data)
if not removed_feature:
feature_data, accuracy)
if removed_feature is None:
break
generalized = self._generalize(X_test, nodes, self.cells_,
self.cells_by_id_)
self._calculate_generalizations()
generalized = self._generalize(X_test, nodes, self.cells_, self.cells_by_id_)
accuracy = self.estimator.score(generalized, y_test)
print('Removed feature: %s, accuracy: %f' % (removed_feature, accuracy))
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self.cells_ currently holds the chosen generalization based on target accuracy
@ -304,12 +313,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# replace the values in the representative columns with the representative
# values (leaves others untouched)
if not representatives.columns.empty:
if indexes and not representatives.columns.empty:
if len(indexes) > 1:
replace = pd.concat([representatives.loc[i].to_frame().T]*len(indexes)).reset_index(drop=True)
replace.index = indexes
else:
replace = representatives.loc[i].to_frame().T
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
replace.index = indexes
generalized.loc[indexes, representatives.columns] = replace
return generalized.to_numpy()
@ -409,30 +418,31 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
new_cells = []
new_cells_by_id = {}
nodes = self._get_nodes_level(level)
for node in nodes:
if self.dt_.tree_.feature[node] == -2: # leaf node
new_cell = self.cells_by_id_[node]
else:
left_child = self.dt_.tree_.children_left[node]
right_child = self.dt_.tree_.children_right[node]
left_cell = self.cells_by_id_[left_child]
right_cell = self.cells_by_id_[right_child]
new_cell = {'id': int(node), 'ranges': {}, 'categories': {},
'label': None, 'representative': None}
for feature in left_cell['ranges'].keys():
new_cell['ranges'][feature] = {}
new_cell['ranges'][feature]['start'] = left_cell['ranges'][feature]['start']
new_cell['ranges'][feature]['end'] = right_cell['ranges'][feature]['start']
for feature in left_cell['categories'].keys():
new_cell['categories'][feature] = \
list(set(left_cell['categories'][feature]) |
set(right_cell['categories'][feature]))
self._calculate_level_cell_label(left_cell, right_cell, new_cell)
new_cells.append(new_cell)
new_cells_by_id[new_cell['id']] = new_cell
self.cells_ = new_cells
self.cells_by_id_ = new_cells_by_id
# else: nothing to do, stay with previous cells
if nodes:
for node in nodes:
if self.dt_.tree_.feature[node] == -2: # leaf node
new_cell = self.cells_by_id_[node]
else:
left_child = self.dt_.tree_.children_left[node]
right_child = self.dt_.tree_.children_right[node]
left_cell = self.cells_by_id_[left_child]
right_cell = self.cells_by_id_[right_child]
new_cell = {'id': int(node), 'ranges': {}, 'categories': {},
'label': None, 'representative': None}
for feature in left_cell['ranges'].keys():
new_cell['ranges'][feature] = {}
new_cell['ranges'][feature]['start'] = left_cell['ranges'][feature]['start']
new_cell['ranges'][feature]['end'] = right_cell['ranges'][feature]['start']
for feature in left_cell['categories'].keys():
new_cell['categories'][feature] = \
list(set(left_cell['categories'][feature]) |
set(right_cell['categories'][feature]))
self._calculate_level_cell_label(left_cell, right_cell, new_cell)
new_cells.append(new_cell)
new_cells_by_id[new_cell['id']] = new_cell
self.cells_ = new_cells
self.cells_by_id_ = new_cells_by_id
# else: nothing to do, stay with previous cells
def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
new_cell['hist'] = [x + y for x, y in zip(left_cell['hist'], right_cell['hist'])]
@ -445,6 +455,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
stack = [(0, -1)] # seed is the root node id and its parent depth
while len(stack) > 0:
node_id, parent_depth = stack.pop()
# depth = distance from root
node_depth[node_id] = parent_depth + 1
if self.dt_.tree_.children_left[node_id] != self.dt_.tree_.children_right[node_id]:
@ -453,10 +464,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
else:
is_leaves[node_id] = True
# depth of entire tree
max_depth = max(node_depth)
# depth of current level
depth = max_depth - level
# level is higher than root
if depth < 0:
return None
# return all nodes with depth == level or leaves higher than level
return [i for i, x in enumerate(node_depth) if x == depth or (x < depth and is_leaves[i])]
def _attach_cells_representatives(self, samples, labels, level_nodes):
@ -518,12 +533,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
indexes = [j for j in range(len(mapping_to_cells)) if mapping_to_cells[j]['id'] == cells[i]['id']]
# replaces the values in the representative columns with the representative values
# (leaves others untouched)
if not representatives.columns.empty:
if indexes and not representatives.columns.empty:
if len(indexes) > 1:
replace = pd.concat([representatives.loc[i].to_frame().T]*len(indexes)).reset_index(drop=True)
replace.index = indexes
else:
replace = representatives.loc[i].to_frame().T
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
replace.index = indexes
generalized.loc[indexes, representatives.columns] = replace
return generalized.to_numpy()
@ -539,14 +554,16 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
node_ids = self._find_sample_nodes(samples, nodes)
return [cells_by_id[nodeId] for nodeId in node_ids]
def _remove_feature_from_generalization(self, samples, nodes, labels, feature_data):
feature = self._get_feature_to_remove(samples, nodes, labels, feature_data)
if not feature:
def _remove_feature_from_generalization(self, samples, nodes, labels, feature_data, current_accuracy):
feature = self._get_feature_to_remove(samples, nodes, labels, feature_data, current_accuracy)
if feature is None:
return None
GeneralizeToRepresentative._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
# del self.generalizations_['ranges'][feature]
# self.generalizations_['untouched'].append(feature)
return feature
def _get_feature_to_remove(self, samples, nodes, labels, feature_data):
def _get_feature_to_remove(self, samples, nodes, labels, feature_data, current_accuracy):
# We want to remove features with low iLoss (NCP) and high accuracy gain
# (after removing them)
ranges = self.generalizations_['ranges']
@ -567,13 +584,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cells_by_id = copy.deepcopy(self.cells_by_id_)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(samples, nodes, new_cells, cells_by_id)
accuracy = self.estimator.score(generalized, labels)
feature_ncp = feature_ncp / accuracy
accuracy_gain = self.estimator.score(generalized, labels) - current_accuracy
if accuracy_gain < 0:
accuracy_gain = 0
if accuracy_gain != 0:
feature_ncp = feature_ncp / accuracy_gain
if feature_ncp < range_min:
range_min = feature_ncp
remove_feature = feature
print('feature to remove: ' + (remove_feature if remove_feature else ''))
print('feature to remove: ' + (str(remove_feature) if remove_feature is not None else 'none'))
return remove_feature
def _calculate_generalizations(self):
@ -660,5 +681,3 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
del cell['categories'][feature]
cell['untouched'].append(feature)
cells_by_id[cell['id']] = cell.copy()

View file

@ -0,0 +1,260 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Applying data minimization to a trained ML model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this tutorial we will show how to perform data minimization for ML models using the minimization module. \n",
"\n",
"This will be demonstrated using the Adult dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/adult). \n",
"\n",
"We use only the numerical features in the dataset because this is what is currently supported by the module."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[3.9000e+01 1.3000e+01 2.1740e+03 0.0000e+00 4.0000e+01]\n",
" [5.0000e+01 1.3000e+01 0.0000e+00 0.0000e+00 1.3000e+01]\n",
" [3.8000e+01 9.0000e+00 0.0000e+00 0.0000e+00 4.0000e+01]\n",
" ...\n",
" [5.8000e+01 9.0000e+00 0.0000e+00 0.0000e+00 4.0000e+01]\n",
" [2.2000e+01 9.0000e+00 0.0000e+00 0.0000e+00 2.0000e+01]\n",
" [5.2000e+01 9.0000e+00 1.5024e+04 0.0000e+00 4.0000e+01]]\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
"x_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
" usecols=(0, 4, 10, 11, 12), delimiter=\", \")\n",
"\n",
"y_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
" usecols=14, dtype=str, delimiter=\", \")\n",
"\n",
"\n",
"x_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
" usecols=(0, 4, 10, 11, 12), delimiter=\", \", skiprows=1)\n",
"\n",
"y_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
" usecols=14, dtype=str, delimiter=\", \", skiprows=1)\n",
"\n",
"# Trim trailing period \".\" from label\n",
"y_test = np.array([a[:-1] for a in y_test])\n",
"\n",
"y_train[y_train == '<=50K'] = 0\n",
"y_train[y_train == '>50K'] = 1\n",
"y_train = y_train.astype(np.int)\n",
"\n",
"y_test[y_test == '<=50K'] = 0\n",
"y_test[y_test == '>50K'] = 1\n",
"y_test = y_test.astype(np.int)\n",
"\n",
"print(x_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.8189914624408821\n"
]
}
],
"source": [
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"model = DecisionTreeClassifier()\n",
"model.fit(x_train, y_train)\n",
"\n",
"print('Base model accuracy: ', model.score(x_test, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run minimization\n",
"We will try to run minimization with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
"Improving accuracy\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.939867\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.967247\n",
"feature to remove: 2\n",
"Removed feature: 2, new relative accuracy: 0.972620\n",
"feature to remove: 1\n",
"Removed feature: 1, new relative accuracy: 0.992323\n",
"feature to remove: 3\n",
"Removed feature: 3, new relative accuracy: 1.000000\n",
"Accuracy on minimized data: 0.8237371411024106\n"
]
}
],
"source": [
"import sys\n",
"import os\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"\n",
"from apt.minimization import GeneralizeToRepresentative\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# default target_accuracy is 0.998\n",
"minimizer = GeneralizeToRepresentative(model)\n",
"# Can be done either on training or test data. Doing it with test data is better as the resulting accuracy on test\n",
"# data will be closer to the desired target accuracy (when working with training data it could result in a larger gap)\n",
"# Don't forget to leave a hold-out set for final validation!\n",
"X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
" test_size = 0.4, random_state = 38)\n",
"x_train_predictions = model.predict(X_generalizer_train)\n",
"minimizer.fit(X_generalizer_train, x_train_predictions)\n",
"transformed = minimizer.transform(x_test)\n",
"\n",
"print('Accuracy on minimized data: ', model.score(transformed, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Let's see what features were generalized"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'ranges': {}, 'untouched': [0, 1, 2, 3, 4]}\n"
]
}
],
"source": [
"generalizations = minimizer.generalizations\n",
"print(generalizations)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
"\n",
"Let's change to a slightly lower target accuracy."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
"Improving accuracy\n",
"feature to remove: 0\n",
"Removed feature: 0, new relative accuracy: 0.939867\n",
"feature to remove: 4\n",
"Removed feature: 4, new relative accuracy: 0.967247\n",
"feature to remove: 2\n",
"Removed feature: 2, new relative accuracy: 0.972620\n",
"feature to remove: 1\n",
"Removed feature: 1, new relative accuracy: 0.992323\n",
"Accuracy on minimized data: 0.820205742361431\n",
"{'ranges': {3: [546.0, 704.0, 705.5, 742.5, 782.0, 834.0, 870.0, 1446.5, 1538.5, 1612.5, 1699.0, 1744.0, 1801.0, 1814.0, 1846.0, 1881.5, 1978.5, 2248.0, 2298.5, 2537.5]}, 'untouched': [0, 1, 2, 4]}\n"
]
}
],
"source": [
"# We allow a 2.5% deviation in accuracy from the original model accuracy\n",
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.975)\n",
"\n",
"minimizer2.fit(X_generalizer_train, x_train_predictions)\n",
"transformed2 = minimizer2.transform(x_test)\n",
"print('Accuracy on minimized data: ', model.score(transformed2, y_test))\n",
"generalizations2 = minimizer2.generalizations\n",
"print(generalizations2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This time we were able to generalize one feature, feature number 3 (capital-loss)."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}