mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-08 15:05:13 +02:00
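Revert to making `generalize_using_transform` an instance parameter (passed at init) instead of an argument to `fit` and `calculate_ncp`; `transform` and `fit_transform` now raise a ValueError when called while it is set to False.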
Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
256dfbbc71
commit
5e84f3fac4
2 changed files with 64 additions and 32 deletions
|
|
@ -68,6 +68,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
|
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
|
||||||
Default is False.
|
Default is False.
|
||||||
:type is_regression: boolean, optional
|
:type is_regression: boolean, optional
|
||||||
|
:param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization
|
||||||
|
process. True means that the `transform` method is used to transform original
|
||||||
|
data into generalized data that is used for accuracy and NCP calculation.
|
||||||
|
False indicates that the `generalizations` structure should be used.
|
||||||
|
Default is True.
|
||||||
|
:type generalize_using_transform: boolean, optional
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, estimator: Union[BaseEstimator, Model] = None,
|
def __init__(self, estimator: Union[BaseEstimator, Model] = None,
|
||||||
|
|
@ -77,7 +83,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
|
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
|
||||||
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
|
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
|
||||||
train_only_features_to_minimize: Optional[bool] = True,
|
train_only_features_to_minimize: Optional[bool] = True,
|
||||||
is_regression: Optional[bool] = False):
|
is_regression: Optional[bool] = False,
|
||||||
|
generalize_using_transform: bool = True):
|
||||||
|
|
||||||
self.estimator = estimator
|
self.estimator = estimator
|
||||||
if estimator is not None and not issubclass(estimator.__class__, Model):
|
if estimator is not None and not issubclass(estimator.__class__, Model):
|
||||||
|
|
@ -96,6 +103,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self.train_only_features_to_minimize = train_only_features_to_minimize
|
self.train_only_features_to_minimize = train_only_features_to_minimize
|
||||||
self.is_regression = is_regression
|
self.is_regression = is_regression
|
||||||
self.encoder = encoder
|
self.encoder = encoder
|
||||||
|
self.generalize_using_transform = generalize_using_transform
|
||||||
self._ncp_scores = NCPScores()
|
self._ncp_scores = NCPScores()
|
||||||
self._feature_data = None
|
self._feature_data = None
|
||||||
self._categorical_values = {}
|
self._categorical_values = {}
|
||||||
|
|
@ -199,11 +207,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
|
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
|
||||||
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
|
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
|
||||||
"""
|
"""
|
||||||
|
if not self.generalize_using_transform:
|
||||||
|
raise ValueError('fit_transform method called even though generalize_using_transform parameter was False. '
|
||||||
|
'This can lead to inconsistent results.')
|
||||||
self.fit(X, y, features_names, dataset=dataset)
|
self.fit(X, y, features_names, dataset=dataset)
|
||||||
return self.transform(X, features_names, dataset=dataset)
|
return self.transform(X, features_names, dataset=dataset)
|
||||||
|
|
||||||
def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
|
def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
|
||||||
features_names: Optional = None, dataset: ArrayDataset = None, generalize_using_transform: bool = True):
|
features_names: Optional = None, dataset: ArrayDataset = None):
|
||||||
"""Learns the generalizations based on training data. Also sets the fit_score and generalizations_score in
|
"""Learns the generalizations based on training data. Also sets the fit_score and generalizations_score in
|
||||||
self.ncp.
|
self.ncp.
|
||||||
|
|
||||||
|
|
@ -217,12 +228,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
|
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
|
||||||
on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
|
on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
|
||||||
:type dataset: `ArrayDataset`, optional
|
:type dataset: `ArrayDataset`, optional
|
||||||
:param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization
|
|
||||||
process. True means that the `transform` method is used to transform original
|
|
||||||
data into generalized data that is used for accuracy and NCP calculation.
|
|
||||||
False indicates that the `generalizations` structure should be used.
|
|
||||||
Default is True.
|
|
||||||
:type generalize_using_transform: boolean, optional
|
|
||||||
:return: self
|
:return: self
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -329,7 +334,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
|
|
||||||
# self._cells currently holds the generalization created from the tree leaves
|
# self._cells currently holds the generalization created from the tree leaves
|
||||||
self._calculate_generalizations(x_test)
|
self._calculate_generalizations(x_test)
|
||||||
if generalize_using_transform:
|
if self.generalize_using_transform:
|
||||||
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
|
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
|
||||||
else:
|
else:
|
||||||
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
|
||||||
|
|
@ -359,7 +364,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
|
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
|
||||||
|
|
||||||
self._calculate_generalizations(x_test)
|
self._calculate_generalizations(x_test)
|
||||||
if generalize_using_transform:
|
if self.generalize_using_transform:
|
||||||
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
|
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
|
||||||
self._cells_by_id)
|
self._cells_by_id)
|
||||||
else:
|
else:
|
||||||
|
|
@ -384,12 +389,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
removed_feature = self._remove_feature_from_generalization(x_test, x_prepared_test,
|
removed_feature = self._remove_feature_from_generalization(x_test, x_prepared_test,
|
||||||
nodes, y_test,
|
nodes, y_test,
|
||||||
self._feature_data, accuracy,
|
self._feature_data, accuracy,
|
||||||
generalize_using_transform)
|
self.generalize_using_transform)
|
||||||
if removed_feature is None:
|
if removed_feature is None:
|
||||||
break
|
break
|
||||||
|
|
||||||
self._calculate_generalizations(x_test)
|
self._calculate_generalizations(x_test)
|
||||||
if generalize_using_transform:
|
if self.generalize_using_transform:
|
||||||
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
|
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
|
||||||
self._cells_by_id)
|
self._cells_by_id)
|
||||||
else:
|
else:
|
||||||
|
|
@ -401,8 +406,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
|
|
||||||
# calculate iLoss
|
# calculate iLoss
|
||||||
x_test_dataset = ArrayDataset(x_test, features_names=self._features)
|
x_test_dataset = ArrayDataset(x_test, features_names=self._features)
|
||||||
self._ncp_scores.fit_score = self.calculate_ncp(x_test_dataset, generalize_using_transform)
|
self._ncp_scores.fit_score = self.calculate_ncp(x_test_dataset)
|
||||||
self._ncp_scores.generalizations_score = self.calculate_ncp(x_test_dataset, False)
|
self._ncp_scores.generalizations_score = self.calculate_ncp(x_test_dataset)
|
||||||
|
|
||||||
# Return the transformer
|
# Return the transformer
|
||||||
return self
|
return self
|
||||||
|
|
@ -422,12 +427,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
|
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
|
||||||
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
|
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
|
||||||
"""
|
"""
|
||||||
|
if not self.generalize_using_transform:
|
||||||
|
raise ValueError('transform method called even though generalize_using_transform parameter was False. This '
|
||||||
|
'can lead to inconsistent results.')
|
||||||
transformed = self._inner_transform(X, features_names, dataset)
|
transformed = self._inner_transform(X, features_names, dataset)
|
||||||
transformed_dataset = ArrayDataset(transformed, features_names=self._features)
|
transformed_dataset = ArrayDataset(transformed, features_names=self._features)
|
||||||
self._ncp_scores.transform_score = self.calculate_ncp(transformed_dataset, True)
|
self._ncp_scores.transform_score = self.calculate_ncp(transformed_dataset)
|
||||||
return transformed
|
return transformed
|
||||||
|
|
||||||
def calculate_ncp(self, samples: ArrayDataset, generalize_using_transform: bool = True):
|
def calculate_ncp(self, samples: ArrayDataset):
|
||||||
"""
|
"""
|
||||||
Compute the NCP score of the generalization. Calculation is based on the value of the
|
Compute the NCP score of the generalization. Calculation is based on the value of the
|
||||||
generalize_using_transform param. If samples are provided, updates stored ncp value to the one computed on the
|
generalize_using_transform param. If samples are provided, updates stored ncp value to the one computed on the
|
||||||
|
|
@ -438,11 +446,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
|
|
||||||
:param samples: The input samples to compute the NCP score on.
|
:param samples: The input samples to compute the NCP score on.
|
||||||
:type samples: ArrayDataset, optional. feature_names should be set.
|
:type samples: ArrayDataset, optional. feature_names should be set.
|
||||||
:param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization process.
|
|
||||||
True means that the `transform` method is used to transform original data into
|
|
||||||
generalized data that is used for accuracy and NCP calculation. False indicates
|
|
||||||
that the `generalizations` structure should be used. Default is True.
|
|
||||||
:type generalize_using_transform: boolean, optional
|
|
||||||
:return: NCP score as float.
|
:return: NCP score as float.
|
||||||
"""
|
"""
|
||||||
if not samples.features_names:
|
if not samples.features_names:
|
||||||
|
|
@ -454,7 +457,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
self._feature_data = self._get_feature_data(samples_pd)
|
self._feature_data = self._get_feature_data(samples_pd)
|
||||||
total_samples = samples_pd.shape[0]
|
total_samples = samples_pd.shape[0]
|
||||||
|
|
||||||
if generalize_using_transform:
|
if self.generalize_using_transform:
|
||||||
generalizations = self._calculate_cell_generalizations()
|
generalizations = self._calculate_cell_generalizations()
|
||||||
# count how many records are mapped to each cell
|
# count how many records are mapped to each cell
|
||||||
counted = np.zeros(samples_pd.shape[0]) # to mark records we already counted
|
counted = np.zeros(samples_pd.shape[0]) # to mark records we already counted
|
||||||
|
|
|
||||||
|
|
@ -256,8 +256,8 @@ def test_minimizer_params_not_transform(cells):
|
||||||
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
||||||
model.fit(ArrayDataset(x, y))
|
model.fit(ArrayDataset(x, y))
|
||||||
|
|
||||||
gen = GeneralizeToRepresentative(model, cells=cells)
|
gen = GeneralizeToRepresentative(model, cells=cells, generalize_using_transform=False)
|
||||||
ncp = gen.calculate_ncp(samples, generalize_using_transform=False)
|
ncp = gen.calculate_ncp(samples)
|
||||||
assert (ncp > 0.0)
|
assert (ncp > 0.0)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -304,10 +304,10 @@ def test_minimizer_ncp(data_two_features):
|
||||||
target_accuracy = 0.4
|
target_accuracy = 0.4
|
||||||
train_dataset = ArrayDataset(x, predictions, features_names=features)
|
train_dataset = ArrayDataset(x, predictions, features_names=features)
|
||||||
|
|
||||||
gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
|
gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, generalize_using_transform=False)
|
||||||
gen1.fit(dataset=train_dataset, generalize_using_transform=False)
|
gen1.fit(dataset=train_dataset)
|
||||||
ncp1 = gen1.ncp.fit_score
|
ncp1 = gen1.ncp.fit_score
|
||||||
ncp2 = gen1.calculate_ncp(ad1, generalize_using_transform=False)
|
ncp2 = gen1.calculate_ncp(ad1)
|
||||||
|
|
||||||
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
|
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
|
||||||
gen2.fit(dataset=train_dataset)
|
gen2.fit(dataset=train_dataset)
|
||||||
|
|
@ -348,10 +348,10 @@ def test_minimizer_ncp_categorical(data_four_features):
|
||||||
train_dataset = ArrayDataset(x, predictions, features_names=features)
|
train_dataset = ArrayDataset(x, predictions, features_names=features)
|
||||||
|
|
||||||
gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||||
categorical_features=categorical_features)
|
categorical_features=categorical_features, generalize_using_transform=False)
|
||||||
gen1.fit(dataset=train_dataset, generalize_using_transform=False)
|
gen1.fit(dataset=train_dataset)
|
||||||
ncp1 = gen1.ncp.fit_score
|
ncp1 = gen1.ncp.fit_score
|
||||||
ncp2 = gen1.calculate_ncp(ad1, generalize_using_transform=False)
|
ncp2 = gen1.calculate_ncp(ad1)
|
||||||
|
|
||||||
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features)
|
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features)
|
||||||
gen2.fit(dataset=train_dataset)
|
gen2.fit(dataset=train_dataset)
|
||||||
|
|
@ -381,10 +381,10 @@ def test_minimizer_fit_not_transform(data_two_features):
|
||||||
if predictions.shape[1] > 1:
|
if predictions.shape[1] > 1:
|
||||||
predictions = np.argmax(predictions, axis=1)
|
predictions = np.argmax(predictions, axis=1)
|
||||||
target_accuracy = 0.5
|
target_accuracy = 0.5
|
||||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
|
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, generalize_using_transform=False)
|
||||||
train_dataset = ArrayDataset(x, predictions, features_names=features)
|
train_dataset = ArrayDataset(x, predictions, features_names=features)
|
||||||
|
|
||||||
gen.fit(dataset=train_dataset, generalize_using_transform=False)
|
gen.fit(dataset=train_dataset)
|
||||||
gener = gen.generalizations
|
gener = gen.generalizations
|
||||||
expected_generalizations = {'ranges': {'age': [], 'height': [157.0]}, 'categories': {}, 'untouched': []}
|
expected_generalizations = {'ranges': {'age': [], 'height': [157.0]}, 'categories': {}, 'untouched': []}
|
||||||
|
|
||||||
|
|
@ -954,3 +954,32 @@ def test_untouched():
|
||||||
gener = gen.generalizations
|
gener = gen.generalizations
|
||||||
expected_generalizations = {'ranges': {'age': [38, 39]}, 'categories': {}, 'untouched': ['gender']}
|
expected_generalizations = {'ranges': {'age': [38, 39]}, 'categories': {}, 'untouched': ['gender']}
|
||||||
compare_generalizations(gener, expected_generalizations)
|
compare_generalizations(gener, expected_generalizations)
|
||||||
|
|
||||||
|
|
||||||
|
def test_errors():
|
||||||
|
features = ['age', 'height']
|
||||||
|
X = np.array([[23, 165],
|
||||||
|
[45, 158],
|
||||||
|
[56, 123],
|
||||||
|
[67, 154],
|
||||||
|
[45, 149],
|
||||||
|
[42, 166],
|
||||||
|
[73, 172],
|
||||||
|
[94, 168],
|
||||||
|
[69, 175],
|
||||||
|
[24, 181],
|
||||||
|
[18, 190]])
|
||||||
|
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
|
||||||
|
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||||
|
min_samples_leaf=1)
|
||||||
|
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
|
||||||
|
model.fit(ArrayDataset(X, y))
|
||||||
|
ad = ArrayDataset(X)
|
||||||
|
predictions = model.predict(ad)
|
||||||
|
if predictions.shape[1] > 1:
|
||||||
|
predictions = np.argmax(predictions, axis=1)
|
||||||
|
gen = GeneralizeToRepresentative(model, generalize_using_transform=False)
|
||||||
|
train_dataset = ArrayDataset(X, predictions, features_names=features)
|
||||||
|
gen.fit(dataset=train_dataset)
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
gen.transform(X)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue