Revert to having generalize_using_transform as an instance param (passed at init) and throwing an exception when used incorrectly.

Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
abigailt 2023-08-21 18:09:06 +03:00
parent 256dfbbc71
commit 5e84f3fac4
2 changed files with 64 additions and 32 deletions

View file

@ -68,6 +68,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model). :param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
Default is False. Default is False.
:type is_regression: boolean, optional :type is_regression: boolean, optional
:param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization
process. True means that the `transform` method is used to transform original
data into generalized data that is used for accuracy and NCP calculation.
False indicates that the `generalizations` structure should be used.
Default is True.
:type generalize_using_transform: boolean, optional
""" """
def __init__(self, estimator: Union[BaseEstimator, Model] = None, def __init__(self, estimator: Union[BaseEstimator, Model] = None,
@ -77,7 +83,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None, encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
features_to_minimize: Optional[Union[np.ndarray, list]] = None, features_to_minimize: Optional[Union[np.ndarray, list]] = None,
train_only_features_to_minimize: Optional[bool] = True, train_only_features_to_minimize: Optional[bool] = True,
is_regression: Optional[bool] = False): is_regression: Optional[bool] = False,
generalize_using_transform: bool = True):
self.estimator = estimator self.estimator = estimator
if estimator is not None and not issubclass(estimator.__class__, Model): if estimator is not None and not issubclass(estimator.__class__, Model):
@ -96,6 +103,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.train_only_features_to_minimize = train_only_features_to_minimize self.train_only_features_to_minimize = train_only_features_to_minimize
self.is_regression = is_regression self.is_regression = is_regression
self.encoder = encoder self.encoder = encoder
self.generalize_using_transform = generalize_using_transform
self._ncp_scores = NCPScores() self._ncp_scores = NCPScores()
self._feature_data = None self._feature_data = None
self._categorical_values = {} self._categorical_values = {}
@ -199,11 +207,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or :return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features) pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
""" """
if not self.generalize_using_transform:
raise ValueError('fit_transform method called even though generalize_using_transform parameter was False. '
'This can lead to inconsistent results.')
self.fit(X, y, features_names, dataset=dataset) self.fit(X, y, features_names, dataset=dataset)
return self.transform(X, features_names, dataset=dataset) return self.transform(X, features_names, dataset=dataset)
def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None, def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional = None, dataset: ArrayDataset = None, generalize_using_transform: bool = True): features_names: Optional = None, dataset: ArrayDataset = None):
"""Learns the generalizations based on training data. Also sets the fit_score and generalizations_score in """Learns the generalizations based on training data. Also sets the fit_score and generalizations_score in
self.ncp. self.ncp.
@ -217,12 +228,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:param dataset: Data wrapper containing the training input samples and the predictions of the original model :param dataset: Data wrapper containing the training input samples and the predictions of the original model
on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both. on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
:type dataset: `ArrayDataset`, optional :type dataset: `ArrayDataset`, optional
:param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization
process. True means that the `transform` method is used to transform original
data into generalized data that is used for accuracy and NCP calculation.
False indicates that the `generalizations` structure should be used.
Default is True.
:type generalize_using_transform: boolean, optional
:return: self :return: self
""" """
@ -329,7 +334,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# self._cells currently holds the generalization created from the tree leaves # self._cells currently holds the generalization created from the tree leaves
self._calculate_generalizations(x_test) self._calculate_generalizations(x_test)
if generalize_using_transform: if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id) generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
else: else:
generalized = self._generalize_from_generalizations(x_test, self.generalizations) generalized = self._generalize_from_generalizations(x_test, self.generalizations)
@ -359,7 +364,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes) self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
self._calculate_generalizations(x_test) self._calculate_generalizations(x_test)
if generalize_using_transform: if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id) self._cells_by_id)
else: else:
@ -384,12 +389,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
removed_feature = self._remove_feature_from_generalization(x_test, x_prepared_test, removed_feature = self._remove_feature_from_generalization(x_test, x_prepared_test,
nodes, y_test, nodes, y_test,
self._feature_data, accuracy, self._feature_data, accuracy,
generalize_using_transform) self.generalize_using_transform)
if removed_feature is None: if removed_feature is None:
break break
self._calculate_generalizations(x_test) self._calculate_generalizations(x_test)
if generalize_using_transform: if self.generalize_using_transform:
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id) self._cells_by_id)
else: else:
@ -401,8 +406,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# calculate iLoss # calculate iLoss
x_test_dataset = ArrayDataset(x_test, features_names=self._features) x_test_dataset = ArrayDataset(x_test, features_names=self._features)
self._ncp_scores.fit_score = self.calculate_ncp(x_test_dataset, generalize_using_transform) self._ncp_scores.fit_score = self.calculate_ncp(x_test_dataset)
self._ncp_scores.generalizations_score = self.calculate_ncp(x_test_dataset, False) self._ncp_scores.generalizations_score = self.calculate_ncp(x_test_dataset)
# Return the transformer # Return the transformer
return self return self
@ -422,12 +427,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or :return: Array containing the representative values to which each record in ``X`` is mapped, as numpy array or
pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features) pandas DataFrame (depending on the type of ``X``), shape (n_samples, n_features)
""" """
if not self.generalize_using_transform:
raise ValueError('transform method called even though generalize_using_transform parameter was False. This '
'can lead to inconsistent results.')
transformed = self._inner_transform(X, features_names, dataset) transformed = self._inner_transform(X, features_names, dataset)
transformed_dataset = ArrayDataset(transformed, features_names=self._features) transformed_dataset = ArrayDataset(transformed, features_names=self._features)
self._ncp_scores.transform_score = self.calculate_ncp(transformed_dataset, True) self._ncp_scores.transform_score = self.calculate_ncp(transformed_dataset)
return transformed return transformed
def calculate_ncp(self, samples: ArrayDataset, generalize_using_transform: bool = True): def calculate_ncp(self, samples: ArrayDataset):
""" """
Compute the NCP score of the generalization. Calculation is based on the value of the Compute the NCP score of the generalization. Calculation is based on the value of the
generalize_using_transform param. If samples are provided, updates stored ncp value to the one computed on the generalize_using_transform param. If samples are provided, updates stored ncp value to the one computed on the
@ -438,11 +446,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:param samples: The input samples to compute the NCP score on. :param samples: The input samples to compute the NCP score on.
:type samples: ArrayDataset, optional. feature_names should be set. :type samples: ArrayDataset, optional. feature_names should be set.
:param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization process.
True means that the `transform` method is used to transform original data into
generalized data that is used for accuracy and NCP calculation. False indicates
that the `generalizations` structure should be used. Default is True.
:type generalize_using_transform: boolean, optional
:return: NCP score as float. :return: NCP score as float.
""" """
if not samples.features_names: if not samples.features_names:
@ -454,7 +457,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._feature_data = self._get_feature_data(samples_pd) self._feature_data = self._get_feature_data(samples_pd)
total_samples = samples_pd.shape[0] total_samples = samples_pd.shape[0]
if generalize_using_transform: if self.generalize_using_transform:
generalizations = self._calculate_cell_generalizations() generalizations = self._calculate_cell_generalizations()
# count how many records are mapped to each cell # count how many records are mapped to each cell
counted = np.zeros(samples_pd.shape[0]) # to mark records we already counted counted = np.zeros(samples_pd.shape[0]) # to mark records we already counted

View file

@ -256,8 +256,8 @@ def test_minimizer_params_not_transform(cells):
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(x, y)) model.fit(ArrayDataset(x, y))
gen = GeneralizeToRepresentative(model, cells=cells) gen = GeneralizeToRepresentative(model, cells=cells, generalize_using_transform=False)
ncp = gen.calculate_ncp(samples, generalize_using_transform=False) ncp = gen.calculate_ncp(samples)
assert (ncp > 0.0) assert (ncp > 0.0)
@ -304,10 +304,10 @@ def test_minimizer_ncp(data_two_features):
target_accuracy = 0.4 target_accuracy = 0.4
train_dataset = ArrayDataset(x, predictions, features_names=features) train_dataset = ArrayDataset(x, predictions, features_names=features)
gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy) gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, generalize_using_transform=False)
gen1.fit(dataset=train_dataset, generalize_using_transform=False) gen1.fit(dataset=train_dataset)
ncp1 = gen1.ncp.fit_score ncp1 = gen1.ncp.fit_score
ncp2 = gen1.calculate_ncp(ad1, generalize_using_transform=False) ncp2 = gen1.calculate_ncp(ad1)
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy) gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
gen2.fit(dataset=train_dataset) gen2.fit(dataset=train_dataset)
@ -348,10 +348,10 @@ def test_minimizer_ncp_categorical(data_four_features):
train_dataset = ArrayDataset(x, predictions, features_names=features) train_dataset = ArrayDataset(x, predictions, features_names=features)
gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features) categorical_features=categorical_features, generalize_using_transform=False)
gen1.fit(dataset=train_dataset, generalize_using_transform=False) gen1.fit(dataset=train_dataset)
ncp1 = gen1.ncp.fit_score ncp1 = gen1.ncp.fit_score
ncp2 = gen1.calculate_ncp(ad1, generalize_using_transform=False) ncp2 = gen1.calculate_ncp(ad1)
gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features) gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features)
gen2.fit(dataset=train_dataset) gen2.fit(dataset=train_dataset)
@ -381,10 +381,10 @@ def test_minimizer_fit_not_transform(data_two_features):
if predictions.shape[1] > 1: if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1) predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.5 target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy) gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, generalize_using_transform=False)
train_dataset = ArrayDataset(x, predictions, features_names=features) train_dataset = ArrayDataset(x, predictions, features_names=features)
gen.fit(dataset=train_dataset, generalize_using_transform=False) gen.fit(dataset=train_dataset)
gener = gen.generalizations gener = gen.generalizations
expected_generalizations = {'ranges': {'age': [], 'height': [157.0]}, 'categories': {}, 'untouched': []} expected_generalizations = {'ranges': {'age': [], 'height': [157.0]}, 'categories': {}, 'untouched': []}
@ -954,3 +954,32 @@ def test_untouched():
gener = gen.generalizations gener = gen.generalizations
expected_generalizations = {'ranges': {'age': [38, 39]}, 'categories': {}, 'untouched': ['gender']} expected_generalizations = {'ranges': {'age': [38, 39]}, 'categories': {}, 'untouched': ['gender']}
compare_generalizations(gener, expected_generalizations) compare_generalizations(gener, expected_generalizations)
def test_errors():
    """Verify that transform-based entry points are rejected when the minimizer
    was created with ``generalize_using_transform=False``.

    Both ``transform`` and ``fit_transform`` are guarded by a ``ValueError``; each
    guard is checked against its own message so that the test cannot pass by
    accident on an unrelated ``ValueError`` raised elsewhere.
    """
    features = ['age', 'height']
    X = np.array([[23, 165],
                  [45, 158],
                  [56, 123],
                  [67, 154],
                  [45, 149],
                  [42, 166],
                  [73, 172],
                  [94, 168],
                  [69, 175],
                  [24, 181],
                  [18, 190]])
    y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(X, y))

    # Use the model's own predictions as labels for the minimizer, collapsing
    # per-class probabilities to a single class index when needed.
    ad = ArrayDataset(X)
    predictions = model.predict(ad)
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

    gen = GeneralizeToRepresentative(model, generalize_using_transform=False)
    train_dataset = ArrayDataset(X, predictions, features_names=features)
    gen.fit(dataset=train_dataset)

    # `match` is applied with re.search; anchoring with ^ distinguishes the
    # `transform` message from the `fit_transform` one, which contains it.
    with pytest.raises(ValueError, match=r'^transform method called'):
        gen.transform(X)

    # The same misuse through fit_transform must be rejected as well.
    with pytest.raises(ValueError, match=r'^fit_transform method called'):
        gen.fit_transform(dataset=train_dataset)