mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-08 15:05:13 +02:00
Consistent one-hot-encoding (#38)
* Reuse code between generalize and transform methods
* Option to get encoder from user
* Consistent encoding for decision tree and generalizations (separate from target model encoding)
This commit is contained in:
parent
7055d5ecf6
commit
dfa684da6b
2 changed files with 153 additions and 128 deletions
|
|
@ -11,7 +11,7 @@ from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin
|
|||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
|
||||
from sklearn.utils.validation import check_is_fitted
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
|
@ -47,12 +47,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
:type cells: list of objects, optional
|
||||
:param categorical_features: The list of categorical features (if supplied, these features will be one-hot
|
||||
encoded before using them to train the decision tree model).
|
||||
:param encoder: Optional encoder for encoding data before feeding it into the estimator (e.g., for categorical
|
||||
features)
|
||||
:type encoder: sklearn OrdinalEncoder or OneHotEncoder
|
||||
:type categorical_features: list of strings, optional
|
||||
:param features_to_minimize: The features to be minimized.
|
||||
:type features_to_minimize: list of strings or int, optional
|
||||
:param train_only_QI: Whether to train the tree just on the ``features_to_minimize`` or on all features. Default
|
||||
is only on ``features_to_minimize``.
|
||||
:type train_only_QI: boolean, optional
|
||||
:param train_only_features_to_minimize: Whether to train the tree just on the ``features_to_minimize`` or on all
|
||||
features. Default is only on ``features_to_minimize``.
|
||||
:type train_only_features_to_minimize: boolean, optional
|
||||
:param is_regression: Whether the model is a regression model or not (if False, assumes a classification model).
|
||||
Default is False.
|
||||
:type is_regression: boolean, optional
|
||||
|
|
@ -60,7 +63,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: Optional[float] = 0.998,
|
||||
cells: Optional[list] = None, categorical_features: Optional[Union[np.ndarray, list]] = None,
|
||||
features_to_minimize: Optional[Union[np.ndarray, list]] = None, train_only_QI: Optional[bool] = True,
|
||||
encoder: Optional[Union[OrdinalEncoder, OneHotEncoder]] = None,
|
||||
features_to_minimize: Optional[Union[np.ndarray, list]] = None,
|
||||
train_only_features_to_minimize: Optional[bool] = True,
|
||||
is_regression: Optional[bool] = False):
|
||||
|
||||
self.estimator = estimator
|
||||
|
|
@ -75,8 +80,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
if categorical_features:
|
||||
self.categorical_features = categorical_features
|
||||
self.features_to_minimize = features_to_minimize
|
||||
self.train_only_QI = train_only_QI
|
||||
self.train_only_features_to_minimize = train_only_features_to_minimize
|
||||
self.is_regression = is_regression
|
||||
self.encoder = encoder
|
||||
|
||||
def get_params(self, deep=True):
|
||||
"""
|
||||
|
|
@ -89,9 +95,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
"""
|
||||
ret = {}
|
||||
ret['target_accuracy'] = self.target_accuracy
|
||||
ret['categorical_features'] = self.categorical_features
|
||||
ret['features_to_minimize'] = self.features_to_minimize
|
||||
ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
|
||||
ret['is_regression'] = self.is_regression
|
||||
if deep:
|
||||
ret['cells'] = copy.deepcopy(self.cells)
|
||||
ret['estimator'] = self.estimator
|
||||
ret['encoder'] = self.encoder
|
||||
else:
|
||||
ret['cells'] = copy.copy(self.cells)
|
||||
return ret
|
||||
|
|
@ -111,6 +122,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
"""
|
||||
if 'target_accuracy' in params:
|
||||
self.target_accuracy = params['target_accuracy']
|
||||
if 'categorical_features' in params:
|
||||
self.categorical_features = params['categorical_features']
|
||||
if 'features_to_minimize' in params:
|
||||
self.features_to_minimize = params['features_to_minimize']
|
||||
if 'train_only_features_to_minimize' in params:
|
||||
self.train_only_features_to_minimize = params['train_only_features_to_minimize']
|
||||
if 'is_regression' in params:
|
||||
self.is_regression = params['is_regression']
|
||||
if 'cells' in params:
|
||||
self.cells = params['cells']
|
||||
return self
|
||||
|
|
@ -208,7 +227,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
# divide dataset into train and test
|
||||
used_data = x
|
||||
if self.train_only_QI:
|
||||
if self.train_only_features_to_minimize:
|
||||
used_data = x_QI
|
||||
if self.is_regression:
|
||||
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4, random_state=14)
|
||||
|
|
@ -219,8 +238,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
X_train_QI = X_train.loc[:, self.features_to_minimize]
|
||||
X_test_QI = X_test.loc[:, self.features_to_minimize]
|
||||
used_X_train = X_train
|
||||
if self.train_only_QI:
|
||||
used_X_test = X_test
|
||||
if self.train_only_features_to_minimize:
|
||||
used_X_train = X_train_QI
|
||||
used_X_test = X_test_QI
|
||||
|
||||
# collect feature data (such as min, max)
|
||||
feature_data = {}
|
||||
|
|
@ -236,46 +257,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
fd['range'] = len(np.unique(values))
|
||||
feature_data[feature] = fd
|
||||
|
||||
# prepare data for DT
|
||||
|
||||
# preprocessor to fit data that have features not included in QI (to get accuracy)
|
||||
numeric_features = [f for f in self._features if f not in self.categorical_features]
|
||||
numeric_transformer = Pipeline(
|
||||
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
|
||||
)
|
||||
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
|
||||
preprocessor = ColumnTransformer(
|
||||
transformers=[
|
||||
("num", numeric_transformer, numeric_features),
|
||||
("cat", categorical_transformer, self.categorical_features),
|
||||
]
|
||||
)
|
||||
preprocessor.fit(x)
|
||||
|
||||
if self.train_only_QI:
|
||||
categorical_features = [f for f in self._features if f in self.categorical_features and
|
||||
f in self.features_to_minimize]
|
||||
|
||||
# default encoder in case none provided
|
||||
if self.encoder is None:
|
||||
numeric_features = [f for f in self._features if f not in self.categorical_features]
|
||||
numeric_transformer = Pipeline(
|
||||
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
|
||||
)
|
||||
|
||||
numeric_features = [f for f in self._features if f not in self.categorical_features and
|
||||
f in self.features_to_minimize]
|
||||
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
|
||||
|
||||
preprocessor_QI_features = ColumnTransformer(
|
||||
self.encoder = ColumnTransformer(
|
||||
transformers=[
|
||||
("num", numeric_transformer, numeric_features),
|
||||
("cat", categorical_transformer, categorical_features),
|
||||
("cat", categorical_transformer, self.categorical_features),
|
||||
]
|
||||
)
|
||||
preprocessor_QI_features.fit(x_QI)
|
||||
x_prepared = preprocessor_QI_features.transform(X_train_QI)
|
||||
else:
|
||||
x_prepared = preprocessor.transform(X_train)
|
||||
|
||||
self._preprocessor = preprocessor
|
||||
self.encoder.fit(x)
|
||||
|
||||
self.cells = []
|
||||
self._categorical_values = {}
|
||||
|
|
@ -285,11 +280,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
else:
|
||||
self._dt = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
|
||||
# prepare data for DT
|
||||
self._encode_categorical_features(used_data, save_mapping=True)
|
||||
x_prepared = self._encode_categorical_features(used_X_train)
|
||||
self._dt.fit(x_prepared, y_train)
|
||||
|
||||
self._modify_categorical_features(used_data)
|
||||
|
||||
x_prepared = pd.DataFrame(x_prepared, columns=self._categorical_data.columns)
|
||||
x_prepared_test = self._encode_categorical_features(used_X_test)
|
||||
|
||||
self._calculate_cells()
|
||||
self._modify_cells()
|
||||
|
|
@ -303,19 +299,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
# self._cells currently holds the generalization created from the tree leaves
|
||||
self._calculate_generalizations()
|
||||
|
||||
# apply generalizations to test data
|
||||
if self.train_only_QI:
|
||||
x_prepared_test = preprocessor_QI_features.transform(X_test_QI)
|
||||
else:
|
||||
x_prepared_test = preprocessor.transform(X_test)
|
||||
|
||||
x_prepared_test = pd.DataFrame(x_prepared_test, index=X_test.index, columns=self._categorical_data.columns)
|
||||
|
||||
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
|
||||
|
||||
# check accuracy
|
||||
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
|
||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
||||
print('Initial accuracy of model on generalized data, relative to original model predictions '
|
||||
'(base generalization derived from tree, before improvements): %f' % accuracy)
|
||||
|
||||
|
|
@ -340,7 +327,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self._calculate_generalizations()
|
||||
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells,
|
||||
self._cells_by_id)
|
||||
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
|
||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
||||
# if accuracy passed threshold roll back to previous iteration generalizations
|
||||
if accuracy < self.target_accuracy:
|
||||
self.cells = cells_previous_iter
|
||||
|
|
@ -364,7 +351,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
self._calculate_generalizations()
|
||||
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
|
||||
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
|
||||
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
|
||||
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
|
||||
|
||||
# self._cells currently holds the chosen generalization based on target accuracy
|
||||
|
|
@ -416,38 +403,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
if not self._features:
|
||||
self._features = [i for i in range(x.shape[1])]
|
||||
|
||||
representatives = pd.DataFrame(columns=self._features) # only columns
|
||||
generalized = pd.DataFrame(x, columns=self._features, copy=True) # original data
|
||||
mapped = np.zeros(x.shape[0]) # to mark records we already mapped
|
||||
|
||||
# iterate over cells (leaves in decision tree)
|
||||
all_indexes = []
|
||||
for i in range(len(self.cells)):
|
||||
# Copy the representatives from the cells into another data structure:
|
||||
# iterate over features in test data
|
||||
for feature in self._features:
|
||||
# if feature has a representative value in the cell and should not
|
||||
# be left untouched, take the representative value
|
||||
if feature in self.cells[i]['representative'] and \
|
||||
('untouched' not in self.cells[i]
|
||||
or feature not in self.cells[i]['untouched']):
|
||||
representatives.loc[i, feature] = self.cells[i]['representative'][feature]
|
||||
# else, drop the feature (removes from representatives columns that
|
||||
# do not have a representative value or should remain untouched)
|
||||
elif feature in representatives.columns.tolist():
|
||||
representatives = representatives.drop(feature, axis=1)
|
||||
|
||||
# get the indexes of all records that map to this cell
|
||||
indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped)
|
||||
all_indexes.append(indexes)
|
||||
generalized = self._generalize_indexes(x, self.cells, all_indexes)
|
||||
|
||||
# replace the values in the representative columns with the representative
|
||||
# values (leaves others untouched)
|
||||
if indexes and not representatives.columns.empty:
|
||||
if len(indexes) > 1:
|
||||
replace = pd.concat([representatives.loc[i].to_frame().T] * len(indexes)).reset_index(drop=True)
|
||||
else:
|
||||
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
|
||||
replace.index = indexes
|
||||
generalized.loc[indexes, representatives.columns] = replace
|
||||
if dataset and dataset.is_pandas:
|
||||
return generalized
|
||||
elif isinstance(X, pd.DataFrame):
|
||||
|
|
@ -477,29 +439,36 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
mapped.itemset(i, 1)
|
||||
return True
|
||||
|
||||
def _modify_categorical_features(self, X):
|
||||
self._categorical_values = {}
|
||||
self._one_hot_vector_features_to_features = {}
|
||||
def _encode_categorical_features(self, X, save_mapping=False):
|
||||
if save_mapping:
|
||||
self._categorical_values = {}
|
||||
self._one_hot_vector_features_to_features = {}
|
||||
features_to_remove = []
|
||||
used_features = self._features
|
||||
if self.train_only_QI:
|
||||
if self.train_only_features_to_minimize:
|
||||
used_features = self.features_to_minimize
|
||||
for feature in self.categorical_features:
|
||||
if feature in used_features:
|
||||
try:
|
||||
all_values = X.loc[:, feature]
|
||||
values = list(all_values.unique())
|
||||
self._categorical_values[feature] = values
|
||||
X[feature] = pd.Categorical(X.loc[:, feature], categories=values, ordered=False)
|
||||
if save_mapping:
|
||||
self._categorical_values[feature] = values
|
||||
X[feature] = pd.Categorical(X.loc[:, feature], categories=self._categorical_values[feature],
|
||||
ordered=False)
|
||||
ohe = pd.get_dummies(X[feature], prefix=feature)
|
||||
for one_hot_vector_feature in ohe.columns:
|
||||
self._one_hot_vector_features_to_features[one_hot_vector_feature] = feature
|
||||
if save_mapping:
|
||||
for one_hot_vector_feature in ohe.columns:
|
||||
self._one_hot_vector_features_to_features[one_hot_vector_feature] = feature
|
||||
X = pd.concat([X, ohe], axis=1)
|
||||
features_to_remove.append(feature)
|
||||
except KeyError:
|
||||
print("feature " + feature + "not found in training data")
|
||||
|
||||
self._categorical_data = X.drop(features_to_remove, axis=1)
|
||||
new_data = X.drop(features_to_remove, axis=1)
|
||||
if save_mapping:
|
||||
self._encoded_features = new_data.columns
|
||||
return new_data
|
||||
|
||||
def _cell_contains_numeric(self, f, range, x):
|
||||
i = self._features.index(f)
|
||||
|
|
@ -538,7 +507,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
return [cell]
|
||||
|
||||
cells = []
|
||||
feature = self._categorical_data.columns[feature_index]
|
||||
feature = self._encoded_features[feature_index]
|
||||
threshold = self._dt.tree_.threshold[node]
|
||||
left_child = self._dt.tree_.children_left[node]
|
||||
right_child = self._dt.tree_.children_right[node]
|
||||
|
|
@ -569,7 +538,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
def _modify_cells(self):
|
||||
cells = []
|
||||
features = self._categorical_data.columns
|
||||
features = self._encoded_features
|
||||
for cell in self.cells:
|
||||
new_cell = {'id': cell['id'], 'label': cell['label'], 'ranges': {}, 'categories': {}, 'hist': cell['hist'],
|
||||
'untouched': [], 'representative': None}
|
||||
|
|
@ -711,11 +680,19 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
return [(list(set([i for i, v in enumerate(p) if v == 1]) & nodeSet))[0] for p in paths]
|
||||
|
||||
def _generalize(self, original_data, prepared_data, level_nodes, cells, cells_by_id):
|
||||
mapping_to_cells = self._map_to_cells(prepared_data, level_nodes, cells_by_id)
|
||||
all_indexes = []
|
||||
for i in range(len(cells)):
|
||||
# get the indexes of all records that map to this cell
|
||||
indexes = [j for j in mapping_to_cells if mapping_to_cells[j]['id'] == cells[i]['id']]
|
||||
all_indexes.append(indexes)
|
||||
return self._generalize_indexes(original_data, cells, all_indexes)
|
||||
|
||||
def _generalize_indexes(self, original_data, cells, all_indexes):
|
||||
# prepared data include one hot encoded categorical data + QI
|
||||
representatives = pd.DataFrame(columns=self._features) # empty except for columns
|
||||
generalized = pd.DataFrame(prepared_data, columns=self._categorical_data.columns, copy=True)
|
||||
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
|
||||
mapping_to_cells = self._map_to_cells(generalized, level_nodes, cells_by_id)
|
||||
|
||||
# iterate over cells (leaves in decision tree)
|
||||
for i in range(len(cells)):
|
||||
# This code just copies the representatives from the cells into another data structure
|
||||
|
|
@ -731,9 +708,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
elif feature in representatives.columns.tolist():
|
||||
representatives = representatives.drop(feature, axis=1)
|
||||
|
||||
# get the indexes of all records that map to this cell
|
||||
indexes = [j for j in mapping_to_cells if mapping_to_cells[j]['id'] == cells[i]['id']]
|
||||
|
||||
indexes = all_indexes[i]
|
||||
# replaces the values in the representative columns with the representative values
|
||||
# (leaves others untouched)
|
||||
if indexes and not representatives.columns.empty:
|
||||
|
|
@ -794,7 +769,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
cells_by_id = copy.deepcopy(self._cells_by_id)
|
||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
|
||||
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
|
||||
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
|
||||
labels)) - current_accuracy
|
||||
if accuracy_gain < 0:
|
||||
accuracy_gain = 0
|
||||
|
|
@ -817,7 +792,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
cells_by_id = copy.deepcopy(self._cells_by_id)
|
||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
|
||||
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
|
||||
accuracy_gain = self.estimator.score(ArrayDataset(self.encoder.transform(generalized),
|
||||
labels)) - current_accuracy
|
||||
|
||||
if accuracy_gain < 0:
|
||||
|
|
@ -838,7 +813,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self._remove_categorical_untouched(self._generalizations)
|
||||
|
||||
def _find_range_count(self, samples, ranges):
|
||||
samples_df = pd.DataFrame(samples, columns=self._categorical_data.columns)
|
||||
samples_df = pd.DataFrame(samples, columns=self._encoded_features)
|
||||
range_counts = {}
|
||||
last_value = None
|
||||
for r in ranges.keys():
|
||||
|
|
|
|||
|
|
@ -74,8 +74,8 @@ def test_minimizer_fit(data):
|
|||
predictions = model.predict(ad)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
|
||||
train_dataset = ArrayDataset(X, predictions, features_names=features)
|
||||
|
||||
gen.fit(dataset=train_dataset)
|
||||
|
|
@ -102,6 +102,9 @@ def test_minimizer_fit(data):
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_minimizer_fit_pandas(data):
|
||||
features = ['age', 'height', 'sex', 'ola']
|
||||
|
|
@ -145,7 +148,8 @@ def test_minimizer_fit_pandas(data):
|
|||
|
||||
# Append classifier to preprocessing pipeline.
|
||||
# Now we have a full prediction pipeline.
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features)
|
||||
train_dataset = ArrayDataset(X, predictions)
|
||||
gen.fit(dataset=train_dataset)
|
||||
|
|
@ -169,6 +173,9 @@ def test_minimizer_fit_pandas(data):
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[modified_features]).equals(X[modified_features])) == False)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_minimizer_params_categorical(data):
|
||||
# Assume three features, age, sex and height, and boolean label
|
||||
|
|
@ -226,12 +233,16 @@ def test_minimizer_params_categorical(data):
|
|||
predictions = np.argmax(predictions, axis=1)
|
||||
# Append classifier to preprocessing pipeline.
|
||||
# Now we have a full prediction pipeline.
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features, cells=cells)
|
||||
train_dataset = ArrayDataset(X, predictions)
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ArrayDataset(X))
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_minimizer_fit_QI(data):
|
||||
features = ['age', 'height', 'weight']
|
||||
|
|
@ -257,8 +268,8 @@ def test_minimizer_fit_QI(data):
|
|||
predictions = model.predict(ad)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
|
||||
train_dataset = ArrayDataset(X, predictions, features_names=features)
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ad)
|
||||
|
|
@ -284,6 +295,9 @@ def test_minimizer_fit_QI(data):
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_minimizer_fit_pandas_QI(data):
|
||||
features = ['age', 'height', 'weight', 'sex', 'ola']
|
||||
|
|
@ -329,7 +343,8 @@ def test_minimizer_fit_pandas_QI(data):
|
|||
|
||||
# Append classifier to preprocessing pipeline.
|
||||
# Now we have a full prediction pipeline.
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features, features_to_minimize=QI)
|
||||
train_dataset = ArrayDataset(X, predictions)
|
||||
gen.fit(dataset=train_dataset)
|
||||
|
|
@ -356,6 +371,9 @@ def test_minimizer_fit_pandas_QI(data):
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[modified_features]).equals(X[modified_features])) == False)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_minimize_ndarray_iris():
|
||||
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
|
||||
|
|
@ -368,8 +386,8 @@ def test_minimize_ndarray_iris():
|
|||
predictions = model.predict(ArrayDataset(x_train))
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features_to_minimize=QI)
|
||||
target_accuracy = 0.3
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
|
||||
# gen.fit(dataset=ArrayDataset(x_train, predictions))
|
||||
transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
|
||||
gener = gen.generalizations
|
||||
|
|
@ -397,6 +415,9 @@ def test_minimize_ndarray_iris():
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[indexes]) != (x_train[indexes])).any())
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_minimize_pandas_adult():
|
||||
(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()
|
||||
|
|
@ -433,8 +454,8 @@ def test_minimize_pandas_adult():
|
|||
predictions = model.predict(ArrayDataset(encoded))
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
|
||||
target_accuracy = 0.7
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features, features_to_minimize=QI)
|
||||
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
|
||||
transformed = gen.transform(dataset=ArrayDataset(x_train))
|
||||
|
|
@ -472,6 +493,9 @@ def test_minimize_pandas_adult():
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[modified_features]).equals(x_train[modified_features])) == False)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_german_credit_pandas():
|
||||
(x_train, y_train), (x_test, y_test) = get_german_credit_dataset_pd()
|
||||
|
|
@ -506,8 +530,8 @@ def test_german_credit_pandas():
|
|||
predictions = model.predict(ArrayDataset(encoded))
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
|
||||
target_accuracy = 0.7
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features, features_to_minimize=QI)
|
||||
gen.fit(dataset=ArrayDataset(x_train, predictions))
|
||||
transformed = gen.transform(dataset=ArrayDataset(x_train))
|
||||
|
|
@ -545,6 +569,9 @@ def test_german_credit_pandas():
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[modified_features]).equals(x_train[modified_features])) == False)
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_regression():
|
||||
dataset = load_diabetes()
|
||||
|
|
@ -558,7 +585,8 @@ def test_regression():
|
|||
features = ['age', 'sex', 'bmi', 'bp',
|
||||
's1', 's2', 's3', 's4', 's5', 's6']
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
|
||||
target_accuracy = 0.7
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, is_regression=True,
|
||||
features_to_minimize=QI)
|
||||
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
|
||||
transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
|
||||
|
|
@ -615,6 +643,9 @@ def test_regression():
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[indexes]) != (x_train[indexes])).any())
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_X_y(data):
|
||||
features = [0, 1, 2]
|
||||
|
|
@ -640,8 +671,8 @@ def test_X_y(data):
|
|||
predictions = model.predict(ad)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
|
||||
gen.fit(X=X, y=predictions)
|
||||
transformed = gen.transform(X)
|
||||
gener = gen.generalizations
|
||||
|
|
@ -666,6 +697,9 @@ def test_X_y(data):
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_X_y_features_names(data):
|
||||
features = ['age', 'height', 'weight']
|
||||
|
|
@ -691,8 +725,8 @@ def test_X_y_features_names(data):
|
|||
predictions = model.predict(ad)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
|
||||
gen.fit(X=X, y=predictions, features_names=features)
|
||||
transformed = gen.transform(X=X, features_names=features)
|
||||
gener = gen.generalizations
|
||||
|
|
@ -717,6 +751,9 @@ def test_X_y_features_names(data):
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_BaseEstimator_classification(data):
|
||||
features = ['age', 'height', 'weight', 'sex', 'ola']
|
||||
|
|
@ -760,7 +797,8 @@ def test_BaseEstimator_classification(data):
|
|||
|
||||
# Append classifier to preprocessing pipeline.
|
||||
# Now we have a full prediction pipeline.
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
|
||||
categorical_features=categorical_features, features_to_minimize=QI)
|
||||
train_dataset = ArrayDataset(X, predictions)
|
||||
gen.fit(dataset=train_dataset)
|
||||
|
|
@ -787,6 +825,9 @@ def test_BaseEstimator_classification(data):
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[modified_features]).equals(X[modified_features])) == False)
|
||||
|
||||
rel_accuracy = model.score(preprocessor.transform(transformed), predictions)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_BaseEstimator_regression():
|
||||
dataset = load_diabetes()
|
||||
|
|
@ -799,8 +840,8 @@ def test_BaseEstimator_regression():
|
|||
QI = ['age', 'bmi', 's2', 's5']
|
||||
features = ['age', 'sex', 'bmi', 'bp',
|
||||
's1', 's2', 's3', 's4', 's5', 's6']
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
|
||||
target_accuracy = 0.7
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, is_regression=True,
|
||||
features_to_minimize=QI)
|
||||
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
|
||||
transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
|
||||
|
|
@ -857,6 +898,9 @@ def test_BaseEstimator_regression():
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[indexes]) != (x_train[indexes])).any())
|
||||
|
||||
rel_accuracy = model.score(transformed, predictions)
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_keras_model():
|
||||
(X, y), (x_test, y_test) = get_iris_dataset_np()
|
||||
|
|
@ -874,8 +918,8 @@ def test_keras_model():
|
|||
predictions = model.predict(ad)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
|
||||
test_dataset = ArrayDataset(x_test, predictions)
|
||||
|
||||
gen.fit(dataset=test_dataset)
|
||||
|
|
@ -895,6 +939,9 @@ def test_keras_model():
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_blackbox_model():
|
||||
(X, y), (x_test, y_test) = get_iris_dataset_np()
|
||||
|
|
@ -907,8 +954,8 @@ def test_blackbox_model():
|
|||
predictions = model.predict(ad)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
|
||||
target_accuracy = 0.5
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
|
||||
train_dataset = ArrayDataset(x_test, predictions)
|
||||
|
||||
gen.fit(dataset=train_dataset)
|
||||
|
|
@ -939,6 +986,9 @@ def test_blackbox_model():
|
|||
assert (ncp > 0)
|
||||
assert (((transformed[indexes]) != (X[indexes])).any())
|
||||
|
||||
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
|
||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||
|
||||
|
||||
def test_untouched():
|
||||
cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}}, "label": 0,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue