New model wrappers (#32)

* Keras wrapper + blackbox classifier wrapper (fix #7)

* Fix error in NCP calculation

* Update notebooks

* Fix #25 (incorrect attack_feature indexes for social feature in notebook)

* Consistent naming of internal parameters
abigailgold 2022-05-12 15:44:29 +03:00 committed by GitHub
parent fd6be8e778
commit fe676fa426
15 changed files with 1407 additions and 656 deletions

@ -101,11 +101,11 @@ class Anonymize:
# build DT just on QI features
x_anonymizer_train = x_prepared[:, self.quasi_identifiers]
if self.is_regression:
self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
self._anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
else:
self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
self._anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
self.anonymizer.fit(x_anonymizer_train, y)
self._anonymizer.fit(x_anonymizer_train, y)
cells_by_id = self._calculate_cells(x, x_anonymizer_train)
return self._anonymize_data(x, x_anonymizer_train, cells_by_id)
@ -113,16 +113,16 @@ class Anonymize:
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
cells_by_id = {}
leaves = []
for node, feature in enumerate(self.anonymizer.tree_.feature):
for node, feature in enumerate(self._anonymizer.tree_.feature):
if feature == -2: # leaf node
leaves.append(node)
hist = [int(i) for i in self.anonymizer.tree_.value[node][0]]
hist = [int(i) for i in self._anonymizer.tree_.value[node][0]]
# TODO we may change the method for choosing representative for cell
# label_hist = self.anonymizer.tree_.value[node][0]
# label = int(self.anonymizer.classes_[np.argmax(label_hist)])
cell = {'label': 1, 'hist': hist, 'id': int(node)}
cells_by_id[cell['id']] = cell
self.nodes = leaves
self._nodes = leaves
self._find_representatives(x, x_anonymizer_train, cells_by_id.values())
return cells_by_id
@ -153,8 +153,8 @@ class Anonymize:
cell['representative'][feature] = min_value
def _find_sample_nodes(self, samples):
paths = self.anonymizer.decision_path(samples).toarray()
node_set = set(self.nodes)
paths = self._anonymizer.decision_path(samples).toarray()
node_set = set(self._nodes)
return [(list(set([i for i, v in enumerate(p) if v == 1]) & node_set))[0] for p in paths]
def _find_sample_cells(self, samples, cells_by_id):
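
For orientation, the `min_samples_leaf=self.k` setting in the hunk above is what enforces k-anonymity: the tree is trained only on the quasi-identifier columns, it cannot create a leaf with fewer than k records, and every record in a leaf later receives that leaf's representative values. A minimal standalone sketch of the same idea in plain scikit-learn (a toy that uses the leaf mean as representative, which is not necessarily how Anonymize picks representatives; names such as qi_cols are illustrative only):

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    def k_anonymize_qi(x, y, qi_cols, k):
        # Toy k-anonymizer: records sharing a leaf (>= k of them) get identical QI values.
        x = np.asarray(x, dtype=float)
        x_qi = x[:, qi_cols]
        tree = DecisionTreeClassifier(random_state=10, min_samples_leaf=k).fit(x_qi, y)
        leaf_ids = tree.apply(x_qi)                   # leaf index for every record
        x_anon = x.copy()
        for leaf in np.unique(leaf_ids):
            members = leaf_ids == leaf                # boolean mask of this leaf's records
            x_anon[np.ix_(members, qi_cols)] = x_qi[members].mean(axis=0)
        return x_anon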

@ -12,7 +12,7 @@ from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.validation import check_is_fitted
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
@ -68,7 +68,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if is_regression:
self.estimator = SklearnRegressor(estimator)
else:
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_VECTOR)
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_PROBABILITIES)
self.target_accuracy = target_accuracy
self.cells = cells
self.categorical_features = []
@ -124,7 +124,16 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
'categories' that contains sub-groups of categories for categorical features, and
'untouched' that contains the features that could not be generalized.
"""
return self.generalizations_
return self._generalizations
@property
def ncp(self):
"""
Return the NCP (normalized certainty penalty) score of the generalizations, i.e., the information loss incurred by generalizing the data.
:return: NCP score as float.
"""
return self._ncp
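
The NCP exposed here is the normalized certainty penalty, a standard information-loss measure for generalization: for a numeric feature it is the width of each generalized interval relative to the feature's full range, weighted by how many records fall into that interval. A hedged sketch of that per-feature computation (the exact weighting inside _calculate_ncp / _calc_ncp_numeric is not visible in this hunk):

    def ncp_numeric(intervals, counts, domain_range, total):
        # intervals: (start, end) pairs produced by the generalization for one feature
        # counts: number of records mapped to each interval
        # domain_range: max - min of the feature over the whole dataset
        if domain_range == 0 or total == 0:
            return 0.0
        return sum(((end - start) / domain_range) * (n / total)
                   for (start, end), n in zip(intervals, counts))
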
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None):
@ -172,27 +181,20 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
dataset = ArrayDataset(X, y, features_names)
if dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
self.n_features_ = dataset.get_samples().shape[1]
self._n_features = dataset.get_samples().shape[1]
elif dataset and dataset.features_names:
self.n_features_ = len(dataset.features_names)
self._n_features = len(dataset.features_names)
else:
self.n_features_ = 0
self._n_features = 0
if dataset and dataset.features_names:
self._features = dataset.features_names
# if features is None, use numbers instead of names
elif self.n_features_ != 0:
self._features = [str(i) for i in range(self.n_features_)]
elif self._n_features != 0:
self._features = [str(i) for i in range(self._n_features)]
else:
self._features = None
if self.cells:
self.cells_ = self.cells
else:
self.cells_ = {}
self.categorical_values = {}
# Going to fit
# (currently not dealing with option to fit with only X and y and no estimator)
if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
@ -231,28 +233,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
fd['max'] = max(values)
fd['range'] = max(values) - min(values)
else:
fd['range'] = len(values)
fd['range'] = len(np.unique(values))
feature_data[feature] = fd
# prepare data for DT
categorical_features = [f for f in self._features if f in self.categorical_features and
f in self.features_to_minimize]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
numeric_features = [f for f in self._features if f not in self.categorical_features and
f in self.features_to_minimize]
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor_QI_features = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
preprocessor_QI_features.fit(x_QI)
# preprocessor to fit data that have features not included in QI (to get accuracy)
numeric_features = [f for f in self._features if f not in self.categorical_features]
@ -267,44 +251,68 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
]
)
preprocessor.fit(x)
x_prepared = preprocessor.transform(X_train)
if self.train_only_QI:
categorical_features = [f for f in self._features if f in self.categorical_features and
f in self.features_to_minimize]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
numeric_features = [f for f in self._features if f not in self.categorical_features and
f in self.features_to_minimize]
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor_QI_features = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
preprocessor_QI_features.fit(x_QI)
x_prepared = preprocessor_QI_features.transform(X_train_QI)
else:
x_prepared = preprocessor.transform(X_train)
self._preprocessor = preprocessor
self.cells_ = {}
self.cells = []
self._categorical_values = {}
if self.is_regression:
self.dt_ = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1)
self._dt = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1)
else:
self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2,
self._dt = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
self.dt_.fit(x_prepared, y_train)
self._dt.fit(x_prepared, y_train)
self._modify_categorical_features(used_data)
x_prepared = pd.DataFrame(x_prepared, columns=self.categorical_data.columns)
x_prepared = pd.DataFrame(x_prepared, columns=self._categorical_data.columns)
self._calculate_cells()
self._modify_cells()
# features that are not from QI should not be part of generalizations
for feature in self._features:
if feature not in self.features_to_minimize:
self._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
self._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
nodes = self._get_nodes_level(0)
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
# self.cells_ currently holds the generalization created from the tree leaves
# self._cells currently holds the generalization created from the tree leaves
self._calculate_generalizations()
# apply generalizations to test data
x_prepared_test = preprocessor.transform(X_test)
if self.train_only_QI:
x_prepared_test = preprocessor_QI_features.transform(X_test_QI)
else:
x_prepared_test = preprocessor.transform(X_test)
x_prepared_test = pd.DataFrame(x_prepared_test, index=X_test.index, columns=self.categorical_data.columns)
x_prepared_test = pd.DataFrame(x_prepared_test, index=X_test.index, columns=self._categorical_data.columns)
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
# check accuracy
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
@ -317,22 +325,22 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
level = 1
while accuracy > self.target_accuracy:
try:
cells_previous_iter = self.cells_
generalization_prev_iter = self.generalizations_
cells_by_id_prev = self.cells_by_id_
cells_previous_iter = self.cells
generalization_prev_iter = self._generalizations
cells_by_id_prev = self._cells_by_id
nodes = self._get_nodes_level(level)
self._calculate_level_cells(level)
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_,
self.cells_by_id_)
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
# if accuracy dropped below the target threshold, roll back to the previous iteration's generalizations
if accuracy < self.target_accuracy:
self.cells_ = cells_previous_iter
self.generalizations_ = generalization_prev_iter
self.cells_by_id_ = cells_by_id_prev
self.cells = cells_previous_iter
self._generalizations = generalization_prev_iter
self._cells_by_id = cells_by_id_prev
break
else:
print('Pruned tree to level: %d, new relative accuracy: %f' % (level, accuracy))
@ -352,14 +360,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
break
self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self.cells_ currently holds the chosen generalization based on target accuracy
# self._cells currently holds the chosen generalization based on target accuracy
# calculate iLoss
self.ncp_ = self._calculate_ncp(X_test, self.generalizations_, feature_data)
self._ncp = self._calculate_ncp(X_test, self._generalizations, feature_data)
# Return the transformer
return self
@ -398,7 +406,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if dataset and dataset.get_samples() is not None:
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
if x.shape[1] != self.n_features_ and self.n_features_ != 0:
if x.shape[1] != self._n_features and self._n_features != 0:
raise ValueError('Shape of input is different from what was seen '
'in `fit`')
@ -410,23 +418,23 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
mapped = np.zeros(x.shape[0]) # to mark records we already mapped
# iterate over cells (leaves in decision tree)
for i in range(len(self.cells_)):
for i in range(len(self.cells)):
# Copy the representatives from the cells into another data structure:
# iterate over features in test data
for feature in self._features:
# if feature has a representative value in the cell and should not
# be left untouched, take the representative value
if feature in self.cells_[i]['representative'] and \
('untouched' not in self.cells_[i]
or feature not in self.cells_[i]['untouched']):
representatives.loc[i, feature] = self.cells_[i]['representative'][feature]
if feature in self.cells[i]['representative'] and \
('untouched' not in self.cells[i]
or feature not in self.cells[i]['untouched']):
representatives.loc[i, feature] = self.cells[i]['representative'][feature]
# else, drop the feature (removes from representatives columns that
# do not have a representative value or should remain untouched)
elif feature in representatives.columns.tolist():
representatives = representatives.drop(feature, axis=1)
# get the indexes of all records that map to this cell
indexes = self._get_record_indexes_for_cell(x, self.cells_[i], mapped)
indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped)
# replace the values in the representative columns with the representative
# values (leaves others untouched)
@ -467,8 +475,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return True
def _modify_categorical_features(self, X):
self.categorical_values = {}
self.oneHotVectorFeaturesToFeatures = {}
self._categorical_values = {}
self._one_hot_vector_features_to_features = {}
features_to_remove = []
used_features = self._features
if self.train_only_QI:
@ -478,17 +486,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
try:
all_values = X.loc[:, feature]
values = list(all_values.unique())
self.categorical_values[feature] = values
self._categorical_values[feature] = values
X[feature] = pd.Categorical(X.loc[:, feature], categories=values, ordered=False)
ohe = pd.get_dummies(X[feature], prefix=feature)
for oneHotVectorFeature in ohe.columns:
self.oneHotVectorFeaturesToFeatures[oneHotVectorFeature] = feature
for one_hot_vector_feature in ohe.columns:
self._one_hot_vector_features_to_features[one_hot_vector_feature] = feature
X = pd.concat([X, ohe], axis=1)
features_to_remove.append(feature)
except KeyError:
print("feature " + feature + "not found in training data")
self.categorical_data = X.drop(features_to_remove, axis=1)
self._categorical_data = X.drop(features_to_remove, axis=1)
def _cell_contains_numeric(self, f, range, x):
i = self._features.index(f)
@ -513,24 +521,24 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return False
def _calculate_cells(self):
self.cells_by_id_ = {}
self.cells_ = self._calculate_cells_recursive(0)
self._cells_by_id = {}
self.cells = self._calculate_cells_recursive(0)
def _calculate_cells_recursive(self, node):
feature_index = self.dt_.tree_.feature[node]
feature_index = self._dt.tree_.feature[node]
if feature_index == -2:
# this is a leaf
# if it is a regression problem we do not use label
label = self._calculate_cell_label(node) if not self.is_regression else 1
hist = [int(i) for i in self.dt_.tree_.value[node][0]] if not self.is_regression else []
hist = [int(i) for i in self._dt.tree_.value[node][0]] if not self.is_regression else []
cell = {'label': label, 'hist': hist, 'ranges': {}, 'id': int(node)}
return [cell]
cells = []
feature = self.categorical_data.columns[feature_index]
threshold = self.dt_.tree_.threshold[node]
left_child = self.dt_.tree_.children_left[node]
right_child = self.dt_.tree_.children_right[node]
feature = self._categorical_data.columns[feature_index]
threshold = self._dt.tree_.threshold[node]
left_child = self._dt.tree_.children_left[node]
right_child = self._dt.tree_.children_right[node]
left_child_cells = self._calculate_cells_recursive(left_child)
for cell in left_child_cells:
@ -539,7 +547,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if cell['ranges'][feature]['end'] is None:
cell['ranges'][feature]['end'] = threshold
cells.append(cell)
self.cells_by_id_[cell['id']] = cell
self._cells_by_id[cell['id']] = cell
right_child_cells = self._calculate_cells_recursive(right_child)
for cell in right_child_cells:
@ -548,26 +556,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if cell['ranges'][feature]['start'] is None:
cell['ranges'][feature]['start'] = threshold
cells.append(cell)
self.cells_by_id_[cell['id']] = cell
self._cells_by_id[cell['id']] = cell
return cells
def _calculate_cell_label(self, node):
label_hist = self.dt_.tree_.value[node][0]
return int(self.dt_.classes_[np.argmax(label_hist)])
label_hist = self._dt.tree_.value[node][0]
return int(self._dt.classes_[np.argmax(label_hist)])
def _modify_cells(self):
cells = []
features = self.categorical_data.columns
for cell in self.cells_:
features = self._categorical_data.columns
for cell in self.cells:
new_cell = {'id': cell['id'], 'label': cell['label'], 'ranges': {}, 'categories': {}, 'hist': cell['hist'],
'representative': None}
for feature in features:
if feature in self.oneHotVectorFeaturesToFeatures.keys():
if feature in self._one_hot_vector_features_to_features.keys():
# feature is categorical and should be mapped
categorical_feature = self.oneHotVectorFeaturesToFeatures[feature]
categorical_feature = self._one_hot_vector_features_to_features[feature]
if categorical_feature not in new_cell['categories'].keys():
new_cell['categories'][categorical_feature] = self.categorical_values[
new_cell['categories'][categorical_feature] = self._categorical_values[
categorical_feature].copy()
if feature in cell['ranges'].keys():
categorical_value = feature[len(categorical_feature) + 1:]
@ -584,11 +592,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
else:
new_cell['ranges'][feature] = {'start': None, 'end': None}
cells.append(new_cell)
self.cells_by_id_[new_cell['id']] = new_cell
self.cells_ = cells
self._cells_by_id[new_cell['id']] = new_cell
self.cells = cells
def _calculate_level_cells(self, level):
if level < 0 or level > self.dt_.get_depth():
if level < 0 or level > self._dt.get_depth():
raise TypeError("Illegal level %d' % level", level)
if level > 0:
@ -597,13 +605,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
nodes = self._get_nodes_level(level)
if nodes:
for node in nodes:
if self.dt_.tree_.feature[node] == -2: # leaf node
new_cell = self.cells_by_id_[node]
if self._dt.tree_.feature[node] == -2: # leaf node
new_cell = self._cells_by_id[node]
else:
left_child = self.dt_.tree_.children_left[node]
right_child = self.dt_.tree_.children_right[node]
left_cell = self.cells_by_id_[left_child]
right_cell = self.cells_by_id_[right_child]
left_child = self._dt.tree_.children_left[node]
right_child = self._dt.tree_.children_right[node]
left_cell = self._cells_by_id[left_child]
right_cell = self._cells_by_id[right_child]
new_cell = {'id': int(node), 'ranges': {}, 'categories': {}, 'untouched': [],
'label': None, 'representative': None}
for feature in left_cell['ranges'].keys():
@ -620,28 +628,28 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._calculate_level_cell_label(left_cell, right_cell, new_cell)
new_cells.append(new_cell)
new_cells_by_id[new_cell['id']] = new_cell
self.cells_ = new_cells
self.cells_by_id_ = new_cells_by_id
self.cells = new_cells
self._cells_by_id = new_cells_by_id
# else: nothing to do, stay with previous cells
def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
new_cell['hist'] = [x + y for x, y in
zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
new_cell['label'] = int(self.dt_.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1
new_cell['label'] = int(self._dt.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1
def _get_nodes_level(self, level):
# level = distance from lowest leaf
node_depth = np.zeros(shape=self.dt_.tree_.node_count, dtype=np.int64)
is_leaves = np.zeros(shape=self.dt_.tree_.node_count, dtype=bool)
node_depth = np.zeros(shape=self._dt.tree_.node_count, dtype=np.int64)
is_leaves = np.zeros(shape=self._dt.tree_.node_count, dtype=bool)
stack = [(0, -1)] # seed is the root node id and its parent depth
while len(stack) > 0:
node_id, parent_depth = stack.pop()
# depth = distance from root
node_depth[node_id] = parent_depth + 1
if self.dt_.tree_.children_left[node_id] != self.dt_.tree_.children_right[node_id]:
stack.append((self.dt_.tree_.children_left[node_id], parent_depth + 1))
stack.append((self.dt_.tree_.children_right[node_id], parent_depth + 1))
if self._dt.tree_.children_left[node_id] != self._dt.tree_.children_right[node_id]:
stack.append((self._dt.tree_.children_left[node_id], parent_depth + 1))
stack.append((self._dt.tree_.children_right[node_id], parent_depth + 1))
else:
is_leaves[node_id] = True
@ -660,7 +668,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# if there is no categorical data prepared data is original data
nodeIds = self._find_sample_nodes(prepared_data, level_nodes)
labels_df = pd.DataFrame(labelFeature, columns=['label'])
for cell in self.cells_:
for cell in self.cells:
cell['representative'] = {}
# get all rows in cell
indexes = [i for i, x in enumerate(nodeIds) if x == cell['id']]
@ -695,14 +703,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cell['representative'][feature] = row[feature]
def _find_sample_nodes(self, samples, nodes):
paths = self.dt_.decision_path(samples).toarray()
paths = self._dt.decision_path(samples).toarray()
nodeSet = set(nodes)
return [(list(set([i for i, v in enumerate(p) if v == 1]) & nodeSet))[0] for p in paths]
def _generalize(self, original_data, prepared_data, level_nodes, cells, cells_by_id):
# prepared data include one hot encoded categorical data + QI
representatives = pd.DataFrame(columns=self._features) # empty except for columns
generalized = pd.DataFrame(prepared_data, columns=self.categorical_data.columns, copy=True)
generalized = pd.DataFrame(prepared_data, columns=self._categorical_data.columns, copy=True)
original_data_generalized = pd.DataFrame(original_data, columns=self._features, copy=True)
mapping_to_cells = self._map_to_cells(generalized, level_nodes, cells_by_id)
# iterate over cells (leaves in decision tree)
@ -755,7 +763,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
current_accuracy)
if feature is None:
return None
GeneralizeToRepresentative._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
GeneralizeToRepresentative._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
return feature
def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy):
@ -763,7 +771,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# if there is no categorical data prepared data is original data
# We want to remove features with low iLoss (NCP) and high accuracy gain
# (after removing them)
ranges = self.generalizations_['ranges']
ranges = self._generalizations['ranges']
range_counts = self._find_range_count(original_data, ranges)
total = prepared_data.size
range_min = sys.float_info.max
@ -772,15 +780,15 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
category_counts = self._find_categories_count(original_data, categories)
for feature in ranges.keys():
if feature not in self.generalizations_['untouched']:
if feature not in self._generalizations['untouched']:
feature_ncp = self._calc_ncp_numeric(ranges[feature],
range_counts[feature],
feature_data[feature],
total)
if feature_ncp > 0:
# divide by accuracy gain
new_cells = copy.deepcopy(self.cells_)
cells_by_id = copy.deepcopy(self.cells_by_id_)
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
@ -802,8 +810,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
total)
if feature_ncp > 0:
# divide by accuracy loss
new_cells = copy.deepcopy(self.cells_)
cells_by_id = copy.deepcopy(self.cells_by_id_)
new_cells = copy.deepcopy(self.cells)
cells_by_id = copy.deepcopy(self._cells_by_id)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
@ -821,12 +829,12 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return remove_feature
def _calculate_generalizations(self):
self.generalizations_ = {'ranges': GeneralizeToRepresentative._calculate_ranges(self.cells_),
'categories': GeneralizeToRepresentative._calculate_categories(self.cells_),
'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells_)}
self._generalizations = {'ranges': GeneralizeToRepresentative._calculate_ranges(self.cells),
'categories': GeneralizeToRepresentative._calculate_categories(self.cells),
'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)}
def _find_range_count(self, samples, ranges):
samples_df = pd.DataFrame(samples, columns=self.categorical_data.columns)
samples_df = pd.DataFrame(samples, columns=self._categorical_data.columns)
range_counts = {}
last_value = None
for r in ranges.keys():
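
Taken together, the renames above make cells, ncp and the generalizations dict the public surface of GeneralizeToRepresentative, while the fitted helpers (_dt, _preprocessor, _cells_by_id, ...) become private. A rough usage sketch under those assumptions (the import path, argument order and the name of the generalizations accessor are assumptions, not shown in this diff):

    from sklearn.tree import DecisionTreeClassifier
    from apt.minimization import GeneralizeToRepresentative   # import path assumed

    est = DecisionTreeClassifier().fit(x_train, y_train)      # a pre-trained estimator
    gen = GeneralizeToRepresentative(est, target_accuracy=0.9)
    x_generalized = gen.fit_transform(x_train, y_train)       # prunes tree cells until accuracy nears the target
    print(gen.ncp)              # information loss of the chosen generalization (new property above)
    print(gen.generalizations)  # accessor name assumed; dict with 'ranges', 'categories', 'untouched'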

@ -6,7 +6,7 @@ from os import path, mkdir
from six.moves.urllib.request import urlretrieve
def get_iris_dataset(test_set: float = 0.3):
def get_iris_dataset_np(test_set: float = 0.3):
"""
Loads the Iris dataset from scikit-learn.
@ -29,7 +29,7 @@ def _load_iris(test_set_size: float = 0.3):
return (x_train, y_train), (x_test, y_test)
def get_diabetes_dataset(test_set: float = 0.3):
def get_diabetes_dataset_np(test_set: float = 0.3):
"""
Loads the Diabetes dataset from scikit-learn.
@ -52,7 +52,7 @@ def _load_diabetes(test_set_size: float = 0.3):
return (x_train, y_train), (x_test, y_test)
def get_german_credit_dataset(test_set: float = 0.3):
def get_german_credit_dataset_pd(test_set: float = 0.3):
"""
Loads the UCI German credit dataset from `tests/datasets/german` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/ if necessary.
@ -122,11 +122,16 @@ def _modify_german_dataset(data):
return 1
else:
raise Exception('Bad value')
def modify_label(value):
return value - 1
data['Foreign_worker'] = data['Foreign_worker'].apply(modify_Foreign_worker)
data['Telephone'] = data['Telephone'].apply(modify_Telephone)
data['label'] = data['label'].apply(modify_label)
def get_adult_dataset():
def get_adult_dataset_pd():
"""
Loads the UCI Adult dataset from `tests/datasets/adult` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/adult/ if necessary.
@ -228,7 +233,7 @@ def _modify_adult_dataset(data):
return data.drop(['fnlwgt', 'education'], axis=1)
def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
def get_nursery_dataset_pd(raw: bool = True, test_set: float = 0.2, transform_social: bool = False):
"""
Loads the UCI Nursery dataset from `tests/datasets/nursery` or downloads it from
https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/ if necessary.
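
The _np/_pd suffixes added above encode the return type in the loader name (numpy tuples vs. pandas objects). A short hedged call pattern, assuming the module path apt.utils.dataset_utils and the (train, test) tuple shape shown for the iris/diabetes loaders:

    from apt.utils.dataset_utils import get_iris_dataset_np, get_german_credit_dataset_pd  # path assumed

    (x_train, y_train), (x_test, y_test) = get_iris_dataset_np(test_set=0.3)   # numpy arrays
    (x_tr, y_tr), (x_te, y_te) = get_german_credit_dataset_pd()                # pandas objects (shape assumed)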

@ -5,7 +5,7 @@ Implementation of utility classes for dataset handling
"""
from abc import ABCMeta, abstractmethod
from typing import Callable, Collection, Any, Union, List, Optional
from typing import Callable, Collection, Any, Union, List, Optional, Type
import tarfile
import os
@ -19,9 +19,9 @@ from torch import Tensor
logger = logging.getLogger(__name__)
INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor]
INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, pd.Series, List, Tensor]
OUTPUT_DATA_ARRAY_TYPE = np.ndarray
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame, pd.Series]
class Dataset(metaclass=ABCMeta):
@ -323,7 +323,7 @@ class DatasetFactory:
:return: a Callable that returns the registered dataset class
"""
def inner_wrapper(wrapped_class: Dataset) -> Any:
def inner_wrapper(wrapped_class: Type[Dataset]) -> Any:
if name in cls.registry:
logger.warning('Dataset %s already exists. Will replace it', name)
cls.registry[name] = wrapped_class
@ -414,14 +414,18 @@ class Data:
"""
Get test set samples
:return: test samples
:return: test samples, or None if no test data provided
"""
if self.test is None:
return None
return self.test.get_samples()
def get_test_labels(self) -> Collection[Any]:
"""
Get test set labels
:return: test labels
:return: test labels, or None if no test data provided
"""
if self.test is None:
return None
return self.test.get_labels()

@ -1,2 +1,3 @@
from apt.utils.models.model import Model, ModelOutputType
from apt.utils.models.model import Model, BlackboxClassifier, ModelOutputType, ScoringMethod
from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor
from apt.utils.models.keras_model import KerasClassifier

@ -0,0 +1,149 @@
from typing import Optional
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from tensorflow import keras
tf.compat.v1.disable_eager_execution()
from apt.utils.models import Model, ModelOutputType, ScoringMethod
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
from art.utils import check_and_transform_label_format
from art.estimators.classification.keras import KerasClassifier as ArtKerasClassifier
# from art.estimators.regression.keras import KerasRegressor as ArtKerasRegressor
class KerasModel(Model):
"""
Wrapper class for keras models.
"""
class KerasClassifier(KerasModel):
"""
Wrapper class for keras classification models.
:param model: The original keras model object.
:type model: `keras.models.Model`
:param output_type: The type of output the model yields (vector of probabilities, vector of logits, or label only)
:type output_type: `ModelOutputType`
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Set to True if the model is only available via query (API) access, i.e.,
only the outputs of the model are exposed, and False if the model internals
are also available. Default is True.
:type black_box_access: boolean, optional
:param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
unlimited queries to the model API or whether there is a limit to the number of
queries that can be submitted. Default is True.
:type unlimited_queries: boolean, optional
"""
def __init__(self, model: keras.models.Model, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
logits = False
if output_type == ModelOutputType.CLASSIFIER_LOGITS:
logits = True
self._art_model = ArtKerasClassifier(model, use_logits=logits)
def fit(self, train_data: Dataset, **kwargs) -> None:
"""
Fit the model using the training data.
:param train_data: Training data. Labels are expected to either be one-hot encoded or a 1D-array of categorical
labels (consecutive integers starting at 0).
:type train_data: `Dataset`
:return: None
"""
y_encoded = check_and_transform_label_format(train_data.get_labels())
self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs)
def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
"""
Perform predictions using the model for input `x`.
:param x: Input samples.
:type x: `Dataset`
:return: Predictions from the model as numpy array (class probabilities, if supported).
"""
return self._art_model.predict(x.get_samples(), **kwargs)
def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs):
"""
Score the model using test data.
:param test_data: Test data.
:type test_data: `Dataset`
:param scoring_method: The method for scoring predictions. Default is ACCURACY.
:type scoring_method: `ScoringMethod`, optional
:return: the score as float (between 0 and 1)
"""
y = check_and_transform_label_format(test_data.get_labels(), self._art_model.nb_classes)
predicted = self.predict(test_data)
if scoring_method == ScoringMethod.ACCURACY:
return np.count_nonzero(np.argmax(y, axis=1) == np.argmax(predicted, axis=1)) / predicted.shape[0]
else:
raise NotImplementedError
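
A minimal sketch of wrapping a compiled Keras model with the new class; ArrayDataset and its import path are assumed from the rest of the toolkit, and since eager execution is disabled at import time above, a graph-mode-compatible model is expected:

    from tensorflow import keras
    from apt.utils.models import KerasClassifier, ModelOutputType
    from apt.utils.datasets import ArrayDataset   # import path assumed

    model = keras.Sequential([
        keras.layers.Dense(16, activation='relu', input_shape=(4,)),
        keras.layers.Dense(3, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy')

    clf = KerasClassifier(model, ModelOutputType.CLASSIFIER_PROBABILITIES)
    clf.fit(ArrayDataset(x_train, y_train), nb_epochs=10)   # labels: class indices or one-hot
    print(clf.score(ArrayDataset(x_test, y_test)))          # accuracy in [0, 1]
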
# class KerasRegressor(KerasModel):
# """
# Wrapper class for keras regression models.
#
# :param model: The original keras model object.
# :type model: `keras.models.Model`
# :param black_box_access: Boolean describing the type of deployment of the model (when in production).
# Set to True if the model is only available via query (API) access, i.e.,
# only the outputs of the model are exposed, and False if the model internals
# are also available. Default is True.
# :type black_box_access: boolean, optional
# :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
# unlimited queries to the model API or whether there is a limit to the number of
# queries that can be submitted. Default is True.
# :type unlimited_queries: boolean, optional
# """
# def __init__(self, model: keras.models.Model, black_box_access: Optional[bool] = True,
# unlimited_queries: Optional[bool] = True, **kwargs):
# super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs)
# self._art_model = ArtKerasRegressor(model)
#
# def fit(self, train_data: Dataset, **kwargs) -> None:
# """
# Fit the model using the training data.
#
# :param train_data: Training data.
# :type train_data: `Dataset`
# :return: None
# """
# self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)
#
# def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
# """
# Perform predictions using the model for input `x`.
#
# :param x: Input samples.
# :type x: `Dataset`
# :return: Predictions from the model as numpy array.
# """
# return self._art_model.predict(x.get_samples(), **kwargs)
#
# def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.MEAN_SQUARED_ERROR,
# **kwargs):
# """
# Score the model using test data.
#
# :param test_data: Test data.
# :type train_data: `Dataset`
# :param scoring_method: The method for scoring predictions. Default is ACCURACY.
# :type scoring_method: `ScoringMethod`, optional
# :return: the score as float
# """
# y = check_and_transform_label_format(test_data.get_labels(), self._art_model.nb_classes)
# predicted = self.predict(test_data)
# if scoring_method == ScoringMethod.MEAN_SQUARED_ERROR:
# mse = keras.losses.MeanSquaredError(reduction=keras.losses.Reduction.SUM)
# return mse(y, predicted).numpy()
# else:
# raise NotImplementedError('Only MEAN_SQUARED_ERROR supported as scoring method')

@ -1,16 +1,25 @@
from abc import ABCMeta, abstractmethod
from typing import Any, Optional
from enum import Enum, auto
import numpy as np
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
from apt.utils.datasets import Dataset, Data, OUTPUT_DATA_ARRAY_TYPE
from art.estimators.classification import BlackBoxClassifier
from art.utils import check_and_transform_label_format
class ModelOutputType(Enum):
CLASSIFIER_VECTOR = auto() # probabilities or logits
CLASSIFIER_PROBABILITIES = auto() # vector of probabilities
CLASSIFIER_LOGITS = auto() # vector of logits
CLASSIFIER_SCALAR = auto() # label only
REGRESSOR_SCALAR = auto() # value
class ScoringMethod(Enum):
ACCURACY = auto() # number of correct predictions divided by the number of samples
MEAN_SQUARED_ERROR = auto() # mean squared error between the predictions and true labels
class Model(metaclass=ABCMeta):
"""
Abstract base class for ML model wrappers.
@ -54,7 +63,7 @@ class Model(metaclass=ABCMeta):
Perform predictions using the model for input `x`.
:param x: Input samples.
:type x: `np.ndarray` or `pandas.DataFrame`
:type x: `Dataset`
:return: Predictions from the model as numpy array.
"""
raise NotImplementedError
@ -107,3 +116,87 @@ class Model(metaclass=ABCMeta):
:return: True if a user can perform unlimited queries to the model API, otherwise False.
"""
return self._unlimited_queries
def get_nb_classes(self, y: OUTPUT_DATA_ARRAY_TYPE) -> int:
"""
Get the number of classes from an array of labels
:param y: the labels
:type y: numpy array
:return: the number of classes as integer
"""
if len(y.shape) == 1:
return len(np.unique(y))
else:
return y.shape[1]
class BlackboxClassifier(Model):
"""
Wrapper for black-box ML classification models.
:param model: The training and/or test data along with the model's predictions for the data. Assumes that the data
is represented as numpy arrays. Labels are expected to either be one-hot encoded or
a 1D-array of categorical labels (consecutive integers starting at 0).
:type model: `Data` object
:param output_type: The type of output the model yields (vector/label only for classifiers,
value for regressors)
:type output_type: `ModelOutputType`
:param black_box_access: Boolean describing the type of deployment of the model (when in production).
Always assumed to be True for this wrapper.
:type black_box_access: boolean, optional
:param unlimited_queries: Boolean indicating whether a user can perform unlimited queries to the model API.
Always assumed to be False for this wrapper.
:type unlimited_queries: boolean, optional
"""
def __init__(self, model: Data, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
unlimited_queries: Optional[bool] = True, **kwargs):
super().__init__(model, output_type, black_box_access=True, unlimited_queries=False, **kwargs)
x = model.get_train_samples()
y = model.get_train_labels()
self.nb_classes = self.get_nb_classes(y)
y = check_and_transform_label_format(y, nb_classes=self.nb_classes)
if model.get_test_samples() is not None and type(x) == np.ndarray:
x = np.vstack((x, model.get_test_samples()))
if model.get_test_labels() is not None and type(y) == np.ndarray:
y = np.vstack((y, check_and_transform_label_format(model.get_test_labels(), nb_classes=self.nb_classes)))
predict_fn = (x, y)
self._art_model = BlackBoxClassifier(predict_fn, x.shape[1:], self.nb_classes, fuzzy_float_compare=True)
def fit(self, train_data: Dataset, **kwargs) -> None:
"""
A blackbox model cannot be fit.
"""
raise NotImplementedError
def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
"""
Get predictions from the model for input `x`. `x` must be a subset of the data provided as `model` in
`__init__()`.
:param x: Input samples.
:type x: `Dataset`
:return: Predictions from the model as numpy array.
"""
return self._art_model.predict(x.get_samples())
def score(self, test_data: Dataset, scoring_method: Optional[ScoringMethod] = ScoringMethod.ACCURACY, **kwargs):
"""
Score the model using test data.
:param test_data: Test data.
:type test_data: `Dataset`
:param scoring_method: The method for scoring predictions. Default is ACCURACY.
:type scoring_method: `ScoringMethod`, optional
:return: the score as float (for classifiers, between 0 and 1)
"""
predicted = self._art_model.predict(test_data.get_samples())
y = check_and_transform_label_format(test_data.get_labels(), nb_classes=self.nb_classes)
if scoring_method == ScoringMethod.ACCURACY:
return np.count_nonzero(np.argmax(y, axis=1) == np.argmax(predicted, axis=1)) / predicted.shape[0]
else:
raise NotImplementedError
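
A hedged sketch of building the black-box wrapper from query results only; the Data/ArrayDataset constructors and their keyword names are assumptions, and remote_api_predict is a hypothetical stand-in for querying the deployed model:

    import numpy as np
    from apt.utils.models import BlackboxClassifier, ModelOutputType
    from apt.utils.datasets import ArrayDataset, Data   # import path assumed

    preds = np.array(remote_api_predict(x_known))       # hypothetical: the deployed model's predictions
    data = Data(train=ArrayDataset(x_known, preds))     # keyword name assumed; test split is optional
    bb = BlackboxClassifier(data, ModelOutputType.CLASSIFIER_PROBABILITIES)
    print(bb.predict(ArrayDataset(x_known[:5])))        # lookup only works for samples provided above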

@ -1,7 +1,5 @@
from typing import Optional
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator
@ -10,6 +8,7 @@ from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier
from art.estimators.regression.scikitlearn import ScikitlearnRegressor
from art.utils import check_and_transform_label_format
class SklearnModel(Model):
@ -54,12 +53,14 @@ class SklearnClassifier(SklearnModel):
"""
Fit the model using the training data.
:param train_data: Training data.
:param train_data: Training data. Labels are expected to either be one-hot encoded or a 1D-array of categorical
labels (consecutive integers starting at 0).
:type train_data: `Dataset`
:return: None
"""
encoder = OneHotEncoder(sparse=False)
y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1))
y = train_data.get_labels()
self.nb_classes = self.get_nb_classes(y)
y_encoded = check_and_transform_label_format(y, nb_classes=self.nb_classes)
self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs)
def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
@ -70,7 +71,7 @@ class SklearnClassifier(SklearnModel):
:type x: `Dataset`
:return: Predictions from the model as numpy array (class probabilities, if supported).
"""
return self._art_model.predict(x, **kwargs)
return self._art_model.predict(x.get_samples(), **kwargs)
class SklearnRegressor(SklearnModel):
@ -112,4 +113,4 @@ class SklearnRegressor(SklearnModel):
:type x: `Dataset`
:return: Predictions from the model as numpy array.
"""
return self._art_model.predict(x, **kwargs)
return self._art_model.predict(x.get_samples(), **kwargs)
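
With the predict fix above, both sklearn wrappers consistently take Dataset objects end to end; a brief hedged example (ArrayDataset import path assumed):

    from sklearn.ensemble import RandomForestClassifier
    from apt.utils.models import SklearnClassifier, ModelOutputType
    from apt.utils.datasets import ArrayDataset   # import path assumed

    clf = SklearnClassifier(RandomForestClassifier(), ModelOutputType.CLASSIFIER_PROBABILITIES)
    clf.fit(ArrayDataset(x_train, y_train))      # labels: class indices or one-hot
    proba = clf.predict(ArrayDataset(x_test))    # numpy array of per-class probabilities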