Update comments, use lowercase variables, move data in tests to fixtures

Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
abigailt 2023-08-08 12:34:41 +03:00
parent c2e0fced03
commit 3de93a87f1
2 changed files with 309 additions and 362 deletions

View file

@ -96,7 +96,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.train_only_features_to_minimize = train_only_features_to_minimize
self.is_regression = is_regression
self.encoder = encoder
# self._ncp = 0.0
self._ncp_scores = NCPScores()
self._feature_data = None
self._categorical_values = {}
@ -119,13 +118,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
ret['features_to_minimize'] = self.features_to_minimize
ret['train_only_features_to_minimize'] = self.train_only_features_to_minimize
ret['is_regression'] = self.is_regression
ret['generalize_using_transform'] = self.generalize_using_transform
ret['estimator'] = self.estimator
ret['encoder'] = self.encoder
if deep:
ret['cells'] = copy.deepcopy(self.cells)
ret['estimator'] = self.estimator
ret['encoder'] = self.encoder
else:
ret['cells'] = copy.copy(self.cells)
return ret
def set_params(self, **params):
@ -153,8 +152,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.is_regression = params['is_regression']
if 'cells' in params:
self.cells = params['cells']
if 'generalize_using_transform' in params:
self.generalize_using_transform = params['generalize_using_transform']
if 'estimator' in params:
self.estimator = params['estimator']
if 'encoder' in params:
self.encoder = params['encoder']
return self
@property
@ -182,8 +183,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None):
"""
Learns the generalizations based on training data, and applies them to the data. Updates stored ncp value to the
one computed on the training data.
Learns the generalizations based on training data, and applies them to the data. Also sets the fit_score,
transform_score and generalizations_score in self.ncp.
:param X: The training input samples.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -203,7 +204,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
features_names: Optional = None, dataset: ArrayDataset = None, generalize_using_transform: bool = True):
"""Learns the generalizations based on training data.
"""Learns the generalizations based on training data. Also sets the fit_score and generalizations_score in
self.ncp.
:param X: The training input samples.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -215,10 +217,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
:param dataset: Data wrapper containing the training input samples and the predictions of the original model
on the training data. Either ``X``, ``y`` OR ``dataset`` need to be provided, not both.
:type dataset: `ArrayDataset`, optional
:param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization process.
True means that the `transform` method is used to transform original data into
generalized data that is used for accuracy and NCP calculation. False indicates
that the `generalizations` structure should be used. Default is True.
:param generalize_using_transform: Indicates how to calculate NCP and accuracy during the generalization
process. True means that the `transform` method is used to transform original
data into generalized data that is used for accuracy and NCP calculation.
False indicates that the `generalizations` structure should be used.
Default is True.
:type generalize_using_transform: boolean, optional
:return: self
"""
@ -254,32 +257,32 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.features_to_minimize = [str(i) for i in self.features_to_minimize]
if not all(elem in self._features for elem in self.features_to_minimize):
raise ValueError('features to minimize should be a subset of features names')
x_QI = x.loc[:, self.features_to_minimize]
x_qi = x.loc[:, self.features_to_minimize]
# divide dataset into train and test
used_data = x
if self.train_only_features_to_minimize:
used_data = x_QI
used_data = x_qi
if self.is_regression:
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
x_train, x_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
random_state=14)
else:
try:
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(),
x_train, x_test, y_train, y_test = train_test_split(x, dataset.get_labels(),
stratify=dataset.get_labels(), test_size=0.4,
random_state=18)
except ValueError:
print('Could not stratify split due to uncommon class value, doing unstratified split instead')
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
x_train, x_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4,
random_state=18)
X_train_QI = X_train.loc[:, self.features_to_minimize]
X_test_QI = X_test.loc[:, self.features_to_minimize]
used_X_train = X_train
used_X_test = X_test
x_train_qi = x_train.loc[:, self.features_to_minimize]
x_test_qi = x_test.loc[:, self.features_to_minimize]
used_x_train = x_train
used_x_test = x_test
if self.train_only_features_to_minimize:
used_X_train = X_train_QI
used_X_test = X_test_QI
used_x_train = x_train_qi
used_x_test = x_test_qi
# collect feature data (such as min, max)
self._feature_data = self._get_feature_data(x)
@ -310,9 +313,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# prepare data for DT
self._encode_categorical_features(used_data, save_mapping=True)
x_prepared = self._encode_categorical_features(used_X_train)
x_prepared = self._encode_categorical_features(used_x_train)
self._dt.fit(x_prepared, y_train)
x_prepared_test = self._encode_categorical_features(used_X_test)
x_prepared_test = self._encode_categorical_features(used_x_test)
self._calculate_cells()
self._modify_cells()
@ -322,14 +325,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
nodes = self._get_nodes_level(0)
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
# self._cells currently holds the generalization created from the tree leaves
self._calculate_generalizations(X_test)
self._calculate_generalizations(x_test)
if generalize_using_transform:
generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells, self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(X_test, self.generalizations)
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
# check accuracy
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
@ -353,14 +356,14 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self._level -= 1
break
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
self._attach_cells_representatives(x_prepared, used_x_train, y_train, nodes)
self._calculate_generalizations(X_test)
self._calculate_generalizations(x_test)
if generalize_using_transform:
generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells,
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(X_test, self.generalizations)
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
# if accuracy passed threshold roll back to previous iteration generalizations
@ -378,26 +381,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
elif accuracy < self.target_accuracy:
print('Improving accuracy')
while accuracy < self.target_accuracy:
removed_feature = self._remove_feature_from_generalization(X_test, x_prepared_test,
removed_feature = self._remove_feature_from_generalization(x_test, x_prepared_test,
nodes, y_test,
self._feature_data, accuracy,
generalize_using_transform)
if removed_feature is None:
break
self._calculate_generalizations(X_test)
self._calculate_generalizations(x_test)
if generalize_using_transform:
generalized = self._generalize_from_tree(X_test, x_prepared_test, nodes, self.cells,
generalized = self._generalize_from_tree(x_test, x_prepared_test, nodes, self.cells,
self._cells_by_id)
else:
generalized = self._generalize_from_generalizations(X_test, self.generalizations)
generalized = self._generalize_from_generalizations(x_test, self.generalizations)
accuracy = self.estimator.score(ArrayDataset(self.encoder.transform(generalized), y_test))
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
# self._cells currently holds the chosen generalization based on target accuracy
# calculate iLoss
X_test_dataset = ArrayDataset(X_test, features_names=self._features)
X_test_dataset = ArrayDataset(x_test, features_names=self._features)
self._ncp_scores.fit_score = self.calculate_ncp(X_test_dataset, generalize_using_transform)
self._ncp_scores.generalizations_score = self.calculate_ncp(X_test_dataset, False)
@ -406,8 +409,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
dataset: Optional[ArrayDataset] = None):
""" Transforms data records to representative points. Updates stored ncp value to the one computed on the
transformed data.
""" Transforms data records to representative points. Also sets the transform_score in self.ncp.
:param X: The training input samples.
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
@ -457,8 +459,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
# count how many records are mapped to each cell
counted = np.zeros(samples_pd.shape[0]) # to mark records we already counted
ncp = 0
for i in range(len(self.cells)):
cell = self.cells[i]
for cell in self.cells:
count = self._get_record_count_for_cell(samples_pd, cell, counted)
range_counts = {}
category_counts = {}
@ -471,7 +472,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
else: # use generalizations
generalizations = self.generalizations
range_counts = self._find_range_counts(samples_pd, generalizations['ranges'])
category_counts = self._find_categories_counts(samples_pd, generalizations['categories'])
category_counts = self._find_category_counts(samples_pd, generalizations['categories'])
ncp = self._calc_ncp_for_generalization(generalizations, range_counts, category_counts, total_samples)
return ncp
@ -515,8 +516,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
else:
mapped = np.zeros(x.shape[0]) # to mark records we already mapped
all_indexes = []
for i in range(len(self.cells)):
indexes = self._get_record_indexes_for_cell(x, self.cells[i], mapped)
for cell in self.cells:
indexes = self._get_record_indexes_for_cell(x, cell, mapped)
all_indexes.append(indexes)
generalized = self._generalize_indexes(x, self.cells, all_indexes)
@ -556,13 +557,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return average_group_size / feature_data['range'] # number of values in category
@staticmethod
def _calc_ncp_numeric(feature_range, range_count, feature_data, total):
def _calc_ncp_numeric(range, range_count, feature_data, total):
# if there are no ranges, feature is suppressed and iLoss is 1
if not feature_range:
if not range:
return 1
# range only contains the split values, need to add min and max value of feature
# to enable computing sizes of all ranges
new_range = [feature_data['min']] + feature_range + [feature_data['max']]
new_range = [feature_data['min']] + range + [feature_data['max']]
range_sizes = [b - a for a, b in zip(new_range[::1], new_range[1::1])]
normalized_range_sizes = [s * n / total for s, n in zip(range_sizes, range_count)]
average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
@ -600,8 +601,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return count
def _cell_contains(self, cell, x, index, mapped):
for f in self._features:
i = self._features.index(f)
for i, f in enumerate(self._features):
if f in cell['ranges']:
if not self._cell_contains_numeric(i, cell['ranges'][f], x):
return False
@ -989,7 +989,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
range_min = sys.float_info.max
remove_feature = None
categories = self.generalizations['categories']
category_counts = self._find_categories_counts(original_data, categories)
category_counts = self._find_category_counts(original_data, categories)
for feature in ranges.keys():
if feature not in self._generalizations['untouched']:
@ -1053,8 +1053,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
counted = np.zeros(samples_pd.shape[0]) # to mark records we already counted
total = samples_pd.shape[0]
feature_ncp = 0
for i in range(len(self.cells)):
cell = self.cells[i]
for cell in self.cells:
count = self._get_record_count_for_cell(samples_pd, cell, counted)
generalizations = self._calculate_generalizations_for_cell(cell)
cell_ncp = 0
@ -1169,7 +1168,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
return range_counts
@staticmethod
def _find_categories_counts(samples, categories):
def _find_category_counts(samples, categories):
category_counts = {}
for c in categories.keys():
category_counts[c] = []

View file

@ -24,12 +24,12 @@ tf.compat.v1.disable_eager_execution()
@pytest.fixture
def dataset():
def diabetes_dataset():
return load_diabetes()
def test_minimizer_params():
# Assume two features, age and height, and boolean label
@pytest.fixture
def get_cells():
cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": None, "end": 170}}, "label": 0,
'categories': {}, "representative": {"age": 26, "height": 149}},
{"id": 2, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": None, "end": 170}}, "label": 1,
@ -40,14 +40,136 @@ def test_minimizer_params():
'categories': {}, "representative": {"age": 45, "height": 176}}
]
features = ['age', 'height']
X = np.array([[23, 165],
x = np.array([[23, 165],
[45, 158],
[18, 190]])
y = [1, 1, 0]
return cells, features, x, y
@pytest.fixture
def get_cells_categorical():
    """Provide hand-written cells and matching raw data that include a categorical feature.

    The data has three features (age, height, sex) and 0/1 labels.

    :return: Tuple ``(cells, features, x, y)`` where ``cells`` is a list of cell dictionaries,
             ``features`` the list of feature names, ``x`` a list of raw rows and ``y`` a
             numpy array of labels.
    """
    def build_cell(cell_id, label, hist, representative):
        # Nested containers are created anew on every call so that no two
        # cells share a mutable 'ranges', 'categories' or 'untouched' object.
        return {'id': cell_id, 'label': label,
                'ranges': {'age': {'start': None, 'end': None}},
                'categories': {'sex': ['f', 'm']}, 'hist': hist,
                'representative': representative,
                'untouched': ['height']}

    feature_names = ['age', 'height', 'sex']
    cell_list = [
        build_cell(1, 0, [2, 0], {'age': 45, 'height': 149, 'sex': 'f'}),
        build_cell(3, 1, [0, 3], {'age': 23, 'height': 165, 'sex': 'f'}),
        build_cell(4, 0, [1, 0], {'age': 18, 'height': 190, 'sex': 'm'}),
    ]
    rows = [
        [23, 165, 'f'],
        [45, 158, 'f'],
        [56, 123, 'f'],
        [67, 154, 'm'],
        [45, 149, 'f'],
        [42, 166, 'm'],
        [73, 172, 'm'],
        [94, 168, 'f'],
        [69, 175, 'm'],
        [24, 181, 'm'],
        [18, 190, 'm'],
    ]
    labels = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    return cell_list, feature_names, rows, labels
@pytest.fixture
def get_data_two_features():
    """Provide a small numeric dataset with two features (age, height).

    :return: Tuple ``(x, y, features, x1)`` where ``x`` is an 11x2 numpy array of samples,
             ``y`` the matching 0/1 labels, ``features`` the feature names and ``x1`` a
             second, 6x2 numpy array of samples.
    """
    feature_names = ['age', 'height']
    samples = np.array([
        [23, 165],
        [45, 158],
        [56, 123],
        [67, 154],
        [45, 149],
        [42, 166],
        [73, 172],
        [94, 168],
        [69, 175],
        [24, 181],
        [18, 190],
    ])
    labels = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    # Second sample set, returned as the last element of the tuple.
    extra_samples = np.array([
        [33, 165],
        [43, 150],
        [71, 143],
        [92, 194],
        [13, 125],
        [22, 169],
    ])
    return samples, labels, feature_names, extra_samples
@pytest.fixture
def get_data_three_features():
    """Provide a small numeric dataset with three features (age, height, weight).

    :return: Tuple ``(x, y, features)`` where ``x`` is an 11x3 numpy array of samples,
             ``y`` the matching 0/1 labels and ``features`` the feature names.
    """
    feature_names = ['age', 'height', 'weight']
    labels = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    samples = np.array([
        [23, 165, 70],
        [45, 158, 67],
        [56, 123, 65],
        [67, 154, 90],
        [45, 149, 67],
        [42, 166, 58],
        [73, 172, 68],
        [94, 168, 69],
        [69, 175, 80],
        [24, 181, 95],
        [18, 190, 102],
    ])
    return samples, labels, feature_names
@pytest.fixture
def get_data_four_features():
    """Provide a small mixed dataset with two numeric and two categorical features.

    The features are age, height, sex and ola. Rows are returned as plain lists; the
    fixture itself does not wrap them in a DataFrame.

    :return: Tuple ``(x, y, features, x1)`` where ``x`` is a list of 11 rows, ``y`` a numpy
             array of 0/1 labels, ``features`` the feature names and ``x1`` a second list of
             6 rows.
    """
    feature_names = ['age', 'height', 'sex', 'ola']
    rows = [
        [23, 165, 'f', 'aa'],
        [45, 158, 'f', 'aa'],
        [56, 123, 'f', 'bb'],
        [67, 154, 'm', 'aa'],
        [45, 149, 'f', 'bb'],
        [42, 166, 'm', 'bb'],
        [73, 172, 'm', 'bb'],
        [94, 168, 'f', 'aa'],
        [69, 175, 'm', 'aa'],
        [24, 181, 'm', 'bb'],
        [18, 190, 'm', 'bb'],
    ]
    labels = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    # Second row set, returned as the last element of the tuple.
    extra_rows = [
        [33, 165, 'f', 'aa'],
        [43, 150, 'm', 'aa'],
        [71, 143, 'f', 'aa'],
        [92, 194, 'm', 'aa'],
        [13, 125, 'f', 'aa'],
        [22, 169, 'f', 'bb'],
    ]
    return rows, labels, feature_names, extra_rows
@pytest.fixture
def get_data_five_features():
    """Provide a small mixed dataset with three numeric and two categorical features.

    The features are age, height, weight, sex and ola. Rows are returned as plain lists
    and the labels as a pandas Series.

    :return: Tuple ``(x, y, features)`` where ``x`` is a list of 11 rows, ``y`` a pandas
             Series of 0/1 labels and ``features`` the feature names.
    """
    feature_names = ['age', 'height', 'weight', 'sex', 'ola']
    labels = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    rows = [
        [23, 165, 65, 'f', 'aa'],
        [45, 158, 76, 'f', 'aa'],
        [56, 123, 78, 'f', 'bb'],
        [67, 154, 87, 'm', 'aa'],
        [45, 149, 45, 'f', 'bb'],
        [42, 166, 76, 'm', 'bb'],
        [73, 172, 85, 'm', 'bb'],
        [94, 168, 92, 'f', 'aa'],
        [69, 175, 95, 'm', 'aa'],
        [24, 181, 49, 'm', 'bb'],
        [18, 190, 69, 'm', 'bb'],
    ]
    return rows, labels, feature_names
def test_minimizer_params(get_cells):
# Assume two features, age and height, and boolean label
cells, features, x, y = get_cells
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
model.fit(ArrayDataset(x, y))
expected_generalizations = {'categories': {}, 'category_representatives': {},
'range_representatives': {'age': [38, 0.5, 40], 'height': [170, 0.5, 172]},
@ -68,60 +190,36 @@ def test_minimizer_params():
== set([frozenset(sl) for sl in gener['category_representatives'][key]]))
gen.fit()
gen.transform(dataset=ArrayDataset(X, features_names=features))
gen.transform(dataset=ArrayDataset(x, features_names=features))
def test_minimizer_params_not_transform():
def test_minimizer_params_not_transform(get_cells):
# Assume two features, age and height, and boolean label
cells = [{"id": 1, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": None, "end": 170}}, "label": 0,
'categories': {}, "representative": {"age": 26, "height": 149}},
{"id": 2, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": None, "end": 170}}, "label": 1,
'categories': {}, "representative": {"age": 58, "height": 163}},
{"id": 3, "ranges": {"age": {"start": None, "end": 38}, "height": {"start": 171, "end": None}}, "label": 0,
'categories': {}, "representative": {"age": 31, "height": 184}},
{"id": 4, "ranges": {"age": {"start": 39, "end": None}, "height": {"start": 171, "end": None}}, "label": 1,
'categories': {}, "representative": {"age": 45, "height": 176}}
]
features = ['age', 'height']
X = np.array([[23, 165],
[45, 158],
[18, 190]])
y = [1, 1, 0]
samples = ArrayDataset(X, y, features)
cells, features, x, y = get_cells
samples = ArrayDataset(x, y, features)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
model.fit(ArrayDataset(x, y))
gen = GeneralizeToRepresentative(model, cells=cells)
ncp = gen.calculate_ncp(samples, generalize_using_transform=False)
assert (ncp > 0.0)
def test_minimizer_fit():
features = ['age', 'height']
X = np.array([[23, 165],
[45, 158],
[56, 123],
[67, 154],
[45, 149],
[42, 166],
[73, 172],
[94, 168],
[69, 175],
[24, 181],
[18, 190]])
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
def test_minimizer_fit(get_data_two_features):
x, y, features, _ = get_data_two_features
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
ad = ArrayDataset(X)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
train_dataset = ArrayDataset(X, predictions, features_names=features)
train_dataset = ArrayDataset(x, predictions, features_names=features)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ad)
@ -141,48 +239,30 @@ def test_minimizer_fit():
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x, indexes, axis=1)).all())
ncp = gen.ncp.transform_score
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0.0)
assert (((transformed[indexes]) != (X[indexes])).any())
assert (((transformed[indexes]) != (x[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_ncp():
features = ['age', 'height']
X = np.array([[23, 165],
[45, 158],
[56, 123],
[67, 154],
[45, 149],
[42, 166],
[73, 172],
[94, 168],
[69, 175],
[24, 181],
[18, 190]])
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X1 = np.array([[33, 165],
[43, 150],
[71, 143],
[92, 194],
[13, 125],
[22, 169]])
def test_minimizer_ncp(get_data_two_features):
x, y, features, x1 = get_data_two_features
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
ad = ArrayDataset(X)
ad1 = ArrayDataset(X1, features_names=features)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
ad1 = ArrayDataset(x1, features_names=features)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.4
train_dataset = ArrayDataset(X, predictions, features_names=features)
train_dataset = ArrayDataset(x, predictions, features_names=features)
gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
gen1.fit(dataset=train_dataset, generalize_using_transform=False)
@ -206,28 +286,10 @@ def test_minimizer_ncp():
assert (ncp6 == ncp4)
def test_minimizer_ncp_categorical():
features = ['age', 'height', 'sex', 'ola']
X = [[23, 165, 'f', 'aa'],
[45, 158, 'f', 'aa'],
[56, 123, 'f', 'bb'],
[67, 154, 'm', 'aa'],
[45, 149, 'f', 'bb'],
[42, 166, 'm', 'bb'],
[73, 172, 'm', 'bb'],
[94, 168, 'f', 'aa'],
[69, 175, 'm', 'aa'],
[24, 181, 'm', 'bb'],
[18, 190, 'm', 'bb']]
X = pd.DataFrame(X, columns=features)
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X1 = [[33, 165, 'f', 'aa'],
[43, 150, 'm', 'aa'],
[71, 143, 'f', 'aa'],
[92, 194, 'm', 'aa'],
[13, 125, 'f', 'aa'],
[22, 169, 'f', 'bb']]
X1 = pd.DataFrame(X1, columns=features)
def test_minimizer_ncp_categorical(get_data_four_features):
x, y, features, x1 = get_data_four_features
x = pd.DataFrame(x, columns=features)
x1 = pd.DataFrame(x1, columns=features)
numeric_features = ["age", "height"]
numeric_transformer = Pipeline(
@ -243,20 +305,20 @@ def test_minimizer_ncp_categorical():
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(X)
encoded = preprocessor.fit_transform(x)
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(encoded, y))
ad = ArrayDataset(X)
ad1 = ArrayDataset(X1)
ad = ArrayDataset(x)
ad1 = ArrayDataset(x1)
predictions = model.predict(ArrayDataset(encoded))
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.4
train_dataset = ArrayDataset(X, predictions, features_names=features)
train_dataset = ArrayDataset(x, predictions, features_names=features)
gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features)
@ -281,31 +343,19 @@ def test_minimizer_ncp_categorical():
assert (ncp6 == ncp4)
def test_minimizer_fit_not_transform():
features = ['age', 'height']
X = np.array([[23, 165],
[45, 158],
[56, 123],
[67, 154],
[45, 149],
[42, 166],
[73, 172],
[94, 168],
[69, 175],
[24, 181],
[18, 190]])
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
def test_minimizer_fit_not_transform(get_data_two_features):
x, y, features, x1 = get_data_two_features
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
ad = ArrayDataset(X)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
train_dataset = ArrayDataset(X, predictions, features_names=features)
train_dataset = ArrayDataset(x, predictions, features_names=features)
gen.fit(dataset=train_dataset, generalize_using_transform=False)
gener = gen.generalizations
@ -330,21 +380,9 @@ def test_minimizer_fit_not_transform():
assert (ncp > 0.0)
def test_minimizer_fit_pandas():
features = ['age', 'height', 'sex', 'ola']
X = [[23, 165, 'f', 'aa'],
[45, 158, 'f', 'aa'],
[56, 123, 'f', 'bb'],
[67, 154, 'm', 'aa'],
[45, 149, 'f', 'bb'],
[42, 166, 'm', 'bb'],
[73, 172, 'm', 'bb'],
[94, 168, 'f', 'aa'],
[69, 175, 'm', 'aa'],
[24, 181, 'm', 'bb'],
[18, 190, 'm', 'bb']]
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X = pd.DataFrame(X, columns=features)
def test_minimizer_fit_pandas(get_data_four_features):
x, y, features, _ = get_data_four_features
x = pd.DataFrame(x, columns=features)
numeric_features = ["age", "height"]
numeric_transformer = Pipeline(
@ -360,7 +398,7 @@ def test_minimizer_fit_pandas():
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(X)
encoded = preprocessor.fit_transform(x)
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
@ -375,9 +413,9 @@ def test_minimizer_fit_pandas():
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features)
train_dataset = ArrayDataset(X, predictions)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
transformed = gen.transform(dataset=ArrayDataset(x))
gener = gen.generalizations
expected_generalizations = {'ranges': {'age': []}, 'categories': {},
'untouched': ['height', 'sex', 'ola']}
@ -391,47 +429,21 @@ def test_minimizer_fit_pandas():
modified_features = [f for f in features if
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x.drop(modified_features, axis=1))
ncp = gen.ncp.transform_score
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0.0)
assert (((transformed[modified_features]).equals(X[modified_features])) is False)
assert (((transformed[modified_features]).equals(x[modified_features])) is False)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_params_categorical():
def test_minimizer_params_categorical(get_cells_categorical):
# Assume three features, age, sex and height, and boolean label
cells = [{'id': 1, 'label': 0, 'ranges': {'age': {'start': None, 'end': None}},
'categories': {'sex': ['f', 'm']}, 'hist': [2, 0],
'representative': {'age': 45, 'height': 149, 'sex': 'f'},
'untouched': ['height']},
{'id': 3, 'label': 1, 'ranges': {'age': {'start': None, 'end': None}},
'categories': {'sex': ['f', 'm']}, 'hist': [0, 3],
'representative': {'age': 23, 'height': 165, 'sex': 'f'},
'untouched': ['height']},
{'id': 4, 'label': 0, 'ranges': {'age': {'start': None, 'end': None}},
'categories': {'sex': ['f', 'm']}, 'hist': [1, 0],
'representative': {'age': 18, 'height': 190, 'sex': 'm'},
'untouched': ['height']}
]
cells, features, x, y = get_cells_categorical
features = ['age', 'height', 'sex']
X = [[23, 165, 'f'],
[45, 158, 'f'],
[56, 123, 'f'],
[67, 154, 'm'],
[45, 149, 'f'],
[42, 166, 'm'],
[73, 172, 'm'],
[94, 168, 'f'],
[69, 175, 'm'],
[24, 181, 'm'],
[18, 190, 'm']]
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X = pd.DataFrame(X, columns=features)
x = pd.DataFrame(x, columns=features)
numeric_features = ["age", "height"]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
@ -446,7 +458,7 @@ def test_minimizer_params_categorical():
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(X)
encoded = preprocessor.fit_transform(x)
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
@ -460,41 +472,28 @@ def test_minimizer_params_categorical():
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, cells=cells)
train_dataset = ArrayDataset(X, predictions)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
transformed = gen.transform(dataset=ArrayDataset(x))
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_fit_QI():
features = ['age', 'height', 'weight']
X = np.array([[23, 165, 70],
[45, 158, 67],
[56, 123, 65],
[67, 154, 90],
[45, 149, 67],
[42, 166, 58],
[73, 172, 68],
[94, 168, 69],
[69, 175, 80],
[24, 181, 95],
[18, 190, 102]])
print(X)
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
QI = ['age', 'weight']
def test_minimizer_fit_qi(get_data_three_features):
x, y, features = get_data_three_features
qi = ['age', 'weight']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
ad = ArrayDataset(X)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
train_dataset = ArrayDataset(X, predictions, features_names=features)
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
train_dataset = ArrayDataset(x, predictions, features_names=features)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ad)
gener = gen.generalizations
@ -505,7 +504,7 @@ def test_minimizer_fit_QI():
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]])
== set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(x, [0, 2], axis=1)).all())
modified_features = [f for f in features if
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
@ -513,33 +512,20 @@ def test_minimizer_fit_QI():
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x, indexes, axis=1)).all())
ncp = gen.ncp.transform_score
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0.0)
assert (((transformed[indexes]) != (X[indexes])).any())
assert (((transformed[indexes]) != (x[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_minimizer_fit_pandas_QI():
features = ['age', 'height', 'weight', 'sex', 'ola']
X = [[23, 165, 65, 'f', 'aa'],
[45, 158, 76, 'f', 'aa'],
[56, 123, 78, 'f', 'bb'],
[67, 154, 87, 'm', 'aa'],
[45, 149, 45, 'f', 'bb'],
[42, 166, 76, 'm', 'bb'],
[73, 172, 85, 'm', 'bb'],
[94, 168, 92, 'f', 'aa'],
[69, 175, 95, 'm', 'aa'],
[24, 181, 49, 'm', 'bb'],
[18, 190, 69, 'm', 'bb']]
y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X = pd.DataFrame(X, columns=features)
QI = ['age', 'weight', 'ola']
def test_minimizer_fit_pandas_qi(get_data_five_features):
x, y, features = get_data_five_features
x = pd.DataFrame(x, columns=features)
qi = ['age', 'weight', 'ola']
numeric_features = ["age", "height", "weight"]
numeric_transformer = Pipeline(
@ -555,7 +541,7 @@ def test_minimizer_fit_pandas_QI():
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(X)
encoded = preprocessor.fit_transform(x)
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
@ -569,10 +555,10 @@ def test_minimizer_fit_pandas_QI():
# Now we have a full prediction pipeline.
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
train_dataset = ArrayDataset(X, predictions)
categorical_features=categorical_features, features_to_minimize=qi)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
transformed = gen.transform(dataset=ArrayDataset(x))
gener = gen.generalizations
expected_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
'untouched': ['height', 'sex']}
@ -583,17 +569,15 @@ def test_minimizer_fit_pandas_QI():
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]])
== set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
# assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
np.testing.assert_array_equal(transformed.drop(qi, axis=1), x.drop(qi, axis=1))
modified_features = [f for f in features if
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
# assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x.drop(modified_features, axis=1))
ncp = gen.ncp.transform_score
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0.0)
assert (((transformed[modified_features]).equals(X[modified_features])) is False)
assert (((transformed[modified_features]).equals(x[modified_features])) is False)
rel_accuracy = model.score(ArrayDataset(preprocessor.transform(transformed), predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
@ -601,8 +585,8 @@ def test_minimizer_fit_pandas_QI():
def test_minimize_ndarray_iris():
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
(x_train, y_train), (x_test, y_test) = get_iris_dataset_np()
QI = ['sepal length (cm)', 'petal length (cm)']
(x_train, y_train), _ = get_iris_dataset_np()
qi = ['sepal length (cm)', 'petal length (cm)']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
@ -611,8 +595,7 @@ def test_minimize_ndarray_iris():
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.3
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
# gen.fit(dataset=ArrayDataset(x_train, predictions))
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
gener = gen.generalizations
expected_generalizations = {'ranges': {'sepal length (cm)': [], 'petal length (cm)': [2.449999988079071]},
@ -644,7 +627,7 @@ def test_minimize_ndarray_iris():
def test_minimize_pandas_adult():
(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()
(x_train, y_train), _ = get_adult_dataset_pd()
x_train = x_train.head(1000)
y_train = y_train.head(1000)
@ -655,7 +638,7 @@ def test_minimize_pandas_adult():
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'hours-per-week', 'native-country']
QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
qi = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country']
numeric_features = [f for f in features if f not in categorical_features]
@ -680,7 +663,7 @@ def test_minimize_pandas_adult():
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
categorical_features=categorical_features, features_to_minimize=qi)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations
@ -704,13 +687,13 @@ def test_minimize_pandas_adult():
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]])
== set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
# assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))
np.testing.assert_array_equal(transformed.drop(qi, axis=1), x_train.drop(qi, axis=1))
modified_features = [f for f in features if
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
# assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
ncp = gen.ncp.transform_score
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
@ -722,7 +705,7 @@ def test_minimize_pandas_adult():
def test_german_credit_pandas():
(x_train, y_train), (x_test, y_test) = get_german_credit_dataset_pd()
(x_train, y_train), _ = get_german_credit_dataset_pd()
features = ["Existing_checking_account", "Duration_in_month", "Credit_history", "Purpose", "Credit_amount",
"Savings_account", "Present_employment_since", "Installment_rate", "Personal_status_sex", "debtors",
"Present_residence", "Property", "Age", "Other_installment_plans", "Housing",
@ -731,7 +714,7 @@ def test_german_credit_pandas():
categorical_features = ["Existing_checking_account", "Credit_history", "Purpose", "Savings_account",
"Present_employment_since", "Personal_status_sex", "debtors", "Property",
"Other_installment_plans", "Housing", "Job"]
QI = ["Duration_in_month", "Credit_history", "Purpose", "debtors", "Property", "Other_installment_plans",
qi = ["Duration_in_month", "Credit_history", "Purpose", "debtors", "Property", "Other_installment_plans",
"Housing", "Job"]
numeric_features = [f for f in features if f not in categorical_features]
@ -756,7 +739,7 @@ def test_german_credit_pandas():
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
categorical_features=categorical_features, features_to_minimize=qi)
gen.fit(dataset=ArrayDataset(x_train, predictions))
transformed = gen.transform(dataset=ArrayDataset(x_train))
gener = gen.generalizations
@ -782,13 +765,13 @@ def test_german_credit_pandas():
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]])
== set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
# assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))
np.testing.assert_array_equal(transformed.drop(qi, axis=1), x_train.drop(qi, axis=1))
modified_features = [f for f in features if
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
# assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
ncp = gen.ncp.transform_score
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
@ -799,20 +782,21 @@ def test_german_credit_pandas():
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_regression(dataset):
x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)
def test_regression(diabetes_dataset):
x_train, x_test, y_train, y_test = train_test_split(diabetes_dataset.data, diabetes_dataset.target, test_size=0.5,
random_state=14)
base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2)
model = SklearnRegressor(base_est)
model.fit(ArrayDataset(x_train, y_train))
predictions = model.predict(ArrayDataset(x_train))
QI = ['age', 'bmi', 's2', 's5']
qi = ['age', 'bmi', 's2', 's5']
features = ['age', 'sex', 'bmi', 'bp',
's1', 's2', 's3', 's4', 's5', 's6']
target_accuracy = 0.7
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, is_regression=True,
features_to_minimize=QI)
features_to_minimize=qi)
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
print('Base model accuracy (R2 score): ', model.score(ArrayDataset(x_test, y_test)))
@ -872,9 +856,9 @@ def test_regression(dataset):
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_X_y():
def test_x_y():
features = [0, 1, 2]
X = np.array([[23, 165, 70],
x = np.array([[23, 165, 70],
[45, 158, 67],
[56, 123, 65],
[67, 154, 90],
@ -885,21 +869,21 @@ def test_X_y():
[69, 175, 80],
[24, 181, 95],
[18, 190, 102]])
print(X)
print(x)
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
QI = [0, 2]
qi = [0, 2]
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
ad = ArrayDataset(X)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
gen.fit(X=X, y=predictions)
transformed = gen.transform(X)
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
gen.fit(X=x, y=predictions)
transformed = gen.transform(x)
gener = gen.generalizations
expected_generalizations = {'ranges': {'0': [], '2': [67.5]}, 'categories': {}, 'untouched': ['1']}
for key in expected_generalizations['ranges']:
@ -908,7 +892,7 @@ def test_X_y():
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]])
== set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(x, [0, 2], axis=1)).all())
modified_features = [f for f in features if
str(f) in expected_generalizations['categories'].keys() or str(f) in expected_generalizations[
'ranges'].keys()]
@ -916,19 +900,19 @@ def test_X_y():
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x, indexes, axis=1)).all())
ncp = gen.ncp.transform_score
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0.0)
assert (((transformed[indexes]) != (X[indexes])).any())
assert (((transformed[indexes]) != (x[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_X_y_features_names():
def test_x_y_features_names():
features = ['age', 'height', 'weight']
X = np.array([[23, 165, 70],
x = np.array([[23, 165, 70],
[45, 158, 67],
[56, 123, 65],
[67, 154, 90],
@ -939,21 +923,21 @@ def test_X_y_features_names():
[69, 175, 80],
[24, 181, 95],
[18, 190, 102]])
print(X)
print(x)
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
QI = ['age', 'weight']
qi = ['age', 'weight']
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
ad = ArrayDataset(X)
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=QI)
gen.fit(X=X, y=predictions, features_names=features)
transformed = gen.transform(X=X, features_names=features)
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, features_to_minimize=qi)
gen.fit(X=x, y=predictions, features_names=features)
transformed = gen.transform(X=x, features_names=features)
gener = gen.generalizations
expected_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
for key in expected_generalizations['ranges']:
@ -962,7 +946,7 @@ def test_X_y_features_names():
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]])
== set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(x, [0, 2], axis=1)).all())
modified_features = [f for f in features if
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
@ -970,32 +954,19 @@ def test_X_y_features_names():
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x, indexes, axis=1)).all())
ncp = gen.ncp.transform_score
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0.0)
assert (((transformed[indexes]) != (X[indexes])).any())
assert (((transformed[indexes]) != (x[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_BaseEstimator_classification():
features = ['age', 'height', 'weight', 'sex', 'ola']
X = [[23, 165, 65, 'f', 'aa'],
[45, 158, 76, 'f', 'aa'],
[56, 123, 78, 'f', 'bb'],
[67, 154, 87, 'm', 'aa'],
[45, 149, 45, 'f', 'bb'],
[42, 166, 76, 'm', 'bb'],
[73, 172, 85, 'm', 'bb'],
[94, 168, 92, 'f', 'aa'],
[69, 175, 95, 'm', 'aa'],
[24, 181, 49, 'm', 'bb'],
[18, 190, 69, 'm', 'bb']]
y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
X = pd.DataFrame(X, columns=features)
def test_BaseEstimator_classification(get_data_five_features):
x, y, features = get_data_five_features
x = pd.DataFrame(x, columns=features)
QI = ['age', 'weight', 'ola']
numeric_features = ["age", "height", "weight"]
@ -1012,7 +983,7 @@ def test_BaseEstimator_classification():
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(X)
encoded = preprocessor.fit_transform(x)
encoded = pd.DataFrame(encoded)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
@ -1025,9 +996,9 @@ def test_BaseEstimator_classification():
target_accuracy = 0.5
gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
categorical_features=categorical_features, features_to_minimize=QI)
train_dataset = ArrayDataset(X, predictions)
train_dataset = ArrayDataset(x, predictions)
gen.fit(dataset=train_dataset)
transformed = gen.transform(dataset=ArrayDataset(X))
transformed = gen.transform(dataset=ArrayDataset(x))
gener = gen.generalizations
expected_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
'untouched': ['height', 'sex']}
@ -1038,24 +1009,25 @@ def test_BaseEstimator_classification():
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]])
== set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
# assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
np.testing.assert_array_equal(transformed.drop(QI, axis=1), x.drop(QI, axis=1))
modified_features = [f for f in features if
f in expected_generalizations['categories'].keys() or f in expected_generalizations[
'ranges'].keys()]
# assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x.drop(modified_features, axis=1))
ncp = gen.ncp.transform_score
if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
assert (ncp > 0.0)
assert (((transformed[modified_features]).equals(X[modified_features])) is False)
assert (((transformed[modified_features]).equals(x[modified_features])) is False)
rel_accuracy = model.score(preprocessor.transform(transformed), predictions)
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
def test_BaseEstimator_regression(dataset):
x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)
def test_BaseEstimator_regression(diabetes_dataset):
x_train, x_test, y_train, y_test = train_test_split(diabetes_dataset.data, diabetes_dataset.target, test_size=0.5,
random_state=14)
base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2)
model = base_est
@ -1127,7 +1099,7 @@ def test_BaseEstimator_regression(dataset):
def test_keras_model():
(X, y), (x_test, y_test) = get_iris_dataset_np()
(x, y), (x_test, y_test) = get_iris_dataset_np()
base_est = Sequential()
base_est.add(Input(shape=(4,)))
@ -1137,7 +1109,7 @@ def test_keras_model():
base_est.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model = KerasClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
model.fit(ArrayDataset(x, y))
ad = ArrayDataset(x_test)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
@ -1161,7 +1133,7 @@ def test_keras_model():
ncp = gen.ncp.transform_score
if len(gener['ranges'].keys()) > 0 or len(gener['categories'].keys()) > 0:
assert (ncp > 0.0)
assert (((transformed[indexes]) != (X[indexes])).any())
assert (((transformed[indexes]) != (x[indexes])).any())
rel_accuracy = model.score(ArrayDataset(transformed, predictions))
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
@ -1187,27 +1159,3 @@ def test_untouched():
assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]])
== set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
def test_errors():
features = ['age', 'height']
X = np.array([[23, 165],
[45, 158],
[56, 123],
[67, 154],
[45, 149],
[42, 166],
[73, 172],
[94, 168],
[69, 175],
[24, 181],
[18, 190]])
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)
model.fit(ArrayDataset(X, y))
ad = ArrayDataset(X)
predictions = model.predict(ad)
if predictions.shape[1] > 1:
predictions = np.argmax(predictions, axis=1)