mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-06-02 14:45:13 +02:00
Fix computing generalizations from transformed data + add some tests
Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
26adcf3528
commit
aa38a1d716
2 changed files with 213 additions and 126 deletions
|
|
@ -178,7 +178,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
|
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
|
||||||
features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None):
|
features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None):
|
||||||
"""
|
"""
|
||||||
Learns the generalizations based on training data, and applies them to the data.
|
Learns the generalizations based on training data, and applies them to the data. Updates stored ncp value to the
|
||||||
|
one computed on the training data.
|
||||||
|
|
||||||
:param X: The training input samples.
|
:param X: The training input samples.
|
||||||
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
|
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
|
||||||
|
|
@ -383,7 +384,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
|
|
||||||
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
|
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
|
||||||
dataset: Optional[ArrayDataset] = None):
|
dataset: Optional[ArrayDataset] = None):
|
||||||
""" Transforms data records to representative points.
|
""" Transforms data records to representative points. Updates stored ncp value to the one computed on the
|
||||||
|
transformed data.
|
||||||
|
|
||||||
:param X: The training input samples.
|
:param X: The training input samples.
|
||||||
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
|
:type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional
|
||||||
|
|
@ -407,7 +409,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
def calculate_ncp(self, samples: Optional[ArrayDataset] = None, transformed: Optional[bool] = False):
|
def calculate_ncp(self, samples: Optional[ArrayDataset] = None, transformed: Optional[bool] = False):
|
||||||
"""
|
"""
|
||||||
Compute the NCP score of the generalization. Calculation is based on the value of the
|
Compute the NCP score of the generalization. Calculation is based on the value of the
|
||||||
generalize_using_transform param.
|
generalize_using_transform param. If samples are provided, updates stored ncp value to the one computed on the
|
||||||
|
provided data. If samples not provided, returns the last NCP score computed by the `fit` or `transform` method.
|
||||||
|
|
||||||
Based on the NCP score presented in: Ghinita, G., Karras, P., Kalnis, P., Mamoulis, N.: Fast data anonymization
|
Based on the NCP score presented in: Ghinita, G., Karras, P., Kalnis, P., Mamoulis, N.: Fast data anonymization
|
||||||
with low information loss (https://www.vldb.org/conf/2007/papers/research/p758-ghinita.pdf)
|
with low information loss (https://www.vldb.org/conf/2007/papers/research/p758-ghinita.pdf)
|
||||||
|
|
||||||
|
|
@ -423,13 +427,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
if samples is None:
|
if samples is None:
|
||||||
return self._ncp
|
return self._ncp
|
||||||
|
|
||||||
|
if not samples.features_names:
|
||||||
|
raise ValueError('features_names should be set in input ArrayDataset.')
|
||||||
samples_pd = pd.DataFrame(samples.get_samples(), columns=samples.features_names)
|
samples_pd = pd.DataFrame(samples.get_samples(), columns=samples.features_names)
|
||||||
if self._features is None:
|
if self._features is None:
|
||||||
self._features = samples.features_names
|
self._features = samples.features_names
|
||||||
if self._feature_data is None:
|
if self._feature_data is None:
|
||||||
self._feature_data = self._get_feature_data(samples_pd)
|
self._feature_data = self._get_feature_data(samples_pd)
|
||||||
|
total_samples = samples_pd.shape[0]
|
||||||
|
|
||||||
if self.generalize_using_transform:
|
if self.generalize_using_transform:
|
||||||
|
# TODO: not sure I need to transform, data should be mapped to correct cell both with or without transforming
|
||||||
if not transformed:
|
if not transformed:
|
||||||
# transform data
|
# transform data
|
||||||
transformed_data = self._inner_transform(dataset=samples) # can return numpy or pandas
|
transformed_data = self._inner_transform(dataset=samples) # can return numpy or pandas
|
||||||
|
|
@ -437,35 +445,28 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
transformed_data = pd.DataFrame(transformed_data, columns=samples.features_names)
|
transformed_data = pd.DataFrame(transformed_data, columns=samples.features_names)
|
||||||
else:
|
else:
|
||||||
transformed_data = samples_pd
|
transformed_data = samples_pd
|
||||||
range_counts, category_counts = self._calculate_transformed_generalizations(transformed_data)
|
generalizations = self._calculate_transformed_generalizations(transformed_data)
|
||||||
generalizations = self._transformed_generalizations
|
# count how many transformed values are mapped to each cell
|
||||||
|
counted = np.zeros(transformed_data.shape[0]) # to mark records we already counted
|
||||||
|
ncp = 0
|
||||||
|
for i in range(len(self.cells)):
|
||||||
|
cell = self.cells[i]
|
||||||
|
count = self._get_record_count_for_cell(transformed_data, cell, counted)
|
||||||
|
range_counts = {}
|
||||||
|
category_counts = {}
|
||||||
|
for feature in cell['ranges']:
|
||||||
|
range_counts[feature] = [count]
|
||||||
|
for feature in cell['categories']:
|
||||||
|
category_counts[feature] = [count]
|
||||||
|
ncp += self._calc_ncp_for_generalization(generalizations[cell['id']], range_counts, category_counts,
|
||||||
|
total_samples)
|
||||||
|
self._ncp = ncp
|
||||||
else: # use generalizations
|
else: # use generalizations
|
||||||
generalizations = self.generalizations
|
generalizations = self.generalizations
|
||||||
range_counts = self._find_range_counts(samples_pd, generalizations['ranges'])
|
range_counts = self._find_range_counts(samples_pd, generalizations['ranges'])
|
||||||
category_counts = self._find_categories_counts(samples_pd, generalizations['categories'])
|
category_counts = self._find_categories_counts(samples_pd, generalizations['categories'])
|
||||||
|
self._ncp = self._calc_ncp_for_generalization(generalizations, range_counts, category_counts, total_samples)
|
||||||
|
|
||||||
# suppressed features are already taken care of within _calc_ncp_numeric
|
|
||||||
#TODO: check that this is the case for tramsformed as well
|
|
||||||
ranges = generalizations['ranges']
|
|
||||||
categories = generalizations['categories']
|
|
||||||
|
|
||||||
total = samples_pd.shape[0]
|
|
||||||
total_ncp = 0
|
|
||||||
total_features = len(generalizations['untouched'])
|
|
||||||
for feature in ranges.keys():
|
|
||||||
feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature],
|
|
||||||
self._feature_data[feature], total)
|
|
||||||
total_ncp = total_ncp + feature_ncp
|
|
||||||
total_features += 1
|
|
||||||
for feature in categories.keys():
|
|
||||||
feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature],
|
|
||||||
self._feature_data[feature],
|
|
||||||
total)
|
|
||||||
total_ncp = total_ncp + feature_ncp
|
|
||||||
total_features += 1
|
|
||||||
if total_features == 0:
|
|
||||||
return 0
|
|
||||||
self._ncp = total_ncp / total_features
|
|
||||||
return self._ncp
|
return self._ncp
|
||||||
|
|
||||||
def _inner_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
|
def _inner_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None,
|
||||||
|
|
@ -518,6 +519,28 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
return generalized
|
return generalized
|
||||||
return generalized.to_numpy()
|
return generalized.to_numpy()
|
||||||
|
|
||||||
|
def _calc_ncp_for_generalization(self, generalization, range_counts, category_counts, total_count):
    """
    Average the per-feature NCP scores of a single generalization.

    Every feature listed under ``generalization['untouched']`` is counted in the denominator but adds nothing
    to the sum. Every feature under ``'ranges'`` / ``'categories'`` adds the score returned by
    ``_calc_ncp_numeric`` / ``_calc_ncp_categorical`` respectively.

    :param generalization: dict with the keys ``'ranges'``, ``'categories'`` and ``'untouched'``.
    :param range_counts: per-feature counts, indexed by the feature names found in ``generalization['ranges']``.
    :param category_counts: per-feature counts, indexed by the feature names found in
                            ``generalization['categories']``.
    :param total_count: forwarded unchanged as the last argument of both per-feature helpers.
    :return: sum of the per-feature scores divided by the number of features, or 0 when the generalization
             covers no features at all.
    """
    numeric_bounds = generalization['ranges']
    category_groups = generalization['categories']

    feature_total = len(generalization['untouched']) + len(numeric_bounds) + len(category_groups)
    if feature_total == 0:
        return 0

    # suppressed features are already taken care of within _calc_ncp_numeric
    scores = [
        self._calc_ncp_numeric(bounds, range_counts[name], self._feature_data[name], total_count)
        for name, bounds in numeric_bounds.items()
    ]
    scores += [
        self._calc_ncp_categorical(groups, category_counts[name], self._feature_data[name], total_count)
        for name, groups in category_groups.items()
    ]
    return sum(scores) / feature_total
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _calc_ncp_categorical(categories, category_count, feature_data, total):
|
def _calc_ncp_categorical(categories, category_count, feature_data, total):
|
||||||
category_sizes = [len(g) if len(g) > 1 else 0 for g in categories]
|
category_sizes = [len(g) if len(g) > 1 else 0 for g in categories]
|
||||||
|
|
@ -538,7 +561,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
|
average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes)
|
||||||
return average_range_size / (feature_data['max'] - feature_data['min'])
|
return average_range_size / (feature_data['max'] - feature_data['min'])
|
||||||
|
|
||||||
|
|
||||||
def _get_feature_data(self, x):
|
def _get_feature_data(self, x):
|
||||||
feature_data = {}
|
feature_data = {}
|
||||||
for feature in self._features:
|
for feature in self._features:
|
||||||
|
|
@ -561,6 +583,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
indexes.append(index)
|
indexes.append(index)
|
||||||
return indexes
|
return indexes
|
||||||
|
|
||||||
|
def _get_record_count_for_cell(self, X, cell, mapped):
    """
    Count the rows of ``X`` that are not yet flagged in ``mapped`` and that ``cell`` contains.

    :param X: records to test; iterated row by row with ``iterrows``.
    :param cell: the cell to test against, forwarded to ``_cell_contains``.
    :param mapped: array queried with ``mapped.item(label)``; rows whose entry is truthy are skipped without
                   calling ``_cell_contains``.
    :return: the number of rows for which ``_cell_contains`` returned a truthy value.
    """
    # NOTE(review): the row label from iterrows() is used directly as the position in ``mapped``, which
    # presumably assumes X carries a default 0..n-1 index - confirm against callers.
    # NOTE(review): ``mapped`` is also handed to _cell_contains, presumably so it can flag matched rows -
    # confirm in that method.
    return sum(
        1
        for label, record in X.iterrows()
        if not mapped.item(label) and self._cell_contains(cell, record, label, mapped)
    )
|
||||||
|
|
||||||
def _cell_contains(self, cell, x, index, mapped):
|
def _cell_contains(self, cell, x, index, mapped):
|
||||||
for f in self._features:
|
for f in self._features:
|
||||||
i = self._features.index(f)
|
i = self._features.index(f)
|
||||||
|
|
@ -880,7 +909,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
current_accuracy)
|
current_accuracy)
|
||||||
if feature is None:
|
if feature is None:
|
||||||
return None
|
return None
|
||||||
GeneralizeToRepresentative._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
|
self._remove_feature_from_cells(self.cells, self._cells_by_id, feature)
|
||||||
return feature
|
return feature
|
||||||
|
|
||||||
def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy):
|
def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy):
|
||||||
|
|
@ -946,97 +975,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
return remove_feature
|
return remove_feature
|
||||||
|
|
||||||
def _calculate_generalizations(self):
|
def _calculate_generalizations(self):
|
||||||
self._generalizations = {'ranges': GeneralizeToRepresentative._calculate_ranges(self.cells),
|
self._generalizations = {'ranges': self._calculate_ranges(self.cells),
|
||||||
'categories': GeneralizeToRepresentative._calculate_categories(self.cells),
|
'categories': self._calculate_categories(self.cells),
|
||||||
'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)}
|
'untouched': self._calculate_untouched(self.cells)}
|
||||||
self._remove_categorical_untouched(self._generalizations)
|
self._remove_categorical_untouched(self._generalizations)
|
||||||
|
|
||||||
|
def _calculate_generalizations_per_cell(self, cell):
    """
    Build the generalizations dict for one cell only.

    Calls the same helpers as ``_calculate_generalizations``, but on a one-element list holding ``cell``
    instead of on ``self.cells``, and returns the result instead of storing it on the instance.

    :param cell: the single cell to derive generalizations from.
    :return: dict with the keys ``'ranges'``, ``'categories'`` and ``'untouched'``, after
             ``_remove_categorical_untouched`` has been applied to it.
    """
    builders = (
        ('ranges', self._calculate_ranges),
        ('categories', self._calculate_categories),
        ('untouched', self._calculate_untouched),
    )
    # each helper receives its own fresh single-element list
    per_cell = {key: build([cell]) for key, build in builders}
    self._remove_categorical_untouched(per_cell)
    return per_cell
|
||||||
|
|
||||||
def _calculate_transformed_generalizations(self, transformed):
|
def _calculate_transformed_generalizations(self, transformed):
|
||||||
# transformed data should only consist of representative values from cells (when removing untouched features)
|
# calculate generalizations separately per cell
|
||||||
ranges = {}
|
cell_generalizations = {}
|
||||||
categories = {}
|
for cell in self.cells:
|
||||||
range_counts = {}
|
cell_generalizations[cell['id']] = self._calculate_generalizations_per_cell(cell)
|
||||||
category_counts = {}
|
return cell_generalizations
|
||||||
|
|
||||||
unique_records = transformed.value_counts().reset_index(name='count')
|
|
||||||
representatives = unique_records.drop('count', axis=1)
|
|
||||||
representative_counts = unique_records['count'] # needed to normalize ncp according to quantity
|
|
||||||
index = 0
|
|
||||||
for _, record in representatives.iterrows():
|
|
||||||
# TODO: what if some cells are not present, we will not take their generalizations into account. We need to
|
|
||||||
# "gain" ncp in this case...
|
|
||||||
record_dict = self.pandas_record_to_dict(record)
|
|
||||||
for cell in self.cells:
|
|
||||||
representative = cell["representative"].copy()
|
|
||||||
record_copy = record_dict.copy()
|
|
||||||
if 'untouched' in cell:
|
|
||||||
for feature in cell['untouched']:
|
|
||||||
record_copy.pop(feature)
|
|
||||||
if feature in representative:
|
|
||||||
representative.pop(feature)
|
|
||||||
if record_copy == representative:
|
|
||||||
# handle numerical features
|
|
||||||
for feature in [key for key in cell['ranges'].keys() if
|
|
||||||
'untouched' not in cell or key not in cell['untouched']]:
|
|
||||||
if feature not in ranges.keys():
|
|
||||||
ranges[feature] = []
|
|
||||||
if cell['ranges'][feature]['start'] is not None:
|
|
||||||
ranges[feature].append(cell['ranges'][feature]['start'])
|
|
||||||
if cell['ranges'][feature]['end'] is not None:
|
|
||||||
ranges[feature].append(cell['ranges'][feature]['end'])
|
|
||||||
if feature in range_counts:
|
|
||||||
range_counts[feature].append(representative_counts[index])
|
|
||||||
else:
|
|
||||||
range_counts[feature] = [representative_counts[index]]
|
|
||||||
# handle categorical features
|
|
||||||
categorical_features_values = {}
|
|
||||||
for feature in [key for key in cell['categories'].keys() if
|
|
||||||
'untouched' not in cell or key not in cell['untouched']]:
|
|
||||||
if feature not in categorical_features_values.keys():
|
|
||||||
categorical_features_values[feature] = []
|
|
||||||
for value in cell['categories'][feature]:
|
|
||||||
if value not in categorical_features_values[feature]:
|
|
||||||
categorical_features_values[feature].append(value)
|
|
||||||
for feature in categorical_features_values.keys():
|
|
||||||
partitions = []
|
|
||||||
values = categorical_features_values[feature]
|
|
||||||
assigned = []
|
|
||||||
for i in range(len(values)):
|
|
||||||
value1 = values[i]
|
|
||||||
if value1 in assigned:
|
|
||||||
continue
|
|
||||||
partition = [value1]
|
|
||||||
assigned.append(value1)
|
|
||||||
for j in range(len(values)):
|
|
||||||
if j <= i:
|
|
||||||
continue
|
|
||||||
value2 = values[j]
|
|
||||||
if GeneralizeToRepresentative._are_inseparable(self.cells, feature, value1, value2):
|
|
||||||
partition.append(value2)
|
|
||||||
assigned.append(value2)
|
|
||||||
partitions.append(partition)
|
|
||||||
if feature in categories:
|
|
||||||
categories[feature].append(partitions)
|
|
||||||
else:
|
|
||||||
categories[feature] = [partitions]
|
|
||||||
if feature in category_counts:
|
|
||||||
category_counts[feature].append(representative_counts[index])
|
|
||||||
else:
|
|
||||||
category_counts[feature] = [representative_counts[index]]
|
|
||||||
break
|
|
||||||
index += 1
|
|
||||||
|
|
||||||
for feature in ranges.keys():
|
|
||||||
ranges[feature] = list(set(ranges[feature]))
|
|
||||||
ranges[feature].sort()
|
|
||||||
|
|
||||||
self._transformed_generalizations = {
|
|
||||||
'ranges': ranges,
|
|
||||||
'categories': categories,
|
|
||||||
'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)}
|
|
||||||
self._remove_categorical_untouched(self._transformed_generalizations)
|
|
||||||
return range_counts, category_counts
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def _find_range_counts(self, samples, ranges):
|
def _find_range_counts(self, samples, ranges):
|
||||||
range_counts = {}
|
range_counts = {}
|
||||||
last_value = None
|
last_value = None
|
||||||
|
|
@ -1050,10 +1008,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
counter = [item for item in samples[r] if int(item) <= value]
|
counter = [item for item in samples[r] if int(item) <= value]
|
||||||
range_counts[r].append(len(counter))
|
range_counts[r].append(len(counter))
|
||||||
last_value = value
|
last_value = value
|
||||||
counter = [item for item in samples[r] if int(item) <= last_value]
|
counter = [item for item in samples[r] if int(item) > last_value]
|
||||||
range_counts[r].append(len(counter))
|
range_counts[r].append(len(counter))
|
||||||
return range_counts
|
return range_counts
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def _find_categories_counts(self, samples, categories):
|
def _find_categories_counts(self, samples, categories):
|
||||||
category_counts = {}
|
category_counts = {}
|
||||||
for c in categories.keys():
|
for c in categories.keys():
|
||||||
|
|
@ -1159,12 +1118,4 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
||||||
to_remove.append(feature)
|
to_remove.append(feature)
|
||||||
|
|
||||||
for feature in to_remove:
|
for feature in to_remove:
|
||||||
del generalizations['categories'][feature]
|
del generalizations['categories'][feature]
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def pandas_record_to_dict(record):
|
|
||||||
dict = {}
|
|
||||||
for feature in record.index:
|
|
||||||
dict[feature] = record[feature]
|
|
||||||
return dict
|
|
||||||
|
|
@ -164,6 +164,138 @@ def test_minimizer_fit(data):
|
||||||
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05)
|
||||||
|
|
||||||
|
|
||||||
|
def test_minimizer_ncp(data):
    """
    Check how the stored ``ncp`` value changes after ``fit``, ``calculate_ncp`` and ``transform`` on numeric data.
    """
    features = ['age', 'height']
    train_x = np.array([[23, 165],
                        [45, 158],
                        [56, 123],
                        [67, 154],
                        [45, 149],
                        [42, 166],
                        [73, 172],
                        [94, 168],
                        [69, 175],
                        [24, 181],
                        [18, 190]])
    train_y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    other_x = np.array([[33, 165],
                        [43, 150],
                        [71, 143],
                        [92, 194],
                        [13, 125],
                        [22, 169]])

    tree = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1)
    model = SklearnClassifier(tree, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(train_x, train_y))

    train_samples = ArrayDataset(train_x)
    other_samples = ArrayDataset(other_x, features_names=features)

    predictions = model.predict(train_samples)
    if predictions.shape[1] > 1:
        # collapse per-class outputs into a single predicted label per record
        predictions = np.argmax(predictions, axis=1)
    target_accuracy = 0.4
    train_dataset = ArrayDataset(train_x, predictions, features_names=features)

    # minimizer with generalize_using_transform disabled
    no_transform_gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
                                                  generalize_using_transform=False)
    no_transform_gen.fit(dataset=train_dataset)
    ncp_fit_no_transform = no_transform_gen.ncp
    no_transform_gen.calculate_ncp(other_samples)
    ncp_other_no_transform = no_transform_gen.ncp

    # minimizer with generalize_using_transform left at its default
    transform_gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy)
    transform_gen.fit(dataset=train_dataset)
    ncp_fit = transform_gen.ncp
    transform_gen.transform(dataset=other_samples)
    ncp_other_first = transform_gen.ncp
    transform_gen.transform(dataset=train_samples)
    ncp_train = transform_gen.ncp
    transform_gen.transform(dataset=other_samples)
    ncp_other_second = transform_gen.ncp

    assert ncp_fit_no_transform <= ncp_fit
    assert ncp_other_no_transform != ncp_fit
    assert ncp_fit != ncp_other_first
    assert ncp_other_first != ncp_train
    # transforming the same dataset again must give the same score
    assert ncp_other_second == ncp_other_first
|
||||||
|
|
||||||
|
|
||||||
|
def test_minimizer_ncp_categorical(data):
    """
    Check how the stored ``ncp`` value changes after ``fit``, ``calculate_ncp`` and ``transform`` when the data
    mixes numeric and categorical features.
    """
    features = ['age', 'height', 'sex', 'ola']
    numeric_features = ["age", "height"]
    categorical_features = ["sex", "ola"]

    train_x = pd.DataFrame([[23, 165, 'f', 'aa'],
                            [45, 158, 'f', 'aa'],
                            [56, 123, 'f', 'bb'],
                            [67, 154, 'm', 'aa'],
                            [45, 149, 'f', 'bb'],
                            [42, 166, 'm', 'bb'],
                            [73, 172, 'm', 'bb'],
                            [94, 168, 'f', 'aa'],
                            [69, 175, 'm', 'aa'],
                            [24, 181, 'm', 'bb'],
                            [18, 190, 'm', 'bb']],
                           columns=features)
    train_y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    other_x = pd.DataFrame([[33, 165, 'f', 'aa'],
                            [43, 150, 'm', 'aa'],
                            [71, 143, 'f', 'aa'],
                            [92, 194, 'm', 'aa'],
                            [13, 125, 'f', 'aa'],
                            [22, 169, 'f', 'bb']],
                           columns=features)

    # the model is trained on an encoded copy of the data: imputed numeric columns + one-hot categorical columns
    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    encoded = pd.DataFrame(preprocessor.fit_transform(train_x))

    tree = DecisionTreeClassifier(random_state=0, min_samples_split=2, min_samples_leaf=1)
    model = SklearnClassifier(tree, ModelOutputType.CLASSIFIER_PROBABILITIES)
    model.fit(ArrayDataset(encoded, train_y))

    train_samples = ArrayDataset(train_x)
    other_samples = ArrayDataset(other_x)

    predictions = model.predict(ArrayDataset(encoded))
    if predictions.shape[1] > 1:
        # collapse per-class outputs into a single predicted label per record
        predictions = np.argmax(predictions, axis=1)
    target_accuracy = 0.4
    train_dataset = ArrayDataset(train_x, predictions, features_names=features)

    # minimizer with generalize_using_transform disabled
    no_transform_gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
                                                  generalize_using_transform=False,
                                                  categorical_features=categorical_features)
    no_transform_gen.fit(dataset=train_dataset)
    ncp_fit_no_transform = no_transform_gen.ncp
    no_transform_gen.calculate_ncp(other_samples)
    ncp_other_no_transform = no_transform_gen.ncp

    # minimizer with generalize_using_transform left at its default
    transform_gen = GeneralizeToRepresentative(model, target_accuracy=target_accuracy,
                                               categorical_features=categorical_features)
    transform_gen.fit(dataset=train_dataset)
    ncp_fit = transform_gen.ncp
    transform_gen.transform(dataset=other_samples)
    ncp_other_first = transform_gen.ncp
    transform_gen.transform(dataset=train_samples)
    ncp_train = transform_gen.ncp
    transform_gen.transform(dataset=other_samples)
    ncp_other_second = transform_gen.ncp

    assert ncp_fit_no_transform <= ncp_fit
    assert ncp_other_no_transform != ncp_fit
    assert ncp_fit != ncp_other_first
    assert ncp_other_first != ncp_train
    # transforming the same dataset again must give the same score
    assert ncp_other_second == ncp_other_first
|
||||||
|
|
||||||
|
|
||||||
def test_minimizer_fit_not_transform(data):
|
def test_minimizer_fit_not_transform(data):
|
||||||
features = ['age', 'height']
|
features = ['age', 'height']
|
||||||
X = np.array([[23, 165],
|
X = np.array([[23, 165],
|
||||||
|
|
@ -1099,5 +1231,9 @@ def test_errors():
|
||||||
gen = GeneralizeToRepresentative(model, generalize_using_transform=False)
|
gen = GeneralizeToRepresentative(model, generalize_using_transform=False)
|
||||||
train_dataset = ArrayDataset(X, predictions, features_names=features)
|
train_dataset = ArrayDataset(X, predictions, features_names=features)
|
||||||
gen.fit(dataset=train_dataset)
|
gen.fit(dataset=train_dataset)
|
||||||
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
gen.transform(X)
|
gen.transform(X)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
gen.calculate_ncp(ad)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue