diff --git a/apt/minimization/minimizer.py b/apt/minimization/minimizer.py index 5508c34..928e7a9 100644 --- a/apt/minimization/minimizer.py +++ b/apt/minimization/minimizer.py @@ -178,7 +178,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None): """ - Learns the generalizations based on training data, and applies them to the data. + Learns the generalizations based on training data, and applies them to the data. Updates stored ncp value to the + one computed on the training data. :param X: The training input samples. :type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional @@ -383,7 +384,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None, dataset: Optional[ArrayDataset] = None): - """ Transforms data records to representative points. + """ Transforms data records to representative points. Updates stored ncp value to the one computed on the + transformed data. :param X: The training input samples. :type X: {array-like, sparse matrix}, shape (n_samples, n_features), optional @@ -407,7 +409,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM def calculate_ncp(self, samples: Optional[ArrayDataset] = None, transformed: Optional[bool] = False): """ Compute the NCP score of the generalization. Calculation is based on the value of the - generalize_using_transform param. + generalize_using_transform param. If samples are provided, updates stored ncp value to the one computed on the + provided data. If samples not provided, returns the last NCP score computed by the `fit` or `transform` method. 
+ Based on the NCP score presented in: Ghinita, G., Karras, P., Kalnis, P., Mamoulis, N.: Fast data anonymization with low information loss (https://www.vldb.org/conf/2007/papers/research/p758-ghinita.pdf) @@ -423,13 +427,17 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM if samples is None: return self._ncp + if not samples.features_names: + raise ValueError('features_names should be set in input ArrayDataset.') samples_pd = pd.DataFrame(samples.get_samples(), columns=samples.features_names) if self._features is None: self._features = samples.features_names if self._feature_data is None: self._feature_data = self._get_feature_data(samples_pd) + total_samples = samples_pd.shape[0] if self.generalize_using_transform: + # TODO: not sure I need to transform, data should be mapped to correct cell both with or without transforming if not transformed: # transform data transformed_data = self._inner_transform(dataset=samples) # can return numpy or pandas @@ -437,35 +445,28 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM transformed_data = pd.DataFrame(transformed_data, columns=samples.features_names) else: transformed_data = samples_pd - range_counts, category_counts = self._calculate_transformed_generalizations(transformed_data) - generalizations = self._transformed_generalizations + generalizations = self._calculate_transformed_generalizations(transformed_data) + # count how many transformed values are mapped to each cell + counted = np.zeros(transformed_data.shape[0]) # to mark records we already counted + ncp = 0 + for i in range(len(self.cells)): + cell = self.cells[i] + count = self._get_record_count_for_cell(transformed_data, cell, counted) + range_counts = {} + category_counts = {} + for feature in cell['ranges']: + range_counts[feature] = [count] + for feature in cell['categories']: + category_counts[feature] = [count] + ncp += self._calc_ncp_for_generalization(generalizations[cell['id']], 
range_counts, category_counts, + total_samples) + self._ncp = ncp else: # use generalizations generalizations = self.generalizations range_counts = self._find_range_counts(samples_pd, generalizations['ranges']) category_counts = self._find_categories_counts(samples_pd, generalizations['categories']) + self._ncp = self._calc_ncp_for_generalization(generalizations, range_counts, category_counts, total_samples) - # suppressed features are already taken care of within _calc_ncp_numeric - #TODO: check that this is the case for tramsformed as well - ranges = generalizations['ranges'] - categories = generalizations['categories'] - - total = samples_pd.shape[0] - total_ncp = 0 - total_features = len(generalizations['untouched']) - for feature in ranges.keys(): - feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature], - self._feature_data[feature], total) - total_ncp = total_ncp + feature_ncp - total_features += 1 - for feature in categories.keys(): - feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature], - self._feature_data[feature], - total) - total_ncp = total_ncp + feature_ncp - total_features += 1 - if total_features == 0: - return 0 - self._ncp = total_ncp / total_features return self._ncp def _inner_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional[list] = None, @@ -518,6 +519,28 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM return generalized return generalized.to_numpy() + def _calc_ncp_for_generalization(self, generalization, range_counts, category_counts, total_count): + total_ncp = 0 + total_features = len(generalization['untouched']) + ranges = generalization['ranges'] + categories = generalization['categories'] + + # suppressed features are already taken care of within _calc_ncp_numeric + for feature in ranges.keys(): + feature_ncp = self._calc_ncp_numeric(ranges[feature], range_counts[feature], + self._feature_data[feature], 
total_count) + total_ncp = total_ncp + feature_ncp + total_features += 1 + for feature in categories.keys(): + feature_ncp = self._calc_ncp_categorical(categories[feature], category_counts[feature], + self._feature_data[feature], + total_count) + total_ncp = total_ncp + feature_ncp + total_features += 1 + if total_features == 0: + return 0 + return total_ncp / total_features + @staticmethod def _calc_ncp_categorical(categories, category_count, feature_data, total): category_sizes = [len(g) if len(g) > 1 else 0 for g in categories] @@ -538,7 +561,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM average_range_size = sum(normalized_range_sizes) / len(normalized_range_sizes) return average_range_size / (feature_data['max'] - feature_data['min']) - def _get_feature_data(self, x): feature_data = {} for feature in self._features: @@ -561,6 +583,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM indexes.append(index) return indexes + def _get_record_count_for_cell(self, X, cell, mapped): + count = 0 + for index, row in X.iterrows(): + if not mapped.item(index) and self._cell_contains(cell, row, index, mapped): + count += 1 + return count + def _cell_contains(self, cell, x, index, mapped): for f in self._features: i = self._features.index(f) @@ -880,7 +909,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM current_accuracy) if feature is None: return None - GeneralizeToRepresentative._remove_feature_from_cells(self.cells, self._cells_by_id, feature) + self._remove_feature_from_cells(self.cells, self._cells_by_id, feature) return feature def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy): @@ -946,97 +975,26 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM return remove_feature def _calculate_generalizations(self): - self._generalizations = {'ranges': 
GeneralizeToRepresentative._calculate_ranges(self.cells), - 'categories': GeneralizeToRepresentative._calculate_categories(self.cells), - 'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)} + self._generalizations = {'ranges': self._calculate_ranges(self.cells), + 'categories': self._calculate_categories(self.cells), + 'untouched': self._calculate_untouched(self.cells)} self._remove_categorical_untouched(self._generalizations) + def _calculate_generalizations_per_cell(self, cell): + generalizations = {'ranges': self._calculate_ranges([cell]), + 'categories': self._calculate_categories([cell]), + 'untouched': self._calculate_untouched([cell])} + self._remove_categorical_untouched(generalizations) + return generalizations + def _calculate_transformed_generalizations(self, transformed): - # transformed data should only consist of representative values from cells (when removing untouched features) - ranges = {} - categories = {} - range_counts = {} - category_counts = {} - - unique_records = transformed.value_counts().reset_index(name='count') - representatives = unique_records.drop('count', axis=1) - representative_counts = unique_records['count'] # needed to normalize ncp according to quantity - index = 0 - for _, record in representatives.iterrows(): - # TODO: what if some cells are not present, we will not take their generalizations into account. We need to - # "gain" ncp in this case... 
- record_dict = self.pandas_record_to_dict(record) - for cell in self.cells: - representative = cell["representative"].copy() - record_copy = record_dict.copy() - if 'untouched' in cell: - for feature in cell['untouched']: - record_copy.pop(feature) - if feature in representative: - representative.pop(feature) - if record_copy == representative: - # handle numerical features - for feature in [key for key in cell['ranges'].keys() if - 'untouched' not in cell or key not in cell['untouched']]: - if feature not in ranges.keys(): - ranges[feature] = [] - if cell['ranges'][feature]['start'] is not None: - ranges[feature].append(cell['ranges'][feature]['start']) - if cell['ranges'][feature]['end'] is not None: - ranges[feature].append(cell['ranges'][feature]['end']) - if feature in range_counts: - range_counts[feature].append(representative_counts[index]) - else: - range_counts[feature] = [representative_counts[index]] - # handle categorical features - categorical_features_values = {} - for feature in [key for key in cell['categories'].keys() if - 'untouched' not in cell or key not in cell['untouched']]: - if feature not in categorical_features_values.keys(): - categorical_features_values[feature] = [] - for value in cell['categories'][feature]: - if value not in categorical_features_values[feature]: - categorical_features_values[feature].append(value) - for feature in categorical_features_values.keys(): - partitions = [] - values = categorical_features_values[feature] - assigned = [] - for i in range(len(values)): - value1 = values[i] - if value1 in assigned: - continue - partition = [value1] - assigned.append(value1) - for j in range(len(values)): - if j <= i: - continue - value2 = values[j] - if GeneralizeToRepresentative._are_inseparable(self.cells, feature, value1, value2): - partition.append(value2) - assigned.append(value2) - partitions.append(partition) - if feature in categories: - categories[feature].append(partitions) - else: - categories[feature] = 
[partitions] - if feature in category_counts: - category_counts[feature].append(representative_counts[index]) - else: - category_counts[feature] = [representative_counts[index]] - break - index += 1 - - for feature in ranges.keys(): - ranges[feature] = list(set(ranges[feature])) - ranges[feature].sort() - - self._transformed_generalizations = { - 'ranges': ranges, - 'categories': categories, - 'untouched': GeneralizeToRepresentative._calculate_untouched(self.cells)} - self._remove_categorical_untouched(self._transformed_generalizations) - return range_counts, category_counts + # calculate generalizations separately per cell + cell_generalizations = {} + for cell in self.cells: + cell_generalizations[cell['id']] = self._calculate_generalizations_per_cell(cell) + return cell_generalizations + @staticmethod - def _find_range_counts(self, samples, ranges): + def _find_range_counts(samples, ranges): range_counts = {} last_value = None @@ -1050,10 +1008,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM counter = [item for item in samples[r] if int(item) <= value] range_counts[r].append(len(counter)) last_value = value - counter = [item for item in samples[r] if int(item) <= last_value] + counter = [item for item in samples[r] if int(item) > last_value] range_counts[r].append(len(counter)) return range_counts + @staticmethod - def _find_categories_counts(self, samples, categories): + def _find_categories_counts(samples, categories): category_counts = {} for c in categories.keys(): @@ -1159,12 +1118,4 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM to_remove.append(feature) for feature in to_remove: - del generalizations['categories'][feature] - - - @staticmethod - def pandas_record_to_dict(record): - dict = {} - for feature in record.index: - dict[feature] = record[feature] - return dict \ No newline at end of file + del generalizations['categories'][feature] \ No newline at end of file diff --git a/tests/test_minimizer.py b/tests/test_minimizer.py index 3b2f543..c6fe1d4 100644 --- 
a/tests/test_minimizer.py +++ b/tests/test_minimizer.py @@ -164,6 +164,138 @@ def test_minimizer_fit(data): assert ((rel_accuracy >= target_accuracy) or (target_accuracy - rel_accuracy) <= 0.05) +def test_minimizer_ncp(data): + features = ['age', 'height'] + X = np.array([[23, 165], + [45, 158], + [56, 123], + [67, 154], + [45, 149], + [42, 166], + [73, 172], + [94, 168], + [69, 175], + [24, 181], + [18, 190]]) + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + X1 = np.array([[33, 165], + [43, 150], + [71, 143], + [92, 194], + [13, 125], + [22, 169]]) + + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) + model.fit(ArrayDataset(X, y)) + ad = ArrayDataset(X) + ad1 = ArrayDataset(X1, features_names=features) + predictions = model.predict(ad) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) + target_accuracy = 0.4 + train_dataset = ArrayDataset(X, predictions, features_names=features) + + gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, generalize_using_transform=False) + gen1.fit(dataset=train_dataset) + ncp1 = gen1.ncp + gen1.calculate_ncp(ad1) + ncp2 = gen1.ncp + + gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy) + gen2.fit(dataset=train_dataset) + ncp3 = gen2.ncp + gen2.transform(dataset=ad1) + ncp4 = gen2.ncp + gen2.transform(dataset=ad) + ncp5 = gen2.ncp + gen2.transform(dataset=ad1) + ncp6 = gen2.ncp + + assert(ncp1 <= ncp3) + assert(ncp2 != ncp3) + assert(ncp3 != ncp4) + assert(ncp4 != ncp5) + assert(ncp6 == ncp4) + + +def test_minimizer_ncp_categorical(data): + features = ['age', 'height', 'sex', 'ola'] + X = [[23, 165, 'f', 'aa'], + [45, 158, 'f', 'aa'], + [56, 123, 'f', 'bb'], + [67, 154, 'm', 'aa'], + [45, 149, 'f', 'bb'], + [42, 166, 'm', 'bb'], + [73, 172, 'm', 'bb'], + [94, 168, 'f', 'aa'], + [69, 175, 'm', 'aa'], + [24, 181, 'm', 'bb'], + [18, 190, 
'm', 'bb']] + X = pd.DataFrame(X, columns=features) + y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]) + X1 = [[33, 165, 'f', 'aa'], + [43, 150, 'm', 'aa'], + [71, 143, 'f', 'aa'], + [92, 194, 'm', 'aa'], + [13, 125, 'f', 'aa'], + [22, 169, 'f', 'bb']] + X1 = pd.DataFrame(X1, columns=features) + + numeric_features = ["age", "height"] + numeric_transformer = Pipeline( + steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] + ) + + categorical_features = ["sex", "ola"] + categorical_transformer = OneHotEncoder(handle_unknown="ignore") + + preprocessor = ColumnTransformer( + transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ] + ) + encoded = preprocessor.fit_transform(X) + encoded = pd.DataFrame(encoded) + + base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2, + min_samples_leaf=1) + model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES) + model.fit(ArrayDataset(encoded, y)) + ad = ArrayDataset(X) + ad1 = ArrayDataset(X1) + predictions = model.predict(ArrayDataset(encoded)) + if predictions.shape[1] > 1: + predictions = np.argmax(predictions, axis=1) + target_accuracy = 0.4 + train_dataset = ArrayDataset(X, predictions, features_names=features) + + gen1 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, generalize_using_transform=False, + categorical_features=categorical_features) + gen1.fit(dataset=train_dataset) + ncp1 = gen1.ncp + gen1.calculate_ncp(ad1) + ncp2 = gen1.ncp + + gen2 = GeneralizeToRepresentative(model, target_accuracy=target_accuracy, categorical_features=categorical_features) + gen2.fit(dataset=train_dataset) + ncp3 = gen2.ncp + gen2.transform(dataset=ad1) + ncp4 = gen2.ncp + gen2.transform(dataset=ad) + ncp5 = gen2.ncp + gen2.transform(dataset=ad1) + ncp6 = gen2.ncp + + assert(ncp1 <= ncp3) + assert(ncp2 != ncp3) + assert(ncp3 != ncp4) + assert(ncp4 != ncp5) + assert(ncp6 == ncp4) + + def 
test_minimizer_fit_not_transform(data): features = ['age', 'height'] X = np.array([[23, 165], @@ -1099,5 +1231,9 @@ def test_errors(): gen = GeneralizeToRepresentative(model, generalize_using_transform=False) train_dataset = ArrayDataset(X, predictions, features_names=features) gen.fit(dataset=train_dataset) + with pytest.raises(ValueError): gen.transform(X) + + with pytest.raises(ValueError): + gen.calculate_ncp(ad)