Merge pull request #15 from HRLDataSecurityAndPrivacy/dataset_wrapper_anonimizer

Dataset wrapper anonymizer
This commit is contained in:
Ola Saadi 2022-03-28 17:12:19 +03:00 committed by GitHub Enterprise
commit 8290be0173
7 changed files with 232 additions and 551 deletions

View file

@ -5,6 +5,7 @@ from collections import Counter
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
from typing import Union, Optional
@ -21,44 +22,60 @@ class Anonymize:
"""
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2.
:param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
in case of numpy data.
:param categorical_features: The list of categorical features (should only be supplied when passing data as a
pandas dataframe.
:param quasi_identifiers: The features that need to be minimized. It can be a list of feature names (strings) if
dataset.feature_names is set, otherwise a list of indexes (integers).
:param categorical_features: The list of categorical features. It can be a list of feature names (strings) if
dataset.feature_names is set, otherwise a list of indexes (integers).
:param is_regression: Boolean param indicates that it is a regression problem.
"""
if k < 2:
raise ValueError("k should be a positive integer with a value of 2 or higher")
if quasi_identifiers is None or len(quasi_identifiers) < 1:
raise ValueError("The list of quasi-identifiers cannot be empty")
self.k = k
self.quasi_identifiers = quasi_identifiers
self.categorical_features = categorical_features
self.is_regression = is_regression
self.features_names = None
def anonymize(self, x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame]) \
-> Union[np.ndarray, pd.DataFrame]:
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
"""
Method for performing model-guided anonymization.
:param x: The training data for the model. If provided as a pandas dataframe, may contain both numeric and
categorical data.
:param y: The predictions of the original model on the training data.
:param dataset: Data wrapper containing the training data for the model and the predictions of the
original model on the training data.
:return: An array containing the anonymized training dataset.
"""
if type(x) == np.ndarray:
return self._anonymize_ndarray(x.copy(), y)
else: # pandas
if not self.categorical_features:
raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
return self._anonymize_pandas(x.copy(), y)
if dataset.features_names is not None:
self.features_names = dataset.features_names
# if features is None, use numbers instead of names
elif dataset.get_samples().shape[1] != 0:
self.features_names = [i for i in range(dataset.get_samples().shape[1])]
else:
raise ValueError('No data provided')
if not set(self.quasi_identifiers).issubset(set(self.features_names)):
raise ValueError('Quasi identifiers should be a subset of the supplied features or indexes in range of '
'the data columns')
if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
raise ValueError('Categorical features should be a subset of the supplied features or indexes in range of '
'the data columns')
self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
if self.categorical_features:
self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]
def _anonymize_ndarray(self, x, y):
transformed = self._anonymize(dataset.get_samples().copy(), dataset.get_labels())
if dataset.is_pandas:
return pd.DataFrame(transformed, columns=self.features_names)
else:
return transformed
def _anonymize(self, x, y):
if x.shape[0] != y.shape[0]:
raise ValueError("x and y should have same number of rows")
x_anonymizer_train = x[:, self.quasi_identifiers]
if x.dtype.kind not in 'iufc':
if not self.categorical_features:
raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
x_prepared = self._modify_categorical_features(x_anonymizer_train)
else:
x_prepared = x_anonymizer_train
@ -66,23 +83,10 @@ class Anonymize:
self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
else:
self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
self.anonymizer.fit(x_prepared, y)
cells_by_id = self._calculate_cells(x, x_prepared)
return self._anonymize_data_numpy(x, x_prepared, cells_by_id)
def _anonymize_pandas(self, x, y):
if x.shape[0] != y.shape[0]:
raise ValueError("x and y should have same number of rows")
x_anonymizer_train = x.loc[:, self.quasi_identifiers]
# need to one-hot encode before training the decision tree
x_prepared = self._modify_categorical_features(x_anonymizer_train)
if self.is_regression:
self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
else:
self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
self.anonymizer.fit(x_prepared, y)
cells_by_id = self._calculate_cells(x, x_prepared)
return self._anonymize_data_pandas(x, x_prepared, cells_by_id)
return self._anonymize_data(x, x_prepared, cells_by_id)
def _calculate_cells(self, x, x_anonymizer_train):
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
@ -109,15 +113,9 @@ class Anonymize:
# get all rows in cell
indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
# TODO: should we filter only those with majority label? (using hist)
if type(x) == np.ndarray:
rows = x[indexes]
else: # pandas
rows = x.iloc[indexes]
rows = x[indexes]
for feature in self.quasi_identifiers:
if type(x) == np.ndarray:
values = rows[:, feature]
else: # pandas
values = rows.loc[:, feature]
values = rows[:, feature]
if self.categorical_features and feature in self.categorical_features:
# find most common value
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
@ -142,7 +140,7 @@ class Anonymize:
node_ids = self._find_sample_nodes(samples)
return [cells_by_id[node_id] for node_id in node_ids]
def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id):
def _anonymize_data(self, x, x_anonymizer_train, cells_by_id):
cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
index = 0
for row in x:
@ -152,16 +150,6 @@ class Anonymize:
row[feature] = cell['representative'][feature]
return x
def _anonymize_data_pandas(self, x, x_anonymizer_train, cells_by_id):
cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
index = 0
for i, row in x.iterrows():
cell = cells[index]
index += 1
for feature in cell['representative']:
x.at[i, feature] = cell['representative'][feature]
return x
def _modify_categorical_features(self, x):
encoder = OneHotEncoder()
one_hot_encoded = encoder.fit_transform(x)

View file

@ -273,7 +273,7 @@ def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_socia
raise Exception("Bad label value: %s" % value)
data["label"] = data["label"].apply(modify_label)
data["children"] = data["children"].apply(lambda x: 4 if x == "more" else x)
data["children"] = data["children"].apply(lambda x: "4" if x == "more" else x)
if transform_social:

View file

@ -24,13 +24,15 @@ OUTPUT_DATA_ARRAY_TYPE = np.ndarray
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
def array2numpy(arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
def array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
"""
converts from INPUT_DATA_ARRAY_TYPE to numpy array
"""
if type(arr) == np.ndarray:
return arr
if type(arr) == pd.DataFrame:
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
self.is_pandas = True
return arr.to_numpy()
if isinstance(arr, list):
return np.array(arr)
@ -40,13 +42,14 @@ def array2numpy(arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
raise ValueError('Non supported type: ', type(arr).__name__)
def array2torch_tensor(arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
def array2torch_tensor(self, arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
"""
converts from INPUT_DATA_ARRAY_TYPE to torch tensor array
"""
if type(arr) == np.ndarray:
return torch.from_numpy(arr)
if type(arr) == pd.DataFrame:
if type(arr) == pd.DataFrame or type(arr) == pd.Series:
self.is_pandas = True
return torch.from_numpy(arr.to_numpy())
if isinstance(arr, list):
return torch.tensor(arr)
@ -111,7 +114,6 @@ class StoredDataset(Dataset):
if unzip:
StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)
@staticmethod
def extract_archive(zip_path: str, dest_path=None, remove_archive=False):
"""
@ -162,15 +164,23 @@ class StoredDataset(Dataset):
class ArrayDataset(Dataset):
"""Dataset that is based on x and y arrays (e.g., numpy/pandas/list...)"""
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs):
def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
features_names: Optional = None, **kwargs):
"""
ArrayDataset constructor.
:param x: collection of data samples
:param y: collection of labels (optional)
:param features_names: list of str, The feature names, in the order that they appear in the data (optional)
:param kwargs: dataset parameters
"""
self._x = array2numpy(x)
self._y = array2numpy(y) if y is not None else None
self.is_pandas = False
self.features_names = features_names
self._y = array2numpy(self, y) if y is not None else None
self._x = array2numpy(self, x)
if self.is_pandas:
if features_names and not np.array_equal(features_names, x.columns):
raise ValueError("The supplied features are not the same as in the data features")
self.features_names = x.columns
if y is not None and len(self._x) != len(self._y):
raise ValueError('Non equivalent lengths of x and y')
@ -193,12 +203,16 @@ class PytorchData(Dataset):
:param y: collection of labels (optional)
:param kwargs: dataset parameters
"""
self._x = array2torch_tensor(x)
self._y = array2torch_tensor(y) if y is not None else None
self.is_pandas = False
self._y = array2torch_tensor(self, y) if y is not None else None
self._x = array2torch_tensor(self, x)
if self.is_pandas:
self.features_names = x.columns
if y is not None and len(self._x) != len(self._y):
raise ValueError('Non equivalent lengths of x and y')
if self._y is not None:
self.__getitem__ = self.get_item
else:
@ -235,6 +249,7 @@ class DatasetFactory:
:param name: dataset name
:return:
"""
def inner_wrapper(wrapped_class: Dataset) -> Any:
if name in cls.registry:
logger.warning('Dataset %s already exists. Will replace it', name)

View file

@ -29,198 +29,15 @@
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>parents</th>\n",
" <th>has_nurs</th>\n",
" <th>form</th>\n",
" <th>children</th>\n",
" <th>housing</th>\n",
" <th>finance</th>\n",
" <th>social</th>\n",
" <th>health</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8450</th>\n",
" <td>pretentious</td>\n",
" <td>very_crit</td>\n",
" <td>foster</td>\n",
" <td>1</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12147</th>\n",
" <td>great_pret</td>\n",
" <td>very_crit</td>\n",
" <td>complete</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>inconv</td>\n",
" <td>1</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2780</th>\n",
" <td>usual</td>\n",
" <td>critical</td>\n",
" <td>complete</td>\n",
" <td>4</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11924</th>\n",
" <td>great_pret</td>\n",
" <td>critical</td>\n",
" <td>foster</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>usual</td>\n",
" <td>proper</td>\n",
" <td>complete</td>\n",
" <td>2</td>\n",
" <td>convenient</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5193</th>\n",
" <td>pretentious</td>\n",
" <td>less_proper</td>\n",
" <td>complete</td>\n",
" <td>1</td>\n",
" <td>convenient</td>\n",
" <td>inconv</td>\n",
" <td>0</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1375</th>\n",
" <td>usual</td>\n",
" <td>less_proper</td>\n",
" <td>incomplete</td>\n",
" <td>2</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>priority</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10318</th>\n",
" <td>great_pret</td>\n",
" <td>less_proper</td>\n",
" <td>foster</td>\n",
" <td>4</td>\n",
" <td>convenient</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>priority</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6396</th>\n",
" <td>pretentious</td>\n",
" <td>improper</td>\n",
" <td>completed</td>\n",
" <td>3</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>485</th>\n",
" <td>usual</td>\n",
" <td>proper</td>\n",
" <td>incomplete</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>inconv</td>\n",
" <td>1</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10366 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" parents has_nurs form children housing finance \\\n",
"8450 pretentious very_crit foster 1 less_conv convenient \n",
"12147 great_pret very_crit complete 1 critical inconv \n",
"2780 usual critical complete 4 less_conv convenient \n",
"11924 great_pret critical foster 1 critical convenient \n",
"59 usual proper complete 2 convenient convenient \n",
"... ... ... ... ... ... ... \n",
"5193 pretentious less_proper complete 1 convenient inconv \n",
"1375 usual less_proper incomplete 2 less_conv convenient \n",
"10318 great_pret less_proper foster 4 convenient convenient \n",
"6396 pretentious improper completed 3 less_conv convenient \n",
"485 usual proper incomplete 1 critical inconv \n",
"\n",
" social health \n",
"8450 1 not_recom \n",
"12147 1 recommended \n",
"2780 1 not_recom \n",
"11924 1 not_recom \n",
"59 0 not_recom \n",
"... ... ... \n",
"5193 0 recommended \n",
"1375 1 priority \n",
"10318 0 priority \n",
"6396 1 recommended \n",
"485 1 not_recom \n",
"\n",
"[10366 rows x 8 columns]"
]
"text/plain": " parents has_nurs form children housing finance \\\n8450 pretentious very_crit foster 1 less_conv convenient \n12147 great_pret very_crit complete 1 critical inconv \n2780 usual critical complete 4 less_conv convenient \n11924 great_pret critical foster 1 critical convenient \n59 usual proper complete 2 convenient convenient \n... ... ... ... ... ... ... \n5193 pretentious less_proper complete 1 convenient inconv \n1375 usual less_proper incomplete 2 less_conv convenient \n10318 great_pret less_proper foster 4 convenient convenient \n6396 pretentious improper completed 3 less_conv convenient \n485 usual proper incomplete 1 critical inconv \n\n social health \n8450 1 not_recom \n12147 1 recommended \n2780 1 not_recom \n11924 1 not_recom \n59 0 not_recom \n... ... ... \n5193 0 recommended \n1375 1 priority \n10318 0 priority \n6396 1 recommended \n485 1 not_recom \n\n[10366 rows x 8 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>parents</th>\n <th>has_nurs</th>\n <th>form</th>\n <th>children</th>\n <th>housing</th>\n <th>finance</th>\n <th>social</th>\n <th>health</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>8450</th>\n <td>pretentious</td>\n <td>very_crit</td>\n <td>foster</td>\n <td>1</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>12147</th>\n <td>great_pret</td>\n <td>very_crit</td>\n <td>complete</td>\n <td>1</td>\n <td>critical</td>\n <td>inconv</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>2780</th>\n <td>usual</td>\n <td>critical</td>\n <td>complete</td>\n <td>4</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>11924</th>\n <td>great_pret</td>\n <td>critical</td>\n <td>foster</td>\n <td>1</td>\n <td>critical</td>\n <td>convenient</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>59</th>\n <td>usual</td>\n <td>proper</td>\n <td>complete</td>\n <td>2</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5193</th>\n <td>pretentious</td>\n <td>less_proper</td>\n <td>complete</td>\n <td>1</td>\n <td>convenient</td>\n <td>inconv</td>\n <td>0</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>1375</th>\n <td>usual</td>\n <td>less_proper</td>\n <td>incomplete</td>\n <td>2</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>10318</th>\n <td>great_pret</td>\n <td>less_proper</td>\n 
<td>foster</td>\n <td>4</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>6396</th>\n <td>pretentious</td>\n <td>improper</td>\n <td>completed</td>\n <td>3</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>485</th>\n <td>usual</td>\n <td>proper</td>\n <td>incomplete</td>\n <td>1</td>\n <td>critical</td>\n <td>inconv</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n </tbody>\n</table>\n<p>10366 rows × 8 columns</p>\n</div>"
},
"execution_count": 61,
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@ -230,7 +47,7 @@
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"\n",
"from apt.utils import get_nursery_dataset\n",
"from apt.utils.dataset_utils import get_nursery_dataset\n",
"\n",
"(x_train, y_train), (x_test, y_test) = get_nursery_dataset(transform_social=True)\n",
"\n",
@ -246,7 +63,7 @@
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@ -263,9 +80,9 @@
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"x_train_str = x_train.astype(str)\n",
"train_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(x_train_str)\n",
"train_encoded = OneHotEncoder(sparse=False).fit_transform(x_train_str)\n",
"x_test_str = x_test.astype(str)\n",
"test_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(x_test_str)\n",
"test_encoded = OneHotEncoder(sparse=False).fit_transform(x_test_str)\n",
" \n",
"model = DecisionTreeClassifier()\n",
"model.fit(train_encoded, y_train)\n",
@ -287,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": 91,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@ -323,14 +140,14 @@
},
{
"cell_type": "code",
"execution_count": 96,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6430638626278217\n"
"1.0\n"
]
}
],
@ -361,14 +178,14 @@
},
{
"cell_type": "code",
"execution_count": 55,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6980513216284006\n"
"0.5122515917422342\n"
]
}
],
@ -408,224 +225,43 @@
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>parents</th>\n",
" <th>has_nurs</th>\n",
" <th>form</th>\n",
" <th>children</th>\n",
" <th>housing</th>\n",
" <th>finance</th>\n",
" <th>social</th>\n",
" <th>health</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8450</th>\n",
" <td>pretentious</td>\n",
" <td>very_crit</td>\n",
" <td>foster</td>\n",
" <td>1</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12147</th>\n",
" <td>great_pret</td>\n",
" <td>very_crit</td>\n",
" <td>complete</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>inconv</td>\n",
" <td>1</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2780</th>\n",
" <td>usual</td>\n",
" <td>critical</td>\n",
" <td>complete</td>\n",
" <td>4</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11924</th>\n",
" <td>great_pret</td>\n",
" <td>critical</td>\n",
" <td>foster</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>usual</td>\n",
" <td>proper</td>\n",
" <td>complete</td>\n",
" <td>2</td>\n",
" <td>convenient</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5193</th>\n",
" <td>pretentious</td>\n",
" <td>less_proper</td>\n",
" <td>complete</td>\n",
" <td>1</td>\n",
" <td>convenient</td>\n",
" <td>inconv</td>\n",
" <td>0</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1375</th>\n",
" <td>usual</td>\n",
" <td>less_proper</td>\n",
" <td>incomplete</td>\n",
" <td>2</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>priority</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10318</th>\n",
" <td>great_pret</td>\n",
" <td>less_proper</td>\n",
" <td>foster</td>\n",
" <td>4</td>\n",
" <td>convenient</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>priority</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6396</th>\n",
" <td>pretentious</td>\n",
" <td>improper</td>\n",
" <td>completed</td>\n",
" <td>3</td>\n",
" <td>less_conv</td>\n",
" <td>convenient</td>\n",
" <td>1</td>\n",
" <td>recommended</td>\n",
" </tr>\n",
" <tr>\n",
" <th>485</th>\n",
" <td>usual</td>\n",
" <td>proper</td>\n",
" <td>incomplete</td>\n",
" <td>1</td>\n",
" <td>critical</td>\n",
" <td>convenient</td>\n",
" <td>0</td>\n",
" <td>not_recom</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10366 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" parents has_nurs form children housing finance \\\n",
"8450 pretentious very_crit foster 1 less_conv convenient \n",
"12147 great_pret very_crit complete 1 critical inconv \n",
"2780 usual critical complete 4 less_conv convenient \n",
"11924 great_pret critical foster 1 critical convenient \n",
"59 usual proper complete 2 convenient convenient \n",
"... ... ... ... ... ... ... \n",
"5193 pretentious less_proper complete 1 convenient inconv \n",
"1375 usual less_proper incomplete 2 less_conv convenient \n",
"10318 great_pret less_proper foster 4 convenient convenient \n",
"6396 pretentious improper completed 3 less_conv convenient \n",
"485 usual proper incomplete 1 critical convenient \n",
"\n",
" social health \n",
"8450 0 not_recom \n",
"12147 1 recommended \n",
"2780 0 not_recom \n",
"11924 0 not_recom \n",
"59 0 not_recom \n",
"... ... ... \n",
"5193 0 recommended \n",
"1375 1 priority \n",
"10318 0 priority \n",
"6396 1 recommended \n",
"485 0 not_recom \n",
"\n",
"[10366 rows x 8 columns]"
]
"text/plain": " parents has_nurs form children housing finance \\\n0 pretentious very_crit foster 1 less_conv convenient \n1 great_pret very_crit complete 1 critical inconv \n2 usual critical complete 4 less_conv convenient \n3 great_pret critical foster 1 critical convenient \n4 usual proper complete 2 convenient convenient \n... ... ... ... ... ... ... \n10361 pretentious less_proper complete 1 convenient inconv \n10362 usual less_proper incomplete 2 less_conv convenient \n10363 great_pret less_proper foster 4 convenient convenient \n10364 pretentious improper completed 3 less_conv convenient \n10365 usual proper incomplete 1 critical convenient \n\n social health \n0 0 not_recom \n1 1 recommended \n2 0 not_recom \n3 0 not_recom \n4 0 not_recom \n... ... ... \n10361 0 recommended \n10362 1 priority \n10363 0 priority \n10364 1 recommended \n10365 0 not_recom \n\n[10366 rows x 8 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>parents</th>\n <th>has_nurs</th>\n <th>form</th>\n <th>children</th>\n <th>housing</th>\n <th>finance</th>\n <th>social</th>\n <th>health</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>pretentious</td>\n <td>very_crit</td>\n <td>foster</td>\n <td>1</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>1</th>\n <td>great_pret</td>\n <td>very_crit</td>\n <td>complete</td>\n <td>1</td>\n <td>critical</td>\n <td>inconv</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>2</th>\n <td>usual</td>\n <td>critical</td>\n <td>complete</td>\n <td>4</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>3</th>\n <td>great_pret</td>\n <td>critical</td>\n <td>foster</td>\n <td>1</td>\n <td>critical</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>4</th>\n <td>usual</td>\n <td>proper</td>\n <td>complete</td>\n <td>2</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>10361</th>\n <td>pretentious</td>\n <td>less_proper</td>\n <td>complete</td>\n <td>1</td>\n <td>convenient</td>\n <td>inconv</td>\n <td>0</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>10362</th>\n <td>usual</td>\n <td>less_proper</td>\n <td>incomplete</td>\n <td>2</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>10363</th>\n <td>great_pret</td>\n <td>less_proper</td>\n <td>foster</td>\n 
<td>4</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>10364</th>\n <td>pretentious</td>\n <td>improper</td>\n <td>completed</td>\n <td>3</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>10365</th>\n <td>usual</td>\n <td>proper</td>\n <td>incomplete</td>\n <td>1</td>\n <td>critical</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n </tbody>\n</table>\n<p>10366 rows × 8 columns</p>\n</div>"
},
"execution_count": 97,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from apt.utils.datasets import ArrayDataset\n",
"from apt.anonymization import Anonymize\n",
"\n",
"features = x_train.columns\n",
"QI = [\"finance\", \"social\", \"health\"]\n",
"categorical_features = [\"parents\", \"has_nurs\", \"form\", \"housing\", \"finance\", \"health\", 'children']\n",
"anonymizer = Anonymize(100, QI, categorical_features=categorical_features)\n",
"anon = anonymizer.anonymize(x_train, x_train_predictions)\n",
"anon"
"QI_indexes = [i for i, v in enumerate(features) if v in QI]\n",
"categorical_features_indexes = [i for i, v in enumerate(features) if v in categorical_features]\n",
"anonymizer = Anonymize(100, QI_indexes, categorical_features=categorical_features_indexes)\n",
"anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
"anon\n"
]
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7585"
]
"text/plain": "7585"
},
"execution_count": 64,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -637,16 +273,14 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5766"
]
"text/plain": "5766"
},
"execution_count": 65,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -665,7 +299,7 @@
},
{
"cell_type": "code",
"execution_count": 66,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@ -678,7 +312,7 @@
],
"source": [
"anon_str = anon.astype(str)\n",
"anon_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon_str)\n",
"anon_encoded = OneHotEncoder(sparse=False).fit_transform(anon_str)\n",
"\n",
"anon_model = DecisionTreeClassifier()\n",
"anon_model.fit(anon_encoded, y_train)\n",
@ -698,14 +332,14 @@
},
{
"cell_type": "code",
"execution_count": 98,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6471155701331275\n"
"1.0\n"
]
}
],
@ -734,14 +368,14 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6982442600810341\n"
"0.5245996527107852\n"
]
}
],
@ -765,15 +399,15 @@
},
{
"cell_type": "code",
"execution_count": 87,
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.33056202194878614, 0.2888695146759663)\n",
"(0.34112301200908796, 0.3054344667247893)\n"
"(0.49415432579890883, 0.48976438779451525)\n",
"(0.49415432579890883, 0.48976438779451525)\n"
]
}
],
@ -810,15 +444,15 @@
},
{
"cell_type": "code",
"execution_count": 88,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.6457357075913777, 0.2002324905550712)\n",
"(0.6472248353715898, 0.1999418773612322)\n"
"(1.0, 0.019204655674102813)\n",
"(0.9829787234042553, 0.04481086323957323)\n"
]
}
],
@ -849,26 +483,24 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"anonymizer2 = Anonymize(1000, QI, categorical_features=categorical_features)\n",
"anon2 = anonymizer2.anonymize(x_train, x_train_predictions)"
"anonymizer2 = Anonymize(1000, QI_indexes, categorical_features=categorical_features_indexes)\n",
"anon2 = anonymizer2.anonymize(ArrayDataset(x_train, x_train_predictions))"
]
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4226"
]
"text/plain": "4226"
},
"execution_count": 75,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@ -887,7 +519,7 @@
},
{
"cell_type": "code",
"execution_count": 104,
"execution_count": 16,
"metadata": {},
"outputs": [
{
@ -900,7 +532,7 @@
],
"source": [
"anon2_str = anon2.astype(str)\n",
"anon2_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon2_str)\n",
"anon2_encoded = OneHotEncoder(sparse=False).fit_transform(anon2_str)\n",
"\n",
"anon2_model = DecisionTreeClassifier()\n",
"anon2_model.fit(anon2_encoded, y_train)\n",
@ -920,14 +552,14 @@
},
{
"cell_type": "code",
"execution_count": 105,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6266640941539648\n"
"1.0\n"
]
}
],
@ -956,14 +588,14 @@
},
{
"cell_type": "code",
"execution_count": 106,
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.6944819602546788\n"
"0.515820953115956\n"
]
}
],
@ -980,17 +612,17 @@
},
{
"cell_type": "code",
"execution_count": 107,
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.35793357933579334, 0.17037470725995316)\n",
"(0.3360655737704918, 0.1680327868852459)\n",
"(0.6457357075913777, 0.2002324905550712)\n",
"(0.6327519379844961, 0.1897704155768672)\n"
"(0.49415432579890883, 0.48976438779451525)\n",
"(0.49415432579890883, 0.48976438779451525)\n",
"(1.0, 0.019204655674102813)\n",
"(1.0, 0.026382153249272552)\n"
]
}
],
@ -1023,27 +655,26 @@
},
{
"cell_type": "code",
"execution_count": 111,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"QI2 = [\"parents\", \"has_nurs\", \"form\", \"children\", \"housing\", \"finance\", \"social\", \"health\"]\n",
"anonymizer3 = Anonymize(100, QI2, categorical_features=categorical_features)\n",
"anon3 = anonymizer3.anonymize(x_train, x_train_predictions)"
"QI2_indexes = [i for i, v in enumerate(features) if v in QI2]\n",
"anonymizer3 = Anonymize(100, QI2_indexes, categorical_features=categorical_features_indexes)\n",
"anon3 = anonymizer3.anonymize(ArrayDataset(x_train, x_train_predictions))"
]
},
{
"cell_type": "code",
"execution_count": 112,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"39"
]
"text/plain": "39"
},
"execution_count": 112,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@ -1055,22 +686,22 @@
},
{
"cell_type": "code",
"execution_count": 113,
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Anonymized model accuracy: 0.7723765432098766\n",
"BB attack accuracy: 0.5792012348060969\n",
"WB attack accuracy: 0.6680493922438742\n"
"Anonymized model accuracy: 0.751929012345679\n",
"BB attack accuracy: 1.0\n",
"WB attack accuracy: 0.5187150299054601\n"
]
}
],
"source": [
"anon3_str = anon3.astype(str)\n",
"anon3_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon3_str)\n",
"anon3_encoded = OneHotEncoder(sparse=False).fit_transform(anon3_str)\n",
"\n",
"anon3_model = DecisionTreeClassifier()\n",
"anon3_model.fit(anon3_encoded, y_train)\n",
@ -1105,17 +736,17 @@
},
{
"cell_type": "code",
"execution_count": 114,
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.35793357933579334, 0.17037470725995316)\n",
"(0.3393939393939394, 0.13114754098360656)\n",
"(0.6457357075913777, 0.2002324905550712)\n",
"(1, 0.0)\n"
"(0.49415432579890883, 0.48976438779451525)\n",
"(0.49415432579890883, 0.48976438779451525)\n",
"(1.0, 0.019204655674102813)\n",
"(1.0, 0.032201745877788554)\n"
]
}
],
@ -1162,4 +793,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@ -44,6 +44,18 @@
" [ 26. 11. 0. 0. 48.]\n",
" [ 27. 9. 0. 0. 40.]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_85828/3975777015.py:22: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" y_train = y_train.astype(np.int)\n",
"/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_85828/3975777015.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" y_test = y_test.astype(np.int)\n"
]
}
],
"source": [
@ -90,14 +102,14 @@
},
{
"cell_type": "code",
"execution_count": 116,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.8075056814691972\n"
"Base model accuracy: 0.8074442601805786\n"
]
}
],
@ -126,9 +138,18 @@
},
{
"cell_type": "code",
"execution_count": 124,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n"
]
}
],
"source": [
"from art.attacks.inference.membership_inference import MembershipInferenceBlackBox\n",
"\n",
@ -154,14 +175,14 @@
},
{
"cell_type": "code",
"execution_count": 125,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5440363591696352\n"
"0.545264709495148\n"
]
}
],
@ -197,7 +218,7 @@
},
{
"cell_type": "code",
"execution_count": 128,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@ -215,6 +236,7 @@
}
],
"source": [
"from apt.utils.datasets import ArrayDataset\n",
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
@ -223,22 +245,20 @@
"# QI = (age, education-num, capital-gain, hours-per-week)\n",
"QI = [0, 1, 2, 4]\n",
"anonymizer = Anonymize(100, QI)\n",
"anon = anonymizer.anonymize(x_train, x_train_predictions)\n",
"anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
"print(anon)"
]
},
{
"cell_type": "code",
"execution_count": 104,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6739"
]
"text/plain": "6739"
},
"execution_count": 104,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@ -250,16 +270,14 @@
},
{
"cell_type": "code",
"execution_count": 129,
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"658"
]
"text/plain": "658"
},
"execution_count": 129,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@ -278,14 +296,14 @@
},
{
"cell_type": "code",
"execution_count": 130,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Anonymized model accuracy: 0.8304158221239482\n"
"Anonymized model accuracy: 0.83078434985566\n"
]
}
],
@ -308,14 +326,22 @@
},
{
"cell_type": "code",
"execution_count": 131,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5034393809114359\n"
"0.5047291487532244\n"
]
}
],
@ -345,15 +371,15 @@
},
{
"cell_type": "code",
"execution_count": 132,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.5298924372550654, 0.7806166318634075)\n",
"(0.5030507735890172, 0.5671293452892765)\n"
"(0.5312420517168291, 0.7696843139663432)\n",
"(0.5048372911169745, 0.4935511607910576)\n"
]
}
],
@ -419,4 +445,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View file

@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 121,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -50,7 +50,7 @@
},
{
"cell_type": "code",
"execution_count": 122,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@ -86,14 +86,14 @@
},
{
"cell_type": "code",
"execution_count": 123,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.4954954954954955\n"
"0.527027027027027\n"
]
}
],
@ -131,7 +131,7 @@
},
{
"cell_type": "code",
"execution_count": 124,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -141,6 +141,22 @@
"unique rows in original data: 221\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n"
]
},
{
"name": "stdout",
"output_type": "stream",
@ -148,11 +164,12 @@
"k values: [5, 10, 20, 50, 75]\n",
"unique rows: [34, 19, 8, 4, 2]\n",
"model accuracy: [0.43165832354998956, 0.4509641063206041, -1.730181929385853, -5.577098823982753e+27, -1.2751609045828272e+25]\n",
"attack accuracy: [0.5, 0.47297297297297297, 0.49549549549549543, 0.5, 0.47297297297297297]\n"
"attack accuracy: [0.509009009009009, 0.481981981981982, 0.509009009009009, 0.5045045045045045, 0.4954954954954955]\n"
]
}
],
"source": [
"from apt.utils.datasets import ArrayDataset\n",
"from apt.anonymization import Anonymize\n",
"k_values=[5, 10, 20, 50, 75]\n",
"model_accuracy = []\n",
@ -165,7 +182,7 @@
"\n",
"for k in k_values:\n",
" anonymizer = Anonymize(k, QI, is_regression=True)\n",
" anon = anonymizer.anonymize(X_train, x_train_predictions)\n",
" anon = anonymizer.anonymize(ArrayDataset(X_train, x_train_predictions))\n",
" unique_values.append(len(np.unique(anon, axis=0)))\n",
" \n",
" anon_model = LinearRegression()\n",
@ -198,7 +215,7 @@
},
{
"cell_type": "code",
"execution_count": 124,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": []

View file

@ -7,10 +7,12 @@ from apt.anonymization import Anonymize
from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
def test_anonymize_ndarray_iris():
(x_train, y_train), _ = get_iris_dataset()
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_train)
@ -18,7 +20,7 @@ def test_anonymize_ndarray_iris():
k = 10
QI = [0, 2]
anonymizer = Anonymize(k, QI)
anon = anonymizer.anonymize(x_train, pred)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
assert(len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
_, counts_elements = np.unique(anon[:, QI], return_counts=True)
assert (np.min(counts_elements) >= k)
@ -33,20 +35,22 @@ def test_anonymize_pandas_adult():
pred = model.predict(encoded)
k = 100
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country']
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country']
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
anon = anonymizer.anonymize(x_train, pred)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred, features))
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
assert (anon.loc[:, QI].value_counts().min() >= k)
assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
def test_anonymize_pandas_nursery():
(x_train, y_train), _ = get_nursery_dataset()
features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"]
x_train = x_train.astype(str)
encoded = OneHotEncoder().fit_transform(x_train)
model = DecisionTreeClassifier()
@ -57,11 +61,11 @@ def test_anonymize_pandas_nursery():
QI = ["finance", "social", "health"]
categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
anon = anonymizer.anonymize(x_train, pred)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
assert (anon.loc[:, QI].value_counts().min() >= k)
assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
def test_regression():
@ -75,7 +79,7 @@ def test_regression():
k = 10
QI = [0, 2, 5, 8]
anonymizer = Anonymize(k, QI, is_regression=True)
anon = anonymizer.anonymize(x_train, pred)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
model.fit(anon, y_train)
print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test))
@ -95,7 +99,7 @@ def test_errors():
anonymizer = Anonymize(10, [0, 2])
(x_train, y_train), (x_test, y_test) = get_iris_dataset()
with pytest.raises(ValueError):
anonymizer.anonymize(x_train, y_test)
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
(x_train, y_train), _ = get_adult_dataset()
with pytest.raises(ValueError):
anonymizer.anonymize(x_train, y_train)
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))