mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-24 20:36:21 +02:00
Data and Model wrappers (#26)
* Squashed commit of wrappers:
Wrapper minimizer
* apply dataset wrapper on minimizer
* apply changes on minimization notebook
* add black_box_access and unlimited_queries params
Dataset wrapper anonymizer
Add features_names to ArrayDataset
and allow providing features names in QI and Cat features not just indexes
update notebooks
categorical features and QI passed by indexes
dataset include feature names and is_pandas param
add pytorch Dataset
Remove redundant code.
Use data wrappers in model wrapper APIs.
add generic dataset components
Create initial version of wrappers for models
* Fix handling of categorical features
This commit is contained in:
parent
d53818644e
commit
2b2dab6bef
17 changed files with 1340 additions and 752 deletions
|
|
@ -8,6 +8,7 @@ from sklearn.impute import SimpleImputer
|
|||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
|
||||
|
||||
from typing import Union, Optional
|
||||
|
||||
|
|
@ -49,61 +50,64 @@ class Anonymize:
|
|||
self.categorical_features = categorical_features
|
||||
self.is_regression = is_regression
|
||||
self.train_only_QI = train_only_QI
|
||||
self.features_names = None
|
||||
self.features = None
|
||||
|
||||
def anonymize(self, x: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFrame]) \
|
||||
-> Union[np.ndarray, pd.DataFrame]:
|
||||
def anonymize(self, dataset: ArrayDataset) -> DATA_PANDAS_NUMPY_TYPE:
|
||||
"""
|
||||
Method for performing model-guided anonymization.
|
||||
|
||||
:param x: The training data for the model. If provided as a pandas dataframe, may contain both numeric and
|
||||
categorical data.
|
||||
:param y: The predictions of the original model on the training data.
|
||||
:param dataset: Data wrapper containing the training data for the model and the predictions of the
|
||||
original model on the training data.
|
||||
:return: An array containing the anonymized training dataset.
|
||||
"""
|
||||
if type(x) == np.ndarray:
|
||||
self.features = [i for i in range(x.shape[1])]
|
||||
return self._anonymize_ndarray(x.copy(), y)
|
||||
else: # pandas
|
||||
self.features = x.columns
|
||||
if not self.categorical_features:
|
||||
raise ValueError('When supplying a pandas dataframe, categorical_features must be defined')
|
||||
return self._anonymize_pandas(x.copy(), y)
|
||||
if dataset.get_samples().shape[1] != 0:
|
||||
self.features = [i for i in range(dataset.get_samples().shape[1])]
|
||||
else:
|
||||
raise ValueError('No data provided')
|
||||
|
||||
def _anonymize_ndarray(self, x, y):
|
||||
if dataset.features_names is not None:
|
||||
self.features_names = dataset.features_names
|
||||
else: # if no names provided, use numbers instead
|
||||
self.features_names = self.features
|
||||
|
||||
if not set(self.quasi_identifiers).issubset(set(self.features_names)):
|
||||
raise ValueError('Quasi identifiers should bs a subset of the supplied features or indexes in range of '
|
||||
'the data columns')
|
||||
if self.categorical_features and not set(self.categorical_features).issubset(set(self.features_names)):
|
||||
raise ValueError('Categorical features should bs a subset of the supplied features or indexes in range of '
|
||||
'the data columns')
|
||||
self.quasi_identifiers = [i for i, v in enumerate(self.features_names) if v in self.quasi_identifiers]
|
||||
if self.categorical_features:
|
||||
self.categorical_features = [i for i, v in enumerate(self.features_names) if v in self.categorical_features]
|
||||
|
||||
transformed = self._anonymize(dataset.get_samples().copy(), dataset.get_labels())
|
||||
if dataset.is_pandas:
|
||||
return pd.DataFrame(transformed, columns=self.features_names)
|
||||
else:
|
||||
return transformed
|
||||
|
||||
def _anonymize(self, x, y):
|
||||
if x.shape[0] != y.shape[0]:
|
||||
raise ValueError("x and y should have same number of rows")
|
||||
x_anonymizer_train = x
|
||||
if self.train_only_QI:
|
||||
# build DT just on QI features
|
||||
x_anonymizer_train = x[:, self.quasi_identifiers]
|
||||
if x.dtype.kind not in 'iufc':
|
||||
x_prepared = self._modify_categorical_features(x_anonymizer_train)
|
||||
if not self.categorical_features:
|
||||
raise ValueError('when supplying an array with non-numeric data, categorical_features must be defined')
|
||||
x_prepared = self._modify_categorical_features(x)
|
||||
else:
|
||||
x_prepared = x_anonymizer_train
|
||||
if self.is_regression:
|
||||
self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
|
||||
else:
|
||||
self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
|
||||
self.anonymizer.fit(x_prepared, y)
|
||||
cells_by_id = self._calculate_cells(x, x_prepared)
|
||||
return self._anonymize_data_numpy(x, x_prepared, cells_by_id)
|
||||
|
||||
def _anonymize_pandas(self, x, y):
|
||||
if x.shape[0] != y.shape[0]:
|
||||
raise ValueError("x and y should have same number of rows")
|
||||
x_anonymizer_train = x
|
||||
x_prepared = x
|
||||
x_anonymizer_train = x_prepared
|
||||
if self.train_only_QI:
|
||||
# build DT just on QI features
|
||||
x_anonymizer_train = x.loc[:, self.quasi_identifiers]
|
||||
# need to one-hot encode before training the decision tree
|
||||
x_prepared = self._modify_categorical_features(x_anonymizer_train)
|
||||
x_anonymizer_train = x_prepared[:, self.quasi_identifiers]
|
||||
if self.is_regression:
|
||||
self.anonymizer = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
|
||||
else:
|
||||
self.anonymizer = DecisionTreeClassifier(random_state=10, min_samples_split=2, min_samples_leaf=self.k)
|
||||
self.anonymizer.fit(x_prepared, y)
|
||||
cells_by_id = self._calculate_cells(x, x_prepared)
|
||||
return self._anonymize_data_pandas(x, x_prepared, cells_by_id)
|
||||
|
||||
self.anonymizer.fit(x_anonymizer_train, y)
|
||||
cells_by_id = self._calculate_cells(x, x_anonymizer_train)
|
||||
return self._anonymize_data(x, x_anonymizer_train, cells_by_id)
|
||||
|
||||
def _calculate_cells(self, x, x_anonymizer_train):
|
||||
# x is original data, x_anonymizer_train is only QIs + 1-hot encoded
|
||||
|
|
@ -130,15 +134,9 @@ class Anonymize:
|
|||
# get all rows in cell
|
||||
indexes = [index for index, node_id in enumerate(node_ids) if node_id == cell['id']]
|
||||
# TODO: should we filter only those with majority label? (using hist)
|
||||
if type(x) == np.ndarray:
|
||||
rows = x[indexes]
|
||||
else: # pandas
|
||||
rows = x.iloc[indexes]
|
||||
rows = x[indexes]
|
||||
for feature in self.quasi_identifiers:
|
||||
if type(x) == np.ndarray:
|
||||
values = rows[:, feature]
|
||||
else: # pandas
|
||||
values = rows.loc[:, feature]
|
||||
values = rows[:, feature]
|
||||
if self.categorical_features and feature in self.categorical_features:
|
||||
# find most common value
|
||||
cell['representative'][feature] = Counter(values).most_common(1)[0][0]
|
||||
|
|
@ -163,7 +161,7 @@ class Anonymize:
|
|||
node_ids = self._find_sample_nodes(samples)
|
||||
return [cells_by_id[node_id] for node_id in node_ids]
|
||||
|
||||
def _anonymize_data_numpy(self, x, x_anonymizer_train, cells_by_id):
|
||||
def _anonymize_data(self, x, x_anonymizer_train, cells_by_id):
|
||||
cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
|
||||
index = 0
|
||||
for row in x:
|
||||
|
|
@ -173,22 +171,12 @@ class Anonymize:
|
|||
row[feature] = cell['representative'][feature]
|
||||
return x
|
||||
|
||||
def _anonymize_data_pandas(self, x, x_anonymizer_train, cells_by_id):
|
||||
cells = self._find_sample_cells(x_anonymizer_train, cells_by_id)
|
||||
index = 0
|
||||
for i, row in x.iterrows():
|
||||
cell = cells[index]
|
||||
index += 1
|
||||
for feature in cell['representative']:
|
||||
x.at[i, feature] = cell['representative'][feature]
|
||||
return x
|
||||
|
||||
def _modify_categorical_features(self, x):
|
||||
# prepare data for DT
|
||||
used_features = self.features
|
||||
if self.train_only_QI:
|
||||
used_features = self.quasi_identifiers
|
||||
numeric_features = [f for f in x.columns if f in used_features and f not in self.categorical_features]
|
||||
numeric_features = [f for f in self.features if f in used_features and f not in self.categorical_features]
|
||||
categorical_features = [f for f in self.categorical_features if f in used_features]
|
||||
numeric_transformer = Pipeline(
|
||||
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
"""
|
||||
This module implements all classes needed to perform data minimization
|
||||
"""
|
||||
from typing import Union
|
||||
from typing import Union, Optional
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import copy
|
||||
|
|
@ -16,6 +16,9 @@ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
|||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from apt.utils.datasets import ArrayDataset, Data, DATA_PANDAS_NUMPY_TYPE
|
||||
from apt.utils.models import Model, SklearnRegressor, ModelOutputType, SklearnClassifier
|
||||
|
||||
|
||||
class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
|
||||
""" A transformer that generalizes data to representative points.
|
||||
|
|
@ -24,16 +27,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
and a target accuracy. Once the generalizations are learned, can
|
||||
receive one or more data records and transform them to representative
|
||||
points based on the learned generalization.
|
||||
|
||||
An alternative way to use the transformer is to supply ``cells`` and
|
||||
``features`` in init or set_params and those will be used to transform
|
||||
An alternative way to use the transformer is to supply ``cells`` in
|
||||
init or set_params and those will be used to transform
|
||||
data to representatives. In this case, fit must still be called but
|
||||
there is no need to supply it with ``X`` and ``y``, and there is no
|
||||
need to supply an existing ``estimator`` to init.
|
||||
|
||||
In summary, either ``estimator`` and ``target_accuracy`` should be
|
||||
supplied or ``cells`` and ``features`` should be supplied.
|
||||
|
||||
supplied or ``cells`` should be supplied.
|
||||
Parameters
|
||||
----------
|
||||
estimator : estimator, optional
|
||||
|
|
@ -43,8 +43,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
The required accuracy when applying the base model to the
|
||||
generalized data. Accuracy is measured relative to the original
|
||||
accuracy of the model.
|
||||
features : list of str, optional
|
||||
The feature names, in the order that they appear in the data.
|
||||
categorical_features: list of str, optional
|
||||
The list of categorical features should only be supplied when
|
||||
passing data as a pandas dataframe.
|
||||
|
|
@ -67,28 +65,29 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
Attributes
|
||||
----------
|
||||
features_ : list of str
|
||||
The feature names, in the order that they appear in the data.
|
||||
cells_ : list of object
|
||||
The cells used to generalize records, as learned when calling fit.
|
||||
|
||||
ncp_ : float
|
||||
The NCP (information loss) score of the resulting generalization,
|
||||
as measured on the training data.
|
||||
|
||||
generalizations_ : object
|
||||
The generalizations that were learned (actual feature ranges).
|
||||
|
||||
Notes
|
||||
-----
|
||||
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, estimator=None, target_accuracy=0.998, features=None,
|
||||
cells=None, categorical_features=None, features_to_minimize: Union[np.ndarray, list] = None
|
||||
, train_only_QI=True, is_regression=False):
|
||||
self.estimator = estimator
|
||||
def __init__(self, estimator: Union[BaseEstimator, Model] = None, target_accuracy: float = 0.998,
|
||||
cells: list = None, categorical_features: Union[np.ndarray, list] = None,
|
||||
features_to_minimize: Union[np.ndarray, list] = None, train_only_QI: bool = True,
|
||||
is_regression: bool = False):
|
||||
if issubclass(estimator.__class__, Model):
|
||||
self.estimator = estimator
|
||||
else:
|
||||
if is_regression:
|
||||
self.estimator = SklearnRegressor(estimator)
|
||||
else:
|
||||
self.estimator = SklearnClassifier(estimator, ModelOutputType.CLASSIFIER_VECTOR)
|
||||
self.target_accuracy = target_accuracy
|
||||
self.features = features
|
||||
self.cells = cells
|
||||
self.categorical_features = []
|
||||
if categorical_features:
|
||||
|
|
@ -114,11 +113,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
ret = {}
|
||||
ret['target_accuracy'] = self.target_accuracy
|
||||
if deep:
|
||||
ret['features'] = copy.deepcopy(self.features)
|
||||
ret['cells'] = copy.deepcopy(self.cells)
|
||||
ret['estimator'] = self.estimator
|
||||
else:
|
||||
ret['features'] = copy.copy(self.features)
|
||||
ret['cells'] = copy.copy(self.cells)
|
||||
return ret
|
||||
|
||||
|
|
@ -132,8 +129,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
"""
|
||||
if 'target_accuracy' in params:
|
||||
self.target_accuracy = params['target_accuracy']
|
||||
if 'features' in params:
|
||||
self.features = params['features']
|
||||
if 'cells' in params:
|
||||
self.cells = params['cells']
|
||||
return self
|
||||
|
|
@ -142,7 +137,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
def generalizations(self):
|
||||
return self.generalizations_
|
||||
|
||||
def fit_transform(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None):
|
||||
def fit_transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
|
||||
features_names: Optional = None, dataset: Optional[ArrayDataset] = None):
|
||||
"""Learns the generalizations based on training data, and applies them to the data.
|
||||
|
||||
Parameters
|
||||
|
|
@ -152,17 +148,22 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
y : array-like, shape (n_samples,), optional
|
||||
The target values. An array of int.
|
||||
This should contain the predictions of the original model on ``X``.
|
||||
|
||||
features_names : list of str, The feature names, in the order that they appear in the data,
|
||||
provided just if X and y were provided (optional).
|
||||
dataset : Data wrapper containing the training input samples and the predictions of the
|
||||
original model on the training data.
|
||||
Either X,y OR dataset need to be provided, not both.
|
||||
Returns
|
||||
-------
|
||||
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
|
||||
The array containing the representative values to which each record in
|
||||
``X`` is mapped.
|
||||
"""
|
||||
self.fit(X, y)
|
||||
return self.transform(X)
|
||||
self.fit(X, y, features_names, dataset=dataset)
|
||||
return self.transform(X, features_names, dataset=dataset)
|
||||
|
||||
def fit(self, X: Union[np.ndarray, pd.DataFrame] = None, y: Union[np.ndarray, pd.DataFrame] = None):
|
||||
def fit(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, y: Optional[DATA_PANDAS_NUMPY_TYPE] = None,
|
||||
features_names: Optional = None, dataset: ArrayDataset = None):
|
||||
"""Learns the generalizations based on training data.
|
||||
|
||||
Parameters
|
||||
|
|
@ -172,7 +173,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
y : array-like, shape (n_samples,), optional
|
||||
The target values. An array of int.
|
||||
This should contain the predictions of the original model on ``X``.
|
||||
|
||||
features_names : list of str, The feature names, in the order that they appear in the data,
|
||||
provided just if X and y were provided (optional).
|
||||
dataset : Data wrapper containing the training input samples and the predictions of the
|
||||
original model on the training data.
|
||||
Either X,y OR dataset need to be provided, not both.
|
||||
Returns
|
||||
-------
|
||||
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
|
||||
|
|
@ -181,26 +186,25 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
"""
|
||||
|
||||
# take into account that estimator, X, y, cells, features may be None
|
||||
if X is not None:
|
||||
if type(X) == np.ndarray:
|
||||
self.is_numpy = True
|
||||
else:
|
||||
self.is_numpy = False
|
||||
|
||||
if X is not None and y is not None:
|
||||
if self.is_numpy:
|
||||
X, y = check_X_y(X, y, accept_sparse=True)
|
||||
self.n_features_ = X.shape[1]
|
||||
elif self.features:
|
||||
self.n_features_ = len(self.features)
|
||||
if dataset is not None:
|
||||
raise ValueError('Either X,y OR dataset need to be provided, not both')
|
||||
else:
|
||||
dataset = ArrayDataset(X, y, features_names)
|
||||
|
||||
if dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
|
||||
self.n_features_ = dataset.get_samples().shape[1]
|
||||
|
||||
elif dataset and dataset.features_names:
|
||||
self.n_features_ = len(dataset.features_names)
|
||||
else:
|
||||
self.n_features_ = 0
|
||||
|
||||
if self.features:
|
||||
self._features = self.features
|
||||
if dataset and dataset.features_names:
|
||||
self._features = dataset.features_names
|
||||
# if features is None, use numbers instead of names
|
||||
elif self.n_features_ != 0:
|
||||
self._features = [i for i in range(self.n_features_)]
|
||||
self._features = [str(i) for i in range(self.n_features_)]
|
||||
else:
|
||||
self._features = None
|
||||
|
||||
|
|
@ -212,27 +216,24 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
# Going to fit
|
||||
# (currently not dealing with option to fit with only X and y and no estimator)
|
||||
if self.estimator and X is not None and y is not None:
|
||||
if self.estimator and dataset and dataset.get_samples() is not None and dataset.get_labels() is not None:
|
||||
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
|
||||
if not self.features_to_minimize:
|
||||
self.features_to_minimize = self._features
|
||||
self.features_to_minimize = [str(i) for i in self.features_to_minimize]
|
||||
if not all(elem in self._features for elem in self.features_to_minimize):
|
||||
raise ValueError('features to minimize should be a subset of features names')
|
||||
x_QI = x.loc[:, self.features_to_minimize]
|
||||
|
||||
if self.is_numpy:
|
||||
if not self.features_to_minimize:
|
||||
self.features_to_minimize = [i for i in range(len(self._features))]
|
||||
x_QI = X[:, self.features_to_minimize]
|
||||
self.features_to_minimize = [self._features[i] for i in self.features_to_minimize]
|
||||
X = pd.DataFrame(X, columns=self._features)
|
||||
else:
|
||||
if not self.features_to_minimize:
|
||||
self.features_to_minimize = self._features
|
||||
x_QI = X.loc[:, self.features_to_minimize]
|
||||
x_QI = pd.DataFrame(x_QI, columns=self.features_to_minimize)
|
||||
# divide dataset into train and test
|
||||
used_data = X
|
||||
used_data = x
|
||||
if self.train_only_QI:
|
||||
used_data = x_QI
|
||||
if self.is_regression:
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=14)
|
||||
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), test_size=0.4, random_state=14)
|
||||
else:
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.4, random_state=18)
|
||||
X_train, X_test, y_train, y_test = train_test_split(x, dataset.get_labels(), stratify=dataset.get_labels(), test_size=0.4,
|
||||
random_state=18)
|
||||
|
||||
X_train_QI = X_train.loc[:, self.features_to_minimize]
|
||||
X_test_QI = X_test.loc[:, self.features_to_minimize]
|
||||
|
|
@ -246,7 +247,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
for feature in self._features:
|
||||
if feature not in feature_data.keys():
|
||||
fd = {}
|
||||
values = list(X.loc[:, feature])
|
||||
values = list(x.loc[:, feature])
|
||||
if feature not in self.categorical_features:
|
||||
fd['min'] = min(values)
|
||||
fd['max'] = max(values)
|
||||
|
|
@ -259,7 +260,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
categorical_features = [f for f in self._features if f in self.categorical_features and
|
||||
f in self.features_to_minimize]
|
||||
|
||||
|
||||
numeric_transformer = Pipeline(
|
||||
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
|
||||
)
|
||||
|
|
@ -288,7 +288,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
("cat", categorical_transformer, self.categorical_features),
|
||||
]
|
||||
)
|
||||
preprocessor.fit(X)
|
||||
preprocessor.fit(x)
|
||||
x_prepared = preprocessor.transform(X_train)
|
||||
if self.train_only_QI:
|
||||
x_prepared = preprocessor_QI_features.transform(X_train_QI)
|
||||
|
|
@ -300,7 +300,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self.dt_ = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1)
|
||||
else:
|
||||
self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
min_samples_leaf=1)
|
||||
self.dt_.fit(x_prepared, y_train)
|
||||
self._modify_categorical_features(used_data)
|
||||
|
||||
|
|
@ -329,7 +329,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
|
||||
|
||||
# check accuracy
|
||||
accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
|
||||
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
|
||||
print('Initial accuracy of model on generalized data, relative to original model predictions '
|
||||
'(base generalization derived from tree, before improvements): %f' % accuracy)
|
||||
|
||||
|
|
@ -349,7 +349,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
self._calculate_generalizations()
|
||||
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_,
|
||||
self.cells_by_id_)
|
||||
accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
|
||||
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
|
||||
# if accuracy passed threshold roll back to previous iteration generalizations
|
||||
if accuracy < self.target_accuracy:
|
||||
self.cells_ = cells_previous_iter
|
||||
|
|
@ -375,7 +375,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
|
||||
self._calculate_generalizations()
|
||||
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
|
||||
accuracy = self.estimator.score(preprocessor.transform(generalized), y_test)
|
||||
accuracy = self.estimator.score(ArrayDataset(preprocessor.transform(generalized), y_test))
|
||||
print('Removed feature: %s, new relative accuracy: %f' % (removed_feature, accuracy))
|
||||
|
||||
# self.cells_ currently holds the chosen generalization based on target accuracy
|
||||
|
|
@ -386,7 +386,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
# Return the transformer
|
||||
return self
|
||||
|
||||
def transform(self, X: Union[np.ndarray, pd.DataFrame]):
|
||||
def transform(self, X: Optional[DATA_PANDAS_NUMPY_TYPE] = None, features_names: Optional = None, dataset: ArrayDataset = None):
|
||||
""" Transforms data records to representative points.
|
||||
|
||||
Parameters
|
||||
|
|
@ -394,6 +394,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe,
|
||||
may contain both numeric and categorical data.
|
||||
The input samples.
|
||||
features_names : list of str, The feature names, in the order that they appear in the data,
|
||||
provided just if X was provided (optional).
|
||||
dataset : Data wrapper containing the training input samples.
|
||||
Either X OR dataset need to be provided, not both.
|
||||
Returns
|
||||
-------
|
||||
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
|
||||
|
|
@ -405,26 +409,30 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
msg = 'This %(name)s instance is not initialized yet. ' \
|
||||
'Call ‘fit’ or ‘set_params’ with ' \
|
||||
'appropriate arguments before using this method.'
|
||||
check_is_fitted(self, ['cells', 'features'], msg=msg)
|
||||
check_is_fitted(self, ['cells'], msg=msg)
|
||||
|
||||
if type(X) == np.ndarray:
|
||||
# Input validation
|
||||
X = check_array(X, accept_sparse=True)
|
||||
self.is_numpy = True
|
||||
X = pd.DataFrame(X, columns=self._features)
|
||||
else:
|
||||
self.is_numpy = False
|
||||
if X is not None:
|
||||
if dataset is not None:
|
||||
raise ValueError('Either X OR dataset need to be provided, not both')
|
||||
else:
|
||||
dataset = ArrayDataset(X, features_names=features_names)
|
||||
elif dataset is None:
|
||||
raise ValueError('Either X OR dataset need to be provided, not both')
|
||||
if dataset and dataset.features_names:
|
||||
self._features = dataset.features_names
|
||||
if dataset and dataset.get_samples() is not None:
|
||||
x = pd.DataFrame(dataset.get_samples(), columns=self._features)
|
||||
|
||||
if X.shape[1] != self.n_features_ and self.n_features_ != 0:
|
||||
if x.shape[1] != self.n_features_ and self.n_features_ != 0:
|
||||
raise ValueError('Shape of input is different from what was seen'
|
||||
'in `fit`')
|
||||
|
||||
if not self._features:
|
||||
self._features = [i for i in range(X.shape[1])]
|
||||
self._features = [i for i in range(x.shape[1])]
|
||||
|
||||
representatives = pd.DataFrame(columns=self._features) # only columns
|
||||
generalized = pd.DataFrame(X, columns=self._features, copy=True) # original data
|
||||
mapped = np.zeros(X.shape[0]) # to mark records we already mapped
|
||||
generalized = pd.DataFrame(x, columns=self._features, copy=True) # original data
|
||||
mapped = np.zeros(x.shape[0]) # to mark records we already mapped
|
||||
|
||||
# iterate over cells (leaves in decision tree)
|
||||
for i in range(len(self.cells_)):
|
||||
|
|
@ -443,7 +451,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
representatives = representatives.drop(feature, axis=1)
|
||||
|
||||
# get the indexes of all records that map to this cell
|
||||
indexes = self._get_record_indexes_for_cell(X, self.cells_[i], mapped)
|
||||
indexes = self._get_record_indexes_for_cell(x, self.cells_[i], mapped)
|
||||
|
||||
# replace the values in the representative columns with the representative
|
||||
# values (leaves others untouched)
|
||||
|
|
@ -454,9 +462,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
|
||||
replace.index = indexes
|
||||
generalized.loc[indexes, representatives.columns] = replace
|
||||
if self.is_numpy:
|
||||
return generalized.to_numpy()
|
||||
return generalized
|
||||
if dataset and dataset.is_pandas:
|
||||
return generalized
|
||||
elif isinstance(X, pd.DataFrame):
|
||||
return generalized
|
||||
return generalized.to_numpy()
|
||||
|
||||
def _get_record_indexes_for_cell(self, X, cell, mapped):
|
||||
indexes = []
|
||||
|
|
@ -640,7 +650,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
# else: nothing to do, stay with previous cells
|
||||
|
||||
def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
|
||||
new_cell['hist'] = [x + y for x, y in zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
|
||||
new_cell['hist'] = [x + y for x, y in
|
||||
zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
|
||||
new_cell['label'] = int(self.dt_.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1
|
||||
|
||||
def _get_nodes_level(self, level):
|
||||
|
|
@ -797,8 +808,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
cells_by_id = copy.deepcopy(self.cells_by_id_)
|
||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
|
||||
accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized),
|
||||
labels) - current_accuracy
|
||||
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
|
||||
labels)) - current_accuracy
|
||||
if accuracy_gain < 0:
|
||||
accuracy_gain = 0
|
||||
if accuracy_gain != 0:
|
||||
|
|
@ -820,8 +831,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
|
|||
cells_by_id = copy.deepcopy(self.cells_by_id_)
|
||||
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
|
||||
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
|
||||
accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized),
|
||||
labels) - current_accuracy
|
||||
accuracy_gain = self.estimator.score(ArrayDataset(self._preprocessor.transform(generalized),
|
||||
labels)) - current_accuracy
|
||||
|
||||
if accuracy_gain < 0:
|
||||
accuracy_gain = 0
|
||||
|
|
|
|||
0
apt/utils/__init__.py
Normal file
0
apt/utils/__init__.py
Normal file
|
|
@ -13,8 +13,7 @@ def _load_iris(test_set_size: float = 0.3):
|
|||
|
||||
# Split training and test sets
|
||||
x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels, test_size=test_set_size,
|
||||
random_state=18, stratify=labels,
|
||||
shuffle=True)
|
||||
random_state=18, stratify=labels)
|
||||
|
||||
return (x_train, y_train), (x_test, y_test)
|
||||
|
||||
|
|
@ -29,6 +28,28 @@ def get_iris_dataset(test_set: float = 0.3):
|
|||
return _load_iris(test_set)
|
||||
|
||||
|
||||
def _load_diabetes(test_set_size: float = 0.3):
|
||||
diabetes = datasets.load_diabetes()
|
||||
data = diabetes.data
|
||||
labels = diabetes.target
|
||||
|
||||
# Split training and test sets
|
||||
x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels, test_size=test_set_size,
|
||||
random_state=18)
|
||||
|
||||
return (x_train, y_train), (x_test, y_test)
|
||||
|
||||
|
||||
def get_diabetes_dataset():
|
||||
"""
|
||||
Loads the Iris dataset from scikit-learn.
|
||||
|
||||
:param test_set: Proportion of the data to use as validation split (value between 0 and 1).
|
||||
:return: Entire dataset and labels as numpy array.
|
||||
"""
|
||||
return _load_diabetes()
|
||||
|
||||
|
||||
def get_german_credit_dataset(test_set: float = 0.3):
|
||||
"""
|
||||
Loads the UCI German_credit dataset from `tests/datasets/german` or downloads it if necessary.
|
||||
|
|
@ -253,7 +274,7 @@ def get_nursery_dataset(raw: bool = True, test_set: float = 0.2, transform_socia
|
|||
raise Exception("Bad label value: %s" % value)
|
||||
|
||||
data["label"] = data["label"].apply(modify_label)
|
||||
data["children"] = data["children"].apply(lambda x: 4 if x == "more" else x)
|
||||
data["children"] = data["children"].apply(lambda x: "4" if x == "more" else x)
|
||||
|
||||
if transform_social:
|
||||
|
||||
7
apt/utils/datasets/__init__.py
Normal file
7
apt/utils/datasets/__init__.py
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
"""
|
||||
The AI Privacy Toolbox (datasets).
|
||||
Implementation of datasets utility components for datasets creation, load, and store
|
||||
"""
|
||||
|
||||
from apt.utils.datasets.datasets import Dataset, StoredDataset, DatasetFactory, Data, ArrayDataset, \
|
||||
OUTPUT_DATA_ARRAY_TYPE, DATA_PANDAS_NUMPY_TYPE
|
||||
320
apt/utils/datasets/datasets.py
Normal file
320
apt/utils/datasets/datasets.py
Normal file
|
|
@ -0,0 +1,320 @@
|
|||
# !/usr/bin/env python
|
||||
"""
|
||||
The AI Privacy Toolbox (datasets).
|
||||
Implementation of utility classes for dataset handling
|
||||
"""
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from typing import Callable, Collection, Any, Union, List, Optional
|
||||
|
||||
import tarfile
|
||||
import os
|
||||
import urllib.request
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import logging
|
||||
import torch
|
||||
from torch import Tensor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
INPUT_DATA_ARRAY_TYPE = Union[np.ndarray, pd.DataFrame, List, Tensor]
|
||||
OUTPUT_DATA_ARRAY_TYPE = np.ndarray
|
||||
DATA_PANDAS_NUMPY_TYPE = Union[np.ndarray, pd.DataFrame]
|
||||
|
||||
|
||||
def array2numpy(self, arr: INPUT_DATA_ARRAY_TYPE) -> OUTPUT_DATA_ARRAY_TYPE:
    """
    Convert ``arr`` to a numpy array.

    :param self: object carrying an ``is_pandas`` flag; the flag is set to True when the
                 input is a pandas DataFrame/Series so callers can remember the original type
    :param arr: input data (numpy array, pandas DataFrame/Series, list, or torch Tensor)
    :return: the data as a numpy array
    :raises ValueError: if the input type is not supported
    """
    # isinstance (instead of exact `type(...) ==` comparisons) also accepts
    # subclasses of the supported types.
    if isinstance(arr, np.ndarray):
        return arr
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        self.is_pandas = True
        return arr.to_numpy()
    if isinstance(arr, list):
        return np.array(arr)
    if isinstance(arr, Tensor):
        # detach + cpu makes the conversion safe for gradient-tracking / GPU tensors
        return arr.detach().cpu().numpy()

    raise ValueError('Non supported type: ', type(arr).__name__)
|
||||
|
||||
|
||||
def array2torch_tensor(self, arr: INPUT_DATA_ARRAY_TYPE) -> Tensor:
    """
    Convert ``arr`` to a torch Tensor.

    :param self: object carrying an ``is_pandas`` flag; the flag is set to True when the
                 input is a pandas DataFrame/Series so callers can remember the original type
    :param arr: input data (numpy array, pandas DataFrame/Series, list, or torch Tensor)
    :return: the data as a torch Tensor
    :raises ValueError: if the input type is not supported
    """
    # isinstance (instead of exact `type(...) ==` comparisons) also accepts
    # subclasses of the supported types.
    if isinstance(arr, np.ndarray):
        return torch.from_numpy(arr)
    if isinstance(arr, (pd.DataFrame, pd.Series)):
        self.is_pandas = True
        return torch.from_numpy(arr.to_numpy())
    if isinstance(arr, list):
        return torch.tensor(arr)
    if isinstance(arr, Tensor):
        return arr

    raise ValueError('Non supported type: ', type(arr).__name__)
|
||||
|
||||
|
||||
class Dataset(metaclass=ABCMeta):
    """
    Abstract base class that every dataset wrapper must implement.

    Concrete subclasses provide access to samples and labels through
    ``get_samples`` and ``get_labels``.
    """

    @abstractmethod
    def __init__(self, **kwargs):
        """Initialize the dataset with implementation-specific parameters."""
        pass

    @abstractmethod
    def get_samples(self) -> Collection[Any]:
        """Return the collection of data samples."""
        pass

    @abstractmethod
    def get_labels(self) -> Collection[Any]:
        """Return the collection of labels."""
        pass
|
||||
|
||||
|
||||
class StoredDataset(Dataset):
    """
    Abstract base class for datasets that can be downloaded and stored on disk.

    Provides static helpers for downloading, extracting, and sub-sampling
    dataset files; subclasses implement the actual loading logic.
    """

    @abstractmethod
    def load_from_file(self, path: str):
        """Load the dataset from a local file."""
        pass

    @abstractmethod
    def load(self, **kwargs):
        """Load the dataset."""
        pass

    @staticmethod
    def download(url: str, dest_path: str, filename: str, unzip: bool = False) -> None:
        """
        Download the dataset from a URL, skipping the download if the file already exists.

        :param url: dataset URL, the dataset will be requested from this URL
        :param dest_path: local dataset destination directory
        :param filename: local dataset filename
        :param unzip: whether to extract the archive after download
        :return: None
        """
        file_path = os.path.join(dest_path, filename)

        if os.path.exists(file_path):
            logger.warning("Files already downloaded, skipping downloading")
        else:
            os.makedirs(dest_path, exist_ok=True)
            logger.info("Downloading the dataset...")
            urllib.request.urlretrieve(url, file_path)
            logger.info('Dataset Downloaded')

        # Extract even when the archive was already present, so an earlier
        # interrupted extraction can be completed on a re-run.
        if unzip:
            StoredDataset.extract_archive(zip_path=file_path, dest_path=dest_path, remove_archive=False)

    @staticmethod
    def extract_archive(zip_path: str, dest_path=None, remove_archive=False):
        """
        Extract a tar archive.

        :param zip_path: path to the archive file
        :param dest_path: directory to extract into (default: current directory)
        :param remove_archive: whether to delete the archive after extraction (default False)
        :return: None
        """
        logger.info("Extracting the dataset...")
        # Context manager closes the archive deterministically (the original
        # implementation leaked the open file handle).
        # NOTE(review): extractall on an untrusted archive can write outside
        # dest_path (path traversal) — acceptable only for trusted sources.
        with tarfile.open(zip_path) as tar:
            tar.extractall(path=dest_path)

        logger.info("Dataset was extracted to {}".format(dest_path))
        if remove_archive:
            logger.info("Removing a zip file")
            os.remove(zip_path)
        logger.info("Extracted the dataset")

    @staticmethod
    def split_debug(datafile: str, dest_datafile: str, ratio: float, shuffle=True, delimiter=",", fmt=None) -> None:
        """
        Save a (possibly shuffled) fraction of a delimited dataset file, for debug runs.

        :param datafile: source dataset file path
        :param dest_datafile: destination path for the partial dataset file
        :param ratio: fraction of the dataset to keep (between 0 and 1); the original
                      annotation said ``int`` but the value is used as a fraction
        :param shuffle: whether to shuffle the rows before taking the prefix (default True)
        :param delimiter: field delimiter (default ",")
        :param fmt: numpy ``savetxt`` format for the output
        :return: None
        """
        if os.path.isfile(dest_datafile):
            logger.info(f"The partial debug split already exists {dest_datafile}")
            return
        else:
            os.makedirs(os.path.dirname(dest_datafile), exist_ok=True)

        data = np.genfromtxt(datafile, delimiter=delimiter)
        if shuffle:
            logger.info("Shuffling data")
            np.random.shuffle(data)

        debug_data = data[:int(len(data) * ratio)]
        logger.info(f"Saving {ratio} of the data to {dest_datafile}")
        np.savetxt(dest_datafile, debug_data, delimiter=delimiter, fmt=fmt)
|
||||
|
||||
|
||||
class ArrayDataset(Dataset):
    """Dataset backed by in-memory x and y arrays (numpy/pandas/list/torch)."""

    def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None,
                 features_names: Optional = None, **kwargs):
        """
        ArrayDataset constructor.

        :param x: collection of data samples
        :param y: collection of labels (optional)
        :param features_names: list of str, the feature names in the order that they
                               appear in the data (optional)
        :param kwargs: dataset parameters
        :raises ValueError: if supplied feature names disagree with the DataFrame columns,
                            or if x and y lengths differ
        """
        self.is_pandas = False
        self.features_names = features_names
        self._y = array2numpy(self, y) if y is not None else None
        self._x = array2numpy(self, x)
        # Only a DataFrame x carries column names. The is_pandas flag alone is not a
        # safe guard here: it is also set when y is pandas or when x is a pandas
        # Series (which has no `.columns`), and the original code crashed in both cases.
        if isinstance(x, pd.DataFrame):
            if features_names and not np.array_equal(features_names, x.columns):
                raise ValueError("The supplied features are not the same as in the data features")
            self.features_names = x.columns.to_list()

        if y is not None and len(self._x) != len(self._y):
            raise ValueError('Non equivalent lengths of x and y')

    def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """Return data samples as numpy array"""
        return self._x

    def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """Return labels as numpy array"""
        return self._y
|
||||
|
||||
|
||||
class PytorchData(Dataset):
    """
    Dataset that stores x and y as torch tensors and supports indexing/len,
    so it can be consumed like a pytorch-style dataset.
    """

    def __init__(self, x: INPUT_DATA_ARRAY_TYPE, y: Optional[INPUT_DATA_ARRAY_TYPE] = None, **kwargs):
        """
        PytorchData constructor.

        :param x: collection of data samples
        :param y: collection of labels (optional)
        :param kwargs: dataset parameters
        :raises ValueError: if x and y lengths differ
        """
        self.is_pandas = False
        self._y = array2torch_tensor(self, y) if y is not None else None
        self._x = array2torch_tensor(self, x)
        if self.is_pandas:
            # assumes the pandas input was x — TODO confirm y-only pandas input is unsupported
            self.features_names = x.columns

        if y is not None and len(self._x) != len(self._y):
            raise ValueError('Non equivalent lengths of x and y')

    def get_samples(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """Return data samples as numpy array"""
        # Bug fix: array2numpy takes (self, arr); the original passed only the array.
        return array2numpy(self, self._x)

    def get_labels(self) -> OUTPUT_DATA_ARRAY_TYPE:
        """Return labels as numpy array"""
        return array2numpy(self, self._y) if self._y is not None else None

    def get_sample_item(self, idx) -> Tensor:
        """Return the sample at index ``idx``."""
        # Bug fix: the tensors are stored in _x/_y; `self.x`/`self.y` never existed.
        return self._x[idx]

    def get_item(self, idx) -> Tensor:
        """Return the (sample, label) pair at index ``idx``."""
        sample, label = self._x[idx], self._y[idx]
        return sample, label

    def __getitem__(self, idx):
        # Bug fix: assigning self.__getitem__ per-instance has no effect, because
        # Python looks dunder methods up on the type; dispatch on labels here instead.
        if self._y is not None:
            return self.get_item(idx)
        return self.get_sample_item(idx)

    def __len__(self):
        """Return the number of samples."""
        return len(self._x)
|
||||
|
||||
|
||||
class DatasetFactory:
    """Registry-based factory for creating Dataset instances by name."""

    # maps dataset name -> dataset class
    registry = {}

    @classmethod
    def register(cls, name: str) -> Callable:
        """
        Class-decorator factory that registers a Dataset class under ``name``.

        :param name: dataset name to register under
        :return: the decorator that performs the registration
        """
        def inner_wrapper(wrapped_class: Dataset) -> Any:
            if name in cls.registry:
                logger.warning('Dataset %s already exists. Will replace it', name)
            cls.registry[name] = wrapped_class
            return wrapped_class

        return inner_wrapper

    @classmethod
    def create_dataset(cls, name: str, **kwargs) -> Dataset:
        """
        Create an instance of the dataset registered under ``name``.

        Looks up the class in the registry and instantiates it with ``kwargs``.

        :param name: The name of the dataset to create.
        :param kwargs: dataset parameters, forwarded to the class constructor
        :return: An instance of the requested dataset.
        :raises ValueError: if no dataset is registered under ``name``
        """
        if name not in cls.registry:
            msg = f'Dataset {name} does not exist in the registry'
            logger.error(msg)
            raise ValueError(msg)

        dataset_class = cls.registry[name]
        return dataset_class(**kwargs)
|
||||
|
||||
|
||||
class Data:
    """Container that pairs a train Dataset with a test Dataset."""

    def __init__(self, train: Dataset = None, test: Dataset = None, **kwargs):
        """
        Data class constructor.

        Stores the given train and test datasets. When neither is provided,
        both are created through DatasetFactory using ``kwargs``.
        """
        # Bug fix: the original tested `if train or test:` (truthiness). A provided
        # but empty dataset (e.g. a PytorchData with len() == 0) is falsy and would
        # wrongly be discarded in favor of the factory path; test identity instead.
        if train is None and test is None:
            self.train = DatasetFactory.create_dataset(train=True, **kwargs)
            self.test = DatasetFactory.create_dataset(train=False, **kwargs)
        else:
            self.train = train
            self.test = test

    def get_train_set(self) -> Dataset:
        """Return the train Dataset."""
        return self.train

    def get_test_set(self) -> Dataset:
        """Return the test Dataset."""
        return self.test

    def get_train_samples(self) -> Collection[Any]:
        """Return the train set samples."""
        return self.train.get_samples()

    def get_train_labels(self) -> Collection[Any]:
        """Return the train set labels."""
        return self.train.get_labels()

    def get_test_samples(self) -> Collection[Any]:
        """Return the test set samples."""
        return self.test.get_samples()

    def get_test_labels(self) -> Collection[Any]:
        """Return the test set labels."""
        return self.test.get_labels()
|
||||
2
apt/utils/models/__init__.py
Normal file
2
apt/utils/models/__init__.py
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
from apt.utils.models.model import Model, ModelOutputType
|
||||
from apt.utils.models.sklearn_model import SklearnModel, SklearnClassifier, SklearnRegressor
|
||||
109
apt/utils/models/model.py
Normal file
109
apt/utils/models/model.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
from abc import ABCMeta, abstractmethod
|
||||
from typing import Any, Optional
|
||||
from enum import Enum, auto
|
||||
|
||||
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
|
||||
|
||||
|
||||
class ModelOutputType(Enum):
    """Kinds of output a wrapped ML model can produce."""

    CLASSIFIER_VECTOR = auto()  # per-class probabilities or logits
    CLASSIFIER_SCALAR = auto()  # predicted label only
    REGRESSOR_SCALAR = auto()   # single regression value
|
||||
|
||||
|
||||
class Model(metaclass=ABCMeta):
    """
    Abstract base class for ML model wrappers.
    """

    def __init__(self, model: Any, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        """
        Initialize a `Model` wrapper object.

        :param model: The original model object (of the underlying ML framework)
        :param output_type: The type of output the model yields (vector/label only for classifiers,
                            value for regressors)
        :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                                 Set to True if the model is only available via query (API) access, i.e.,
                                 only the outputs of the model are exposed, and False if the model internals
                                 are also available. Optional, Default is True.
        :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                                  unlimited queries to the model API or whether there is a limit to the number of
                                  queries that can be submitted. Optional, Default is True.
        """
        self._model = model
        self._output_type = output_type
        self._black_box_access = black_box_access
        self._unlimited_queries = unlimited_queries

    @abstractmethod
    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        :param train_data: Training data.
        :type train_data: `Dataset`
        """
        raise NotImplementedError

    @abstractmethod
    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset`
        :return: Predictions from the model.
        """
        raise NotImplementedError

    @abstractmethod
    def score(self, test_data: Dataset, **kwargs):
        """
        Score the model using test data.

        :param test_data: Test data.
        :type test_data: `Dataset`
        """
        # Bug fix: the original *returned* the NotImplementedError class instead of
        # raising it, so a subclass forgetting to override score would fail silently.
        raise NotImplementedError

    @property
    def model(self) -> Any:
        """
        Return the model.

        :return: The model.
        """
        return self._model

    @property
    def output_type(self) -> ModelOutputType:
        """
        Return the model's output type.

        :return: The model's output type.
        """
        return self._output_type

    @property
    def black_box_access(self) -> bool:
        """
        Return True if the model is only available via query (API) access, i.e.,
        only the outputs of the model are exposed, and False if the model internals are also available.

        :return: True for query-only (black-box) access, False otherwise.
        """
        return self._black_box_access

    @property
    def unlimited_queries(self) -> bool:
        """
        If black_box_access is True, return whether a user can perform unlimited queries to the model API
        or whether there is a limit to the number of queries that can be submitted.

        :return: True if queries are unlimited, False if a query limit applies.
        """
        return self._unlimited_queries
|
||||
112
apt/utils/models/sklearn_model.py
Normal file
112
apt/utils/models/sklearn_model.py
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.base import BaseEstimator
|
||||
|
||||
from apt.utils.models import Model, ModelOutputType
|
||||
from apt.utils.datasets import Dataset, OUTPUT_DATA_ARRAY_TYPE
|
||||
|
||||
from art.estimators.classification.scikitlearn import SklearnClassifier as ArtSklearnClassifier
|
||||
from art.estimators.regression.scikitlearn import ScikitlearnRegressor
|
||||
|
||||
|
||||
class SklearnModel(Model):
    """
    Wrapper class for scikit-learn models.
    """

    def score(self, test_data: Dataset, **kwargs):
        """
        Score the model using test data.

        :param test_data: Test data.
        :type test_data: `Dataset`
        :return: the underlying estimator's score on the test data
        """
        samples = test_data.get_samples()
        labels = test_data.get_labels()
        return self.model.score(samples, labels, **kwargs)
|
||||
|
||||
|
||||
class SklearnClassifier(SklearnModel):
    """
    Wrapper class for scikit-learn classification models.
    """

    def __init__(self, model: BaseEstimator, output_type: ModelOutputType, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        """
        Initialize a `SklearnClassifier` wrapper object.

        :param model: The original sklearn model object.
        :param output_type: The type of output the model yields (vector/label only for classifiers,
                            value for regressors)
        :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                                 Set to True if the model is only available via query (API) access, i.e.,
                                 only the outputs of the model are exposed, and False if the model internals
                                 are also available. Optional, Default is True.
        :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                                  unlimited queries to the model API or whether there is a limit to the number of
                                  queries that can be submitted. Optional, Default is True.
        """
        super().__init__(model, output_type, black_box_access, unlimited_queries, **kwargs)
        self._art_model = ArtSklearnClassifier(model)

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        :param train_data: Training data.
        :type train_data: `Dataset`
        """
        # ART classifiers expect one-hot encoded labels
        encoder = OneHotEncoder(sparse=False)
        y_encoded = encoder.fit_transform(train_data.get_labels().reshape(-1, 1))
        self._art_model.fit(train_data.get_samples(), y_encoded, **kwargs)

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset`
        :return: Predictions from the model (class probabilities, if supported).
        """
        # Bug fix: the original passed the Dataset wrapper itself to the ART
        # estimator; it must be unwrapped to raw samples (as `fit` already does).
        return self._art_model.predict(x.get_samples(), **kwargs)
|
||||
|
||||
|
||||
class SklearnRegressor(SklearnModel):
    """
    Wrapper class for scikit-learn regression models.
    """

    def __init__(self, model: BaseEstimator, black_box_access: Optional[bool] = True,
                 unlimited_queries: Optional[bool] = True, **kwargs):
        """
        Initialize a `SklearnRegressor` wrapper object.

        :param model: The original sklearn model object.
        :param black_box_access: Boolean describing the type of deployment of the model (when in production).
                                 Set to True if the model is only available via query (API) access, i.e.,
                                 only the outputs of the model are exposed, and False if the model internals
                                 are also available. Optional, Default is True.
        :param unlimited_queries: If black_box_access is True, this boolean indicates whether a user can perform
                                  unlimited queries to the model API or whether there is a limit to the number of
                                  queries that can be submitted. Optional, Default is True.
        """
        super().__init__(model, ModelOutputType.REGRESSOR_SCALAR, black_box_access, unlimited_queries, **kwargs)
        self._art_model = ScikitlearnRegressor(model)

    def fit(self, train_data: Dataset, **kwargs) -> None:
        """
        Fit the model using the training data.

        :param train_data: Training data.
        :type train_data: `Dataset`
        """
        self._art_model.fit(train_data.get_samples(), train_data.get_labels(), **kwargs)

    def predict(self, x: Dataset, **kwargs) -> OUTPUT_DATA_ARRAY_TYPE:
        """
        Perform predictions using the model for input `x`.

        :param x: Input samples.
        :type x: `Dataset`
        :return: Predictions from the model.
        """
        # Bug fix: the original passed the Dataset wrapper itself to the ART
        # estimator; it must be unwrapped to raw samples (as `fit` already does).
        return self._art_model.predict(x.get_samples(), **kwargs)
|
||||
|
|
@ -29,198 +29,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 61,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>parents</th>\n",
|
||||
" <th>has_nurs</th>\n",
|
||||
" <th>form</th>\n",
|
||||
" <th>children</th>\n",
|
||||
" <th>housing</th>\n",
|
||||
" <th>finance</th>\n",
|
||||
" <th>social</th>\n",
|
||||
" <th>health</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>8450</th>\n",
|
||||
" <td>pretentious</td>\n",
|
||||
" <td>very_crit</td>\n",
|
||||
" <td>foster</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>less_conv</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>not_recom</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12147</th>\n",
|
||||
" <td>great_pret</td>\n",
|
||||
" <td>very_crit</td>\n",
|
||||
" <td>complete</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>critical</td>\n",
|
||||
" <td>inconv</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>recommended</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2780</th>\n",
|
||||
" <td>usual</td>\n",
|
||||
" <td>critical</td>\n",
|
||||
" <td>complete</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>less_conv</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>not_recom</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11924</th>\n",
|
||||
" <td>great_pret</td>\n",
|
||||
" <td>critical</td>\n",
|
||||
" <td>foster</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>critical</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>not_recom</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>59</th>\n",
|
||||
" <td>usual</td>\n",
|
||||
" <td>proper</td>\n",
|
||||
" <td>complete</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>not_recom</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5193</th>\n",
|
||||
" <td>pretentious</td>\n",
|
||||
" <td>less_proper</td>\n",
|
||||
" <td>complete</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>inconv</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>recommended</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1375</th>\n",
|
||||
" <td>usual</td>\n",
|
||||
" <td>less_proper</td>\n",
|
||||
" <td>incomplete</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>less_conv</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>priority</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10318</th>\n",
|
||||
" <td>great_pret</td>\n",
|
||||
" <td>less_proper</td>\n",
|
||||
" <td>foster</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>priority</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6396</th>\n",
|
||||
" <td>pretentious</td>\n",
|
||||
" <td>improper</td>\n",
|
||||
" <td>completed</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>less_conv</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>recommended</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>485</th>\n",
|
||||
" <td>usual</td>\n",
|
||||
" <td>proper</td>\n",
|
||||
" <td>incomplete</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>critical</td>\n",
|
||||
" <td>inconv</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>not_recom</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>10366 rows × 8 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" parents has_nurs form children housing finance \\\n",
|
||||
"8450 pretentious very_crit foster 1 less_conv convenient \n",
|
||||
"12147 great_pret very_crit complete 1 critical inconv \n",
|
||||
"2780 usual critical complete 4 less_conv convenient \n",
|
||||
"11924 great_pret critical foster 1 critical convenient \n",
|
||||
"59 usual proper complete 2 convenient convenient \n",
|
||||
"... ... ... ... ... ... ... \n",
|
||||
"5193 pretentious less_proper complete 1 convenient inconv \n",
|
||||
"1375 usual less_proper incomplete 2 less_conv convenient \n",
|
||||
"10318 great_pret less_proper foster 4 convenient convenient \n",
|
||||
"6396 pretentious improper completed 3 less_conv convenient \n",
|
||||
"485 usual proper incomplete 1 critical inconv \n",
|
||||
"\n",
|
||||
" social health \n",
|
||||
"8450 1 not_recom \n",
|
||||
"12147 1 recommended \n",
|
||||
"2780 1 not_recom \n",
|
||||
"11924 1 not_recom \n",
|
||||
"59 0 not_recom \n",
|
||||
"... ... ... \n",
|
||||
"5193 0 recommended \n",
|
||||
"1375 1 priority \n",
|
||||
"10318 0 priority \n",
|
||||
"6396 1 recommended \n",
|
||||
"485 1 not_recom \n",
|
||||
"\n",
|
||||
"[10366 rows x 8 columns]"
|
||||
]
|
||||
"text/plain": " parents has_nurs form children housing finance \\\n8450 pretentious very_crit foster 1 less_conv convenient \n12147 great_pret very_crit complete 1 critical inconv \n2780 usual critical complete 4 less_conv convenient \n11924 great_pret critical foster 1 critical convenient \n59 usual proper complete 2 convenient convenient \n... ... ... ... ... ... ... \n5193 pretentious less_proper complete 1 convenient inconv \n1375 usual less_proper incomplete 2 less_conv convenient \n10318 great_pret less_proper foster 4 convenient convenient \n6396 pretentious improper completed 3 less_conv convenient \n485 usual proper incomplete 1 critical inconv \n\n social health \n8450 1 not_recom \n12147 1 recommended \n2780 1 not_recom \n11924 1 not_recom \n59 0 not_recom \n... ... ... \n5193 0 recommended \n1375 1 priority \n10318 0 priority \n6396 1 recommended \n485 1 not_recom \n\n[10366 rows x 8 columns]",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>parents</th>\n <th>has_nurs</th>\n <th>form</th>\n <th>children</th>\n <th>housing</th>\n <th>finance</th>\n <th>social</th>\n <th>health</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>8450</th>\n <td>pretentious</td>\n <td>very_crit</td>\n <td>foster</td>\n <td>1</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>12147</th>\n <td>great_pret</td>\n <td>very_crit</td>\n <td>complete</td>\n <td>1</td>\n <td>critical</td>\n <td>inconv</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>2780</th>\n <td>usual</td>\n <td>critical</td>\n <td>complete</td>\n <td>4</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>11924</th>\n <td>great_pret</td>\n <td>critical</td>\n <td>foster</td>\n <td>1</td>\n <td>critical</td>\n <td>convenient</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>59</th>\n <td>usual</td>\n <td>proper</td>\n <td>complete</td>\n <td>2</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5193</th>\n <td>pretentious</td>\n <td>less_proper</td>\n <td>complete</td>\n <td>1</td>\n <td>convenient</td>\n <td>inconv</td>\n <td>0</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>1375</th>\n <td>usual</td>\n <td>less_proper</td>\n <td>incomplete</td>\n <td>2</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>10318</th>\n <td>great_pret</td>\n <td>less_proper</td>\n 
<td>foster</td>\n <td>4</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>6396</th>\n <td>pretentious</td>\n <td>improper</td>\n <td>completed</td>\n <td>3</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>485</th>\n <td>usual</td>\n <td>proper</td>\n <td>incomplete</td>\n <td>1</td>\n <td>critical</td>\n <td>inconv</td>\n <td>1</td>\n <td>not_recom</td>\n </tr>\n </tbody>\n</table>\n<p>10366 rows × 8 columns</p>\n</div>"
|
||||
},
|
||||
"execution_count": 61,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -230,7 +47,7 @@
|
|||
"import sys\n",
|
||||
"sys.path.insert(0, os.path.abspath('..'))\n",
|
||||
"\n",
|
||||
"from apt.utils import get_nursery_dataset\n",
|
||||
"from apt.utils.dataset_utils import get_nursery_dataset\n",
|
||||
"\n",
|
||||
"(x_train, y_train), (x_test, y_test) = get_nursery_dataset(transform_social=True)\n",
|
||||
"\n",
|
||||
|
|
@ -246,7 +63,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 62,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -263,9 +80,9 @@
|
|||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||
"\n",
|
||||
"x_train_str = x_train.astype(str)\n",
|
||||
"train_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(x_train_str)\n",
|
||||
"train_encoded = OneHotEncoder(sparse=False).fit_transform(x_train_str)\n",
|
||||
"x_test_str = x_test.astype(str)\n",
|
||||
"test_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(x_test_str)\n",
|
||||
"test_encoded = OneHotEncoder(sparse=False).fit_transform(x_test_str)\n",
|
||||
" \n",
|
||||
"model = DecisionTreeClassifier()\n",
|
||||
"model.fit(train_encoded, y_train)\n",
|
||||
|
|
@ -287,7 +104,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 91,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -323,14 +140,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 96,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.6430638626278217\n"
|
||||
"1.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -361,14 +178,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 55,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.6980513216284006\n"
|
||||
"0.5122515917422342\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -408,224 +225,43 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 97,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>parents</th>\n",
|
||||
" <th>has_nurs</th>\n",
|
||||
" <th>form</th>\n",
|
||||
" <th>children</th>\n",
|
||||
" <th>housing</th>\n",
|
||||
" <th>finance</th>\n",
|
||||
" <th>social</th>\n",
|
||||
" <th>health</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>8450</th>\n",
|
||||
" <td>pretentious</td>\n",
|
||||
" <td>very_crit</td>\n",
|
||||
" <td>foster</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>less_conv</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>not_recom</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12147</th>\n",
|
||||
" <td>great_pret</td>\n",
|
||||
" <td>very_crit</td>\n",
|
||||
" <td>complete</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>critical</td>\n",
|
||||
" <td>inconv</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>recommended</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2780</th>\n",
|
||||
" <td>usual</td>\n",
|
||||
" <td>critical</td>\n",
|
||||
" <td>complete</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>less_conv</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>not_recom</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11924</th>\n",
|
||||
" <td>great_pret</td>\n",
|
||||
" <td>critical</td>\n",
|
||||
" <td>foster</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>critical</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>not_recom</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>59</th>\n",
|
||||
" <td>usual</td>\n",
|
||||
" <td>proper</td>\n",
|
||||
" <td>complete</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>not_recom</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5193</th>\n",
|
||||
" <td>pretentious</td>\n",
|
||||
" <td>less_proper</td>\n",
|
||||
" <td>complete</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>inconv</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>recommended</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1375</th>\n",
|
||||
" <td>usual</td>\n",
|
||||
" <td>less_proper</td>\n",
|
||||
" <td>incomplete</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>less_conv</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>priority</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10318</th>\n",
|
||||
" <td>great_pret</td>\n",
|
||||
" <td>less_proper</td>\n",
|
||||
" <td>foster</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>priority</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6396</th>\n",
|
||||
" <td>pretentious</td>\n",
|
||||
" <td>improper</td>\n",
|
||||
" <td>completed</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>less_conv</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>recommended</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>485</th>\n",
|
||||
" <td>usual</td>\n",
|
||||
" <td>proper</td>\n",
|
||||
" <td>incomplete</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>critical</td>\n",
|
||||
" <td>convenient</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>not_recom</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>10366 rows × 8 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" parents has_nurs form children housing finance \\\n",
|
||||
"8450 pretentious very_crit foster 1 less_conv convenient \n",
|
||||
"12147 great_pret very_crit complete 1 critical inconv \n",
|
||||
"2780 usual critical complete 4 less_conv convenient \n",
|
||||
"11924 great_pret critical foster 1 critical convenient \n",
|
||||
"59 usual proper complete 2 convenient convenient \n",
|
||||
"... ... ... ... ... ... ... \n",
|
||||
"5193 pretentious less_proper complete 1 convenient inconv \n",
|
||||
"1375 usual less_proper incomplete 2 less_conv convenient \n",
|
||||
"10318 great_pret less_proper foster 4 convenient convenient \n",
|
||||
"6396 pretentious improper completed 3 less_conv convenient \n",
|
||||
"485 usual proper incomplete 1 critical convenient \n",
|
||||
"\n",
|
||||
" social health \n",
|
||||
"8450 0 not_recom \n",
|
||||
"12147 1 recommended \n",
|
||||
"2780 0 not_recom \n",
|
||||
"11924 0 not_recom \n",
|
||||
"59 0 not_recom \n",
|
||||
"... ... ... \n",
|
||||
"5193 0 recommended \n",
|
||||
"1375 1 priority \n",
|
||||
"10318 0 priority \n",
|
||||
"6396 1 recommended \n",
|
||||
"485 0 not_recom \n",
|
||||
"\n",
|
||||
"[10366 rows x 8 columns]"
|
||||
]
|
||||
"text/plain": " parents has_nurs form children housing finance \\\n0 pretentious very_crit foster 1 less_conv convenient \n1 great_pret very_crit complete 1 critical inconv \n2 usual critical complete 4 less_conv convenient \n3 great_pret critical foster 1 critical convenient \n4 usual proper complete 2 convenient convenient \n... ... ... ... ... ... ... \n10361 pretentious less_proper complete 1 convenient inconv \n10362 usual less_proper incomplete 2 less_conv convenient \n10363 great_pret less_proper foster 4 convenient convenient \n10364 pretentious improper completed 3 less_conv convenient \n10365 usual proper incomplete 1 critical convenient \n\n social health \n0 0 not_recom \n1 1 recommended \n2 0 not_recom \n3 0 not_recom \n4 0 not_recom \n... ... ... \n10361 0 recommended \n10362 1 priority \n10363 0 priority \n10364 1 recommended \n10365 0 not_recom \n\n[10366 rows x 8 columns]",
|
||||
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>parents</th>\n <th>has_nurs</th>\n <th>form</th>\n <th>children</th>\n <th>housing</th>\n <th>finance</th>\n <th>social</th>\n <th>health</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>pretentious</td>\n <td>very_crit</td>\n <td>foster</td>\n <td>1</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>1</th>\n <td>great_pret</td>\n <td>very_crit</td>\n <td>complete</td>\n <td>1</td>\n <td>critical</td>\n <td>inconv</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>2</th>\n <td>usual</td>\n <td>critical</td>\n <td>complete</td>\n <td>4</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>3</th>\n <td>great_pret</td>\n <td>critical</td>\n <td>foster</td>\n <td>1</td>\n <td>critical</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>4</th>\n <td>usual</td>\n <td>proper</td>\n <td>complete</td>\n <td>2</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>10361</th>\n <td>pretentious</td>\n <td>less_proper</td>\n <td>complete</td>\n <td>1</td>\n <td>convenient</td>\n <td>inconv</td>\n <td>0</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>10362</th>\n <td>usual</td>\n <td>less_proper</td>\n <td>incomplete</td>\n <td>2</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>10363</th>\n <td>great_pret</td>\n <td>less_proper</td>\n <td>foster</td>\n 
<td>4</td>\n <td>convenient</td>\n <td>convenient</td>\n <td>0</td>\n <td>priority</td>\n </tr>\n <tr>\n <th>10364</th>\n <td>pretentious</td>\n <td>improper</td>\n <td>completed</td>\n <td>3</td>\n <td>less_conv</td>\n <td>convenient</td>\n <td>1</td>\n <td>recommended</td>\n </tr>\n <tr>\n <th>10365</th>\n <td>usual</td>\n <td>proper</td>\n <td>incomplete</td>\n <td>1</td>\n <td>critical</td>\n <td>convenient</td>\n <td>0</td>\n <td>not_recom</td>\n </tr>\n </tbody>\n</table>\n<p>10366 rows × 8 columns</p>\n</div>"
|
||||
},
|
||||
"execution_count": 97,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from apt.utils.datasets import ArrayDataset\n",
|
||||
"from apt.anonymization import Anonymize\n",
|
||||
"\n",
|
||||
"features = x_train.columns\n",
|
||||
"QI = [\"finance\", \"social\", \"health\"]\n",
|
||||
"categorical_features = [\"parents\", \"has_nurs\", \"form\", \"housing\", \"finance\", \"health\", 'children']\n",
|
||||
"anonymizer = Anonymize(100, QI, categorical_features=categorical_features)\n",
|
||||
"anon = anonymizer.anonymize(x_train, x_train_predictions)\n",
|
||||
"anon"
|
||||
"QI_indexes = [i for i, v in enumerate(features) if v in QI]\n",
|
||||
"categorical_features_indexes = [i for i, v in enumerate(features) if v in categorical_features]\n",
|
||||
"anonymizer = Anonymize(100, QI_indexes, categorical_features=categorical_features_indexes)\n",
|
||||
"anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
|
||||
"anon\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 64,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"7585"
|
||||
]
|
||||
"text/plain": "7585"
|
||||
},
|
||||
"execution_count": 64,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -637,16 +273,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 65,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"5766"
|
||||
]
|
||||
"text/plain": "5766"
|
||||
},
|
||||
"execution_count": 65,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -665,7 +299,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 66,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -678,7 +312,7 @@
|
|||
],
|
||||
"source": [
|
||||
"anon_str = anon.astype(str)\n",
|
||||
"anon_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon_str)\n",
|
||||
"anon_encoded = OneHotEncoder(sparse=False).fit_transform(anon_str)\n",
|
||||
"\n",
|
||||
"anon_model = DecisionTreeClassifier()\n",
|
||||
"anon_model.fit(anon_encoded, y_train)\n",
|
||||
|
|
@ -698,14 +332,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 98,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.6471155701331275\n"
|
||||
"1.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -734,14 +368,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 69,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.6982442600810341\n"
|
||||
"0.5245996527107852\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -765,15 +399,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 87,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(0.33056202194878614, 0.2888695146759663)\n",
|
||||
"(0.34112301200908796, 0.3054344667247893)\n"
|
||||
"(0.49415432579890883, 0.48976438779451525)\n",
|
||||
"(0.49415432579890883, 0.48976438779451525)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -810,15 +444,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 88,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(0.6457357075913777, 0.2002324905550712)\n",
|
||||
"(0.6472248353715898, 0.1999418773612322)\n"
|
||||
"(1.0, 0.019204655674102813)\n",
|
||||
"(0.9829787234042553, 0.04481086323957323)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -849,26 +483,24 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 74,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"anonymizer2 = Anonymize(1000, QI, categorical_features=categorical_features)\n",
|
||||
"anon2 = anonymizer2.anonymize(x_train, x_train_predictions)"
|
||||
"anonymizer2 = Anonymize(1000, QI_indexes, categorical_features=categorical_features_indexes)\n",
|
||||
"anon2 = anonymizer2.anonymize(ArrayDataset(x_train, x_train_predictions))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 75,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"4226"
|
||||
]
|
||||
"text/plain": "4226"
|
||||
},
|
||||
"execution_count": 75,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -887,7 +519,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 104,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -900,7 +532,7 @@
|
|||
],
|
||||
"source": [
|
||||
"anon2_str = anon2.astype(str)\n",
|
||||
"anon2_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon2_str)\n",
|
||||
"anon2_encoded = OneHotEncoder(sparse=False).fit_transform(anon2_str)\n",
|
||||
"\n",
|
||||
"anon2_model = DecisionTreeClassifier()\n",
|
||||
"anon2_model.fit(anon2_encoded, y_train)\n",
|
||||
|
|
@ -920,14 +552,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 105,
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.6266640941539648\n"
|
||||
"1.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -956,14 +588,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 106,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.6944819602546788\n"
|
||||
"0.515820953115956\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -980,17 +612,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 107,
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(0.35793357933579334, 0.17037470725995316)\n",
|
||||
"(0.3360655737704918, 0.1680327868852459)\n",
|
||||
"(0.6457357075913777, 0.2002324905550712)\n",
|
||||
"(0.6327519379844961, 0.1897704155768672)\n"
|
||||
"(0.49415432579890883, 0.48976438779451525)\n",
|
||||
"(0.49415432579890883, 0.48976438779451525)\n",
|
||||
"(1.0, 0.019204655674102813)\n",
|
||||
"(1.0, 0.026382153249272552)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -1023,27 +655,26 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 111,
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"QI2 = [\"parents\", \"has_nurs\", \"form\", \"children\", \"housing\", \"finance\", \"social\", \"health\"]\n",
|
||||
"anonymizer3 = Anonymize(100, QI2, categorical_features=categorical_features)\n",
|
||||
"anon3 = anonymizer3.anonymize(x_train, x_train_predictions)"
|
||||
"QI2_indexes = [i for i, v in enumerate(features) if v in QI2]\n",
|
||||
"anonymizer3 = Anonymize(100, QI2_indexes, categorical_features=categorical_features_indexes)\n",
|
||||
"anon3 = anonymizer3.anonymize(ArrayDataset(x_train, x_train_predictions))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 112,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"39"
|
||||
]
|
||||
"text/plain": "39"
|
||||
},
|
||||
"execution_count": 112,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -1055,22 +686,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 113,
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Anonymized model accuracy: 0.7723765432098766\n",
|
||||
"BB attack accuracy: 0.5792012348060969\n",
|
||||
"WB attack accuracy: 0.6680493922438742\n"
|
||||
"Anonymized model accuracy: 0.751929012345679\n",
|
||||
"BB attack accuracy: 1.0\n",
|
||||
"WB attack accuracy: 0.5187150299054601\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"anon3_str = anon3.astype(str)\n",
|
||||
"anon3_encoded = OneHotEncoder(sparse=False, drop='if_binary').fit_transform(anon3_str)\n",
|
||||
"anon3_encoded = OneHotEncoder(sparse=False).fit_transform(anon3_str)\n",
|
||||
"\n",
|
||||
"anon3_model = DecisionTreeClassifier()\n",
|
||||
"anon3_model.fit(anon3_encoded, y_train)\n",
|
||||
|
|
@ -1105,17 +736,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 114,
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(0.35793357933579334, 0.17037470725995316)\n",
|
||||
"(0.3393939393939394, 0.13114754098360656)\n",
|
||||
"(0.6457357075913777, 0.2002324905550712)\n",
|
||||
"(1, 0.0)\n"
|
||||
"(0.49415432579890883, 0.48976438779451525)\n",
|
||||
"(0.49415432579890883, 0.48976438779451525)\n",
|
||||
"(1.0, 0.019204655674102813)\n",
|
||||
"(1.0, 0.032201745877788554)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -1162,4 +793,4 @@
|
|||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
@ -29,7 +29,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 97,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -44,6 +44,18 @@
|
|||
" [ 26. 11. 0. 0. 48.]\n",
|
||||
" [ 27. 9. 0. 0. 40.]]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_85828/3975777015.py:22: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" y_train = y_train.astype(np.int)\n",
|
||||
"/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_85828/3975777015.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" y_test = y_test.astype(np.int)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
|
|
@ -90,14 +102,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 116,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Base model accuracy: 0.8075056814691972\n"
|
||||
"Base model accuracy: 0.8074442601805786\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -126,9 +138,18 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 124,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
|
||||
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from art.attacks.inference.membership_inference import MembershipInferenceBlackBox\n",
|
||||
"\n",
|
||||
|
|
@ -154,14 +175,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 125,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.5440363591696352\n"
|
||||
"0.545264709495148\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -197,7 +218,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 128,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -215,6 +236,7 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"from apt.utils.datasets import ArrayDataset\n",
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"sys.path.insert(0, os.path.abspath('..'))\n",
|
||||
|
|
@ -223,22 +245,20 @@
|
|||
"# QI = (age, education-num, capital-gain, hours-per-week)\n",
|
||||
"QI = [0, 1, 2, 4]\n",
|
||||
"anonymizer = Anonymize(100, QI)\n",
|
||||
"anon = anonymizer.anonymize(x_train, x_train_predictions)\n",
|
||||
"anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
|
||||
"print(anon)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 104,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"6739"
|
||||
]
|
||||
"text/plain": "6739"
|
||||
},
|
||||
"execution_count": 104,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -250,16 +270,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 129,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"658"
|
||||
]
|
||||
"text/plain": "658"
|
||||
},
|
||||
"execution_count": 129,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -278,14 +296,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 130,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Anonymized model accuracy: 0.8304158221239482\n"
|
||||
"Anonymized model accuracy: 0.83078434985566\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -308,14 +326,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 131,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
|
||||
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.5034393809114359\n"
|
||||
"0.5047291487532244\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -345,15 +371,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 132,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(0.5298924372550654, 0.7806166318634075)\n",
|
||||
"(0.5030507735890172, 0.5671293452892765)\n"
|
||||
"(0.5312420517168291, 0.7696843139663432)\n",
|
||||
"(0.5048372911169745, 0.4935511607910576)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -419,4 +445,4 @@
|
|||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
@ -29,7 +29,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 121,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -50,7 +50,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 122,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -86,14 +86,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 123,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.4954954954954955\n"
|
||||
"0.527027027027027\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -131,7 +131,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 124,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -141,6 +141,22 @@
|
|||
"unique rows in original data: 221\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
|
||||
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
|
||||
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
|
||||
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
|
||||
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
|
||||
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
|
||||
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
|
||||
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n",
|
||||
"/Users/olasaadi/PycharmProjects/ai-privacy-toolkit-internal/venv/lib/python3.8/site-packages/art/attacks/inference/membership_inference/black_box.py:262: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
|
||||
" self.attack_model.fit(np.c_[x_1, x_2], y_ready) # type: ignore\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
|
|
@ -148,11 +164,12 @@
|
|||
"k values: [5, 10, 20, 50, 75]\n",
|
||||
"unique rows: [34, 19, 8, 4, 2]\n",
|
||||
"model accuracy: [0.43165832354998956, 0.4509641063206041, -1.730181929385853, -5.577098823982753e+27, -1.2751609045828272e+25]\n",
|
||||
"attack accuracy: [0.5, 0.47297297297297297, 0.49549549549549543, 0.5, 0.47297297297297297]\n"
|
||||
"attack accuracy: [0.509009009009009, 0.481981981981982, 0.509009009009009, 0.5045045045045045, 0.4954954954954955]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from apt.utils.datasets import ArrayDataset\n",
|
||||
"from apt.anonymization import Anonymize\n",
|
||||
"k_values=[5, 10, 20, 50, 75]\n",
|
||||
"model_accuracy = []\n",
|
||||
|
|
@ -165,7 +182,7 @@
|
|||
"\n",
|
||||
"for k in k_values:\n",
|
||||
" anonymizer = Anonymize(k, QI, is_regression=True)\n",
|
||||
" anon = anonymizer.anonymize(X_train, x_train_predictions)\n",
|
||||
" anon = anonymizer.anonymize(ArrayDataset(X_train, x_train_predictions))\n",
|
||||
" unique_values.append(len(np.unique(anon, axis=0)))\n",
|
||||
" \n",
|
||||
" anon_model = LinearRegression()\n",
|
||||
|
|
@ -198,7 +215,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 124,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -42,6 +42,18 @@
|
|||
" [2.2000e+01 9.0000e+00 0.0000e+00 0.0000e+00 2.0000e+01]\n",
|
||||
" [5.2000e+01 9.0000e+00 1.5024e+04 0.0000e+00 4.0000e+01]]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:22: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" y_train = y_train.astype(np.int)\n",
|
||||
"/var/folders/9b/qbtw28w53355cvpjs4qn83yc0000gn/T/ipykernel_13726/1357868359.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
|
||||
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
|
||||
" y_test = y_test.astype(np.int)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
|
|
@ -84,24 +96,27 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Base model accuracy: 0.8189914624408821\n"
|
||||
"Base model accuracy: 0.8183158282660771\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from apt.utils.datasets import ArrayDataset\n",
|
||||
"from apt.utils.models import SklearnClassifier, ModelOutputType\n",
|
||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||
"\n",
|
||||
"model = DecisionTreeClassifier()\n",
|
||||
"model.fit(x_train, y_train)\n",
|
||||
"base_est = DecisionTreeClassifier()\n",
|
||||
"model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)\n",
|
||||
"model.fit(ArrayDataset(x_train, y_train))\n",
|
||||
"\n",
|
||||
"print('Base model accuracy: ', model.score(x_test, y_test))"
|
||||
"print('Base model accuracy: ', model.score(ArrayDataset(x_test, y_test)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -114,26 +129,26 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
|
||||
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
|
||||
"Improving accuracy\n",
|
||||
"feature to remove: 0\n",
|
||||
"Removed feature: 0, new relative accuracy: 0.939867\n",
|
||||
"feature to remove: 4\n",
|
||||
"Removed feature: 4, new relative accuracy: 0.967247\n",
|
||||
"feature to remove: 2\n",
|
||||
"Removed feature: 2, new relative accuracy: 0.972620\n",
|
||||
"Removed feature: 2, new relative accuracy: 0.935261\n",
|
||||
"feature to remove: 4\n",
|
||||
"Removed feature: 4, new relative accuracy: 0.946776\n",
|
||||
"feature to remove: 0\n",
|
||||
"Removed feature: 0, new relative accuracy: 0.972876\n",
|
||||
"feature to remove: 1\n",
|
||||
"Removed feature: 1, new relative accuracy: 0.992323\n",
|
||||
"Removed feature: 1, new relative accuracy: 0.992835\n",
|
||||
"feature to remove: 3\n",
|
||||
"Removed feature: 3, new relative accuracy: 1.000000\n",
|
||||
"Accuracy on minimized data: 0.8237371411024106\n"
|
||||
"Accuracy on minimized data: 0.8231229847996315\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -155,10 +170,12 @@
|
|||
"X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(x_test, y_test, stratify=y_test,\n",
|
||||
" test_size = 0.4, random_state = 38)\n",
|
||||
"x_train_predictions = model.predict(X_generalizer_train)\n",
|
||||
"minimizer.fit(X_generalizer_train, x_train_predictions)\n",
|
||||
"transformed = minimizer.transform(x_test)\n",
|
||||
"if x_train_predictions.shape[1] > 1:\n",
|
||||
" x_train_predictions = np.argmax(x_train_predictions, axis=1)\n",
|
||||
"minimizer.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
|
||||
"transformed = minimizer.transform(dataset=ArrayDataset(x_test))\n",
|
||||
"\n",
|
||||
"print('Accuracy on minimized data: ', model.score(transformed, y_test))"
|
||||
"print('Accuracy on minimized data: ', model.score(ArrayDataset(transformed, y_test)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -170,14 +187,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'ranges': {}, 'untouched': [0, 1, 2, 3, 4]}\n"
|
||||
"{'ranges': {}, 'categories': {}, 'untouched': ['4', '1', '3', '0', '2']}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -197,25 +214,25 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.929376\n",
|
||||
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.936540\n",
|
||||
"Improving accuracy\n",
|
||||
"feature to remove: 0\n",
|
||||
"Removed feature: 0, new relative accuracy: 0.939867\n",
|
||||
"feature to remove: 4\n",
|
||||
"Removed feature: 4, new relative accuracy: 0.967247\n",
|
||||
"feature to remove: 2\n",
|
||||
"Removed feature: 2, new relative accuracy: 0.972620\n",
|
||||
"Removed feature: 2, new relative accuracy: 0.935261\n",
|
||||
"feature to remove: 4\n",
|
||||
"Removed feature: 4, new relative accuracy: 0.946776\n",
|
||||
"feature to remove: 0\n",
|
||||
"Removed feature: 0, new relative accuracy: 0.972876\n",
|
||||
"feature to remove: 1\n",
|
||||
"Removed feature: 1, new relative accuracy: 0.992323\n",
|
||||
"Accuracy on minimized data: 0.820205742361431\n",
|
||||
"{'ranges': {3: [546.0, 704.0, 705.5, 742.5, 782.0, 834.0, 870.0, 1446.5, 1538.5, 1612.5, 1699.0, 1744.0, 1801.0, 1814.0, 1846.0, 1881.5, 1978.5, 2248.0, 2298.5, 2537.5]}, 'untouched': [0, 1, 2, 4]}\n"
|
||||
"Removed feature: 1, new relative accuracy: 0.992835\n",
|
||||
"Accuracy on minimized data: 0.8192845079072624\n",
|
||||
"{'ranges': {'3': [569.0, 782.0, 870.0, 870.5, 938.0, 1016.5, 1311.5, 1457.0, 1494.5, 1596.0, 1629.5, 1684.0, 1805.0, 1859.0, 1867.5, 1881.5, 1938.0, 1978.5, 2119.0, 2210.0, 2218.0, 2244.5, 2298.5, 2443.5]}, 'categories': {}, 'untouched': ['2', '1', '0', '4']}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -223,9 +240,9 @@
|
|||
"# We allow a 1% deviation in accuracy from the original model accuracy\n",
|
||||
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.99)\n",
|
||||
"\n",
|
||||
"minimizer2.fit(X_generalizer_train, x_train_predictions)\n",
|
||||
"transformed2 = minimizer2.transform(x_test)\n",
|
||||
"print('Accuracy on minimized data: ', model.score(transformed2, y_test))\n",
|
||||
"minimizer2.fit(dataset=ArrayDataset(X_generalizer_train, x_train_predictions))\n",
|
||||
"transformed2 = minimizer2.transform(dataset=ArrayDataset(x_test))\n",
|
||||
"print('Accuracy on minimized data: ', model.score(test_data=ArrayDataset(transformed2, y_test)))\n",
|
||||
"generalizations2 = minimizer2.generalizations\n",
|
||||
"print(generalizations2)"
|
||||
]
|
||||
|
|
@ -259,4 +276,4 @@
|
|||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
|
@ -2,6 +2,7 @@ numpy==1.21.0
|
|||
pandas==1.1.0
|
||||
scipy==1.4.1
|
||||
scikit-learn==0.22.2
|
||||
adversarial-robustness-toolkit>=1.9.1
|
||||
|
||||
# testing
|
||||
pytest==5.4.2
|
||||
|
|
|
|||
|
|
@ -7,13 +7,15 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
|||
from sklearn.preprocessing import OneHotEncoder
|
||||
|
||||
from apt.anonymization import Anonymize
|
||||
from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
|
||||
from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
|
||||
from sklearn.datasets import load_diabetes
|
||||
from sklearn.model_selection import train_test_split
|
||||
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
|
||||
|
||||
|
||||
def test_anonymize_ndarray_iris():
|
||||
(x_train, y_train), _ = get_iris_dataset()
|
||||
|
||||
model = DecisionTreeClassifier()
|
||||
model.fit(x_train, y_train)
|
||||
pred = model.predict(x_train)
|
||||
|
|
@ -21,7 +23,7 @@ def test_anonymize_ndarray_iris():
|
|||
k = 10
|
||||
QI = [0, 2]
|
||||
anonymizer = Anonymize(k, QI, train_only_QI=True)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
|
||||
assert(len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
|
||||
_, counts_elements = np.unique(anon[:, QI], return_counts=True)
|
||||
assert (np.min(counts_elements) >= k)
|
||||
|
|
@ -30,10 +32,14 @@ def test_anonymize_ndarray_iris():
|
|||
|
||||
def test_anonymize_pandas_adult():
|
||||
(x_train, y_train), _ = get_adult_dataset()
|
||||
encoded = OneHotEncoder().fit_transform(x_train)
|
||||
model = DecisionTreeClassifier()
|
||||
model.fit(encoded, y_train)
|
||||
pred = model.predict(encoded)
|
||||
|
||||
k = 100
|
||||
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation',
|
||||
'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
|
||||
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
|
||||
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
|
||||
QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
|
||||
'native-country']
|
||||
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
|
||||
|
|
@ -56,12 +62,11 @@ def test_anonymize_pandas_adult():
|
|||
pred = model.predict(encoded)
|
||||
|
||||
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred, features))
|
||||
|
||||
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
|
||||
assert (anon.loc[:, QI].value_counts().min() >= k)
|
||||
assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
|
||||
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
|
||||
|
||||
def test_anonymize_pandas_nursery():
|
||||
(x_train, y_train), _ = get_nursery_dataset()
|
||||
|
|
@ -89,11 +94,11 @@ def test_anonymize_pandas_nursery():
|
|||
pred = model.predict(encoded)
|
||||
|
||||
anonymizer = Anonymize(k, QI, categorical_features=categorical_features, train_only_QI=True)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
|
||||
|
||||
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
|
||||
assert (anon.loc[:, QI].value_counts().min() >= k)
|
||||
assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
|
||||
|
||||
|
||||
def test_regression():
|
||||
|
|
@ -107,7 +112,7 @@ def test_regression():
|
|||
k = 10
|
||||
QI = [0, 2, 5, 8]
|
||||
anonymizer = Anonymize(k, QI, is_regression=True, train_only_QI=True)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
|
||||
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
|
||||
model.fit(anon, y_train)
|
||||
print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test))
|
||||
|
|
@ -127,7 +132,7 @@ def test_errors():
|
|||
anonymizer = Anonymize(10, [0, 2])
|
||||
(x_train, y_train), (x_test, y_test) = get_iris_dataset()
|
||||
with pytest.raises(ValueError):
|
||||
anonymizer.anonymize(x_train, y_test)
|
||||
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
|
||||
(x_train, y_train), _ = get_adult_dataset()
|
||||
with pytest.raises(ValueError):
|
||||
anonymizer.anonymize(x_train, y_train)
|
||||
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
|
||||
|
|
|
|||
|
|
@ -5,14 +5,15 @@ from sklearn.compose import ColumnTransformer
|
|||
|
||||
from sklearn.datasets import load_boston, load_diabetes
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
|
||||
from apt.minimization import GeneralizeToRepresentative
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset
|
||||
from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset
|
||||
from apt.utils.datasets import ArrayDataset
|
||||
from apt.utils.models import SklearnClassifier, ModelOutputType, SklearnRegressor
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
@ -38,11 +39,12 @@ def test_minimizer_params(data):
|
|||
y = [1, 1, 0]
|
||||
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
base_est.fit(X, y)
|
||||
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
|
||||
model.fit(ArrayDataset(X, y))
|
||||
|
||||
gen = GeneralizeToRepresentative(base_est, features=features, cells=cells)
|
||||
gen = GeneralizeToRepresentative(model, cells=cells)
|
||||
gen.fit()
|
||||
transformed = gen.transform(X)
|
||||
transformed = gen.transform(dataset=ArrayDataset(X, features_names=features))
|
||||
|
||||
|
||||
def test_minimizer_fit(data):
|
||||
|
|
@ -58,15 +60,20 @@ def test_minimizer_fit(data):
|
|||
[69, 175],
|
||||
[24, 181],
|
||||
[18, 190]])
|
||||
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
|
||||
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
|
||||
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
base_est.fit(X, y)
|
||||
predictions = base_est.predict(X)
|
||||
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
|
||||
model.fit(ArrayDataset(X, y))
|
||||
predictions = model.predict(X)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5)
|
||||
gen.fit(X, predictions)
|
||||
transformed = gen.transform(X)
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5)
|
||||
train_dataset = ArrayDataset(X, predictions, features_names=features)
|
||||
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ArrayDataset(X))
|
||||
gener = gen.generalizations_
|
||||
expexted_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']}
|
||||
|
||||
|
|
@ -103,7 +110,7 @@ def test_minimizer_fit_pandas(data):
|
|||
[69, 175, 'm', 'aa'],
|
||||
[24, 181, 'm', 'bb'],
|
||||
[18, 190, 'm', 'bb']]
|
||||
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
|
||||
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
|
||||
X = pd.DataFrame(X, columns=features)
|
||||
|
||||
numeric_features = ["age", "height"]
|
||||
|
|
@ -121,16 +128,22 @@ def test_minimizer_fit_pandas(data):
|
|||
]
|
||||
)
|
||||
encoded = preprocessor.fit_transform(X)
|
||||
encoded = pd.DataFrame(encoded)
|
||||
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
base_est.fit(encoded, y)
|
||||
predictions = base_est.predict(encoded)
|
||||
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
|
||||
model.fit(ArrayDataset(encoded, y))
|
||||
predictions = model.predict(encoded)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
# Append classifier to preprocessing pipeline.
|
||||
# Now we have a full prediction pipeline.
|
||||
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
|
||||
categorical_features=categorical_features)
|
||||
gen.fit(X, predictions)
|
||||
transformed = gen.transform(X)
|
||||
train_dataset = ArrayDataset(X, predictions)
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ArrayDataset(X))
|
||||
gener = gen.generalizations_
|
||||
expexted_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['ola', 'height', 'sex']}
|
||||
|
||||
|
|
@ -143,7 +156,7 @@ def test_minimizer_fit_pandas(data):
|
|||
modified_features = [f for f in features if
|
||||
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
|
||||
'ranges'].keys()]
|
||||
assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
|
||||
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
|
||||
ncp = gen.ncp_
|
||||
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
|
||||
assert (ncp > 0)
|
||||
|
|
@ -179,7 +192,7 @@ def test_minimizer_params_categorical(data):
|
|||
[24, 181, 'm'],
|
||||
[18, 190, 'm']]
|
||||
|
||||
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
|
||||
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
|
||||
X = pd.DataFrame(X, columns=features)
|
||||
numeric_features = ["age", "height"]
|
||||
numeric_transformer = Pipeline(
|
||||
|
|
@ -196,16 +209,21 @@ def test_minimizer_params_categorical(data):
|
|||
]
|
||||
)
|
||||
encoded = preprocessor.fit_transform(X)
|
||||
encoded = pd.DataFrame(encoded)
|
||||
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
base_est.fit(encoded, y)
|
||||
predictions = base_est.predict(encoded)
|
||||
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
|
||||
model.fit(ArrayDataset(encoded, y))
|
||||
predictions = model.predict(encoded)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
# Append classifier to preprocessing pipeline.
|
||||
# Now we have a full prediction pipeline.
|
||||
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
|
||||
categorical_features=categorical_features, cells=cells)
|
||||
gen.fit(X, predictions)
|
||||
transformed = gen.transform(X)
|
||||
train_dataset = ArrayDataset(X, predictions)
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ArrayDataset(X))
|
||||
|
||||
|
||||
def test_minimizer_fit_QI(data):
|
||||
|
|
@ -222,16 +240,20 @@ def test_minimizer_fit_QI(data):
|
|||
[24, 181, 95],
|
||||
[18, 190, 102]])
|
||||
print(X)
|
||||
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
|
||||
QI = [0, 2]
|
||||
y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
|
||||
QI = ['age', 'weight']
|
||||
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
base_est.fit(X, y)
|
||||
predictions = base_est.predict(X)
|
||||
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
|
||||
model.fit(ArrayDataset(X, y))
|
||||
predictions = model.predict(X)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5, features_to_minimize=QI)
|
||||
gen.fit(X, predictions)
|
||||
transformed = gen.transform(X)
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
|
||||
train_dataset = ArrayDataset(X, predictions, features_names=features)
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ArrayDataset(X))
|
||||
gener = gen.generalizations_
|
||||
expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
|
||||
for key in expexted_generalizations['ranges']:
|
||||
|
|
@ -240,7 +262,7 @@ def test_minimizer_fit_QI(data):
|
|||
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
|
||||
set([frozenset(sl) for sl in gener['categories'][key]]))
|
||||
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
|
||||
assert ((np.delete(transformed, QI, axis=1) == np.delete(X, QI, axis=1)).all())
|
||||
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
|
||||
modified_features = [f for f in features if
|
||||
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
|
||||
'ranges'].keys()]
|
||||
|
|
@ -269,7 +291,7 @@ def test_minimizer_fit_pandas_QI(data):
|
|||
[24, 181, 49, 'm', 'bb'],
|
||||
[18, 190, 69, 'm', 'bb']]
|
||||
|
||||
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
|
||||
y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
|
||||
X = pd.DataFrame(X, columns=features)
|
||||
QI = ['age', 'weight', 'ola']
|
||||
|
||||
|
|
@ -288,16 +310,22 @@ def test_minimizer_fit_pandas_QI(data):
|
|||
]
|
||||
)
|
||||
encoded = preprocessor.fit_transform(X)
|
||||
encoded = pd.DataFrame(encoded)
|
||||
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
base_est.fit(encoded, y)
|
||||
predictions = base_est.predict(encoded)
|
||||
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
|
||||
model.fit(ArrayDataset(encoded, y))
|
||||
predictions = model.predict(encoded)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
# Append classifier to preprocessing pipeline.
|
||||
# Now we have a full prediction pipeline.
|
||||
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
|
||||
categorical_features=categorical_features, features_to_minimize=QI)
|
||||
gen.fit(X, predictions)
|
||||
transformed = gen.transform(X)
|
||||
train_dataset = ArrayDataset(X, predictions)
|
||||
gen.fit(dataset=train_dataset)
|
||||
transformed = gen.transform(dataset=ArrayDataset(X))
|
||||
gener = gen.generalizations_
|
||||
expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
|
||||
'untouched': ['height', 'sex']}
|
||||
|
|
@ -308,12 +336,13 @@ def test_minimizer_fit_pandas_QI(data):
|
|||
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
|
||||
set([frozenset(sl) for sl in gener['categories'][key]]))
|
||||
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
|
||||
assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
|
||||
|
||||
# assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
|
||||
np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
|
||||
modified_features = [f for f in features if
|
||||
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
|
||||
'ranges'].keys()]
|
||||
assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
|
||||
# assert (transformed.drop(modified_features, axis=1).equals(X.drop(modified_features, axis=1)))
|
||||
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
|
||||
ncp = gen.ncp_
|
||||
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
|
||||
assert (ncp > 0)
|
||||
|
|
@ -322,16 +351,19 @@ def test_minimizer_fit_pandas_QI(data):
|
|||
|
||||
def test_minimize_ndarray_iris():
|
||||
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
|
||||
(x_train, y_train), _ = get_iris_dataset()
|
||||
QI = [0, 2]
|
||||
model = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
model.fit(x_train, y_train)
|
||||
pred = model.predict(x_train)
|
||||
(x_train, y_train), (x_test, y_test) = get_iris_dataset()
|
||||
QI = ['sepal length (cm)', 'petal length (cm)']
|
||||
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
|
||||
model.fit(ArrayDataset(x_train, y_train))
|
||||
predictions = model.predict(x_train)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features=features, features_to_minimize=QI)
|
||||
gen.fit(x_train, pred)
|
||||
transformed = gen.transform(x_train)
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features_to_minimize=QI)
|
||||
# gen.fit(dataset=ArrayDataset(x_train, predictions))
|
||||
transformed = gen.fit_transform(dataset=ArrayDataset(x_train, predictions, features_names=features))
|
||||
gener = gen.generalizations_
|
||||
expexted_generalizations = {'ranges': {'sepal length (cm)': [], 'petal length (cm)': [2.449999988079071]},
|
||||
'categories': {}, 'untouched': ['petal width (cm)', 'sepal width (cm)']}
|
||||
|
|
@ -342,7 +374,7 @@ def test_minimize_ndarray_iris():
|
|||
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
|
||||
set([frozenset(sl) for sl in gener['categories'][key]]))
|
||||
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
|
||||
assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
|
||||
assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(x_train, [0, 2], axis=1)).all())
|
||||
|
||||
modified_features = [f for f in features if
|
||||
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
|
||||
|
|
@ -359,12 +391,13 @@ def test_minimize_ndarray_iris():
|
|||
|
||||
|
||||
def test_minimize_pandas_adult():
|
||||
(x_train, y_train), _ = get_adult_dataset()
|
||||
(x_train, y_train), (x_test, y_test) = get_adult_dataset()
|
||||
x_train = x_train.head(1000)
|
||||
y_train = y_train.head(1000)
|
||||
|
||||
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
|
||||
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
|
||||
x_train = pd.DataFrame(x_train, columns=features)
|
||||
|
||||
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
|
||||
'hours-per-week', 'native-country']
|
||||
|
|
@ -384,15 +417,19 @@ def test_minimize_pandas_adult():
|
|||
]
|
||||
)
|
||||
encoded = preprocessor.fit_transform(x_train)
|
||||
encoded = pd.DataFrame(encoded)
|
||||
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
base_est.fit(encoded, y_train)
|
||||
predictions = base_est.predict(encoded)
|
||||
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
|
||||
model.fit(ArrayDataset(encoded, y_train))
|
||||
predictions = model.predict(encoded)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features,
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
|
||||
categorical_features=categorical_features, features_to_minimize=QI)
|
||||
gen.fit(x_train, predictions)
|
||||
transformed = gen.transform(x_train)
|
||||
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
|
||||
transformed = gen.transform(dataset=ArrayDataset(x_train))
|
||||
gener = gen.generalizations_
|
||||
expexted_generalizations = {'ranges': {'age': [], 'education-num': []}, 'categories': {
|
||||
'workclass': [['Self-emp-not-inc', 'Private', 'Federal-gov', 'Self-emp-inc', '?', 'Local-gov', 'State-gov']],
|
||||
|
|
@ -414,12 +451,14 @@ def test_minimize_pandas_adult():
|
|||
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
|
||||
set([frozenset(sl) for sl in gener['categories'][key]]))
|
||||
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
|
||||
assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
# assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))
|
||||
|
||||
modified_features = [f for f in features if
|
||||
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
|
||||
'ranges'].keys()]
|
||||
assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
|
||||
# assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
|
||||
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
|
||||
ncp = gen.ncp_
|
||||
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
|
||||
assert (ncp > 0)
|
||||
|
|
@ -451,15 +490,19 @@ def test_german_credit_pandas():
|
|||
]
|
||||
)
|
||||
encoded = preprocessor.fit_transform(x_train)
|
||||
encoded = pd.DataFrame(encoded)
|
||||
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
|
||||
min_samples_leaf=1)
|
||||
base_est.fit(encoded, y_train)
|
||||
predictions = base_est.predict(encoded)
|
||||
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
|
||||
model.fit(ArrayDataset(encoded, y_train))
|
||||
predictions = model.predict(encoded)
|
||||
if predictions.shape[1] > 1:
|
||||
predictions = np.argmax(predictions, axis=1)
|
||||
|
||||
gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features,
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.7,
|
||||
categorical_features=categorical_features, features_to_minimize=QI)
|
||||
gen.fit(x_train, predictions)
|
||||
transformed = gen.transform(x_train)
|
||||
gen.fit(dataset=ArrayDataset(x_train, predictions))
|
||||
transformed = gen.transform(dataset=ArrayDataset(x_train))
|
||||
gener = gen.generalizations_
|
||||
expexted_generalizations = {'ranges': {'Duration_in_month': [31.5]},
|
||||
'categories': {'Credit_history': [['A30', 'A32', 'A31', 'A34', 'A33']], 'Purpose': [
|
||||
|
|
@ -481,12 +524,14 @@ def test_german_credit_pandas():
|
|||
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
|
||||
set([frozenset(sl) for sl in gener['categories'][key]]))
|
||||
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
|
||||
assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
# assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
|
||||
np.testing.assert_array_equal(transformed.drop(QI, axis=1), x_train.drop(QI, axis=1))
|
||||
|
||||
modified_features = [f for f in features if
|
||||
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
|
||||
'ranges'].keys()]
|
||||
assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
|
||||
# assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
|
||||
np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), x_train.drop(modified_features, axis=1))
|
||||
ncp = gen.ncp_
|
||||
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
|
||||
assert (ncp > 0)
|
||||
|
|
@ -497,17 +542,258 @@ def test_regression():
|
|||
dataset = load_diabetes()
|
||||
x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)
|
||||
|
||||
model = DecisionTreeRegressor(random_state=10, min_samples_split=2)
|
||||
model.fit(x_train, y_train)
|
||||
pred = model.predict(x_train)
|
||||
QI = [0, 2, 5, 8]
|
||||
base_est = DecisionTreeRegressor(random_state=10, min_samples_split=2)
|
||||
model = SklearnRegressor(base_est)
|
||||
model.fit(ArrayDataset(x_train, y_train))
|
||||
predictions = model.predict(x_train)
|
||||
QI = ['age', 'bmi', 's2', 's5']
|
||||
features = ['age', 'sex', 'bmi', 'bp',
|
||||
's1', 's2', 's3', 's4', 's5', 's6']
|
||||
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, features=features, is_regression=True,
|
||||
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
|
||||
features_to_minimize=QI)
|
||||
gen.fit(x_train, pred)
|
||||
transformed = gen.transform(x_train)
|
||||
gen.fit(dataset=ArrayDataset(x_train, predictions, features_names=features))
|
||||
transformed = gen.transform(dataset=ArrayDataset(x_train, features_names=features))
|
||||
print('Base model accuracy (R2 score): ', model.score(ArrayDataset(x_test, y_test)))
|
||||
model.fit(ArrayDataset(transformed, y_train))
|
||||
print('Base model accuracy (R2 score) after anonymization: ', model.score(ArrayDataset(x_test, y_test)))
|
||||
gener = gen.generalizations_
|
||||
expexted_generalizations = {'ranges': {
|
||||
'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
|
||||
-0.04728874587453902, -0.04547247663140297, -0.04183994047343731, -0.027309784665703773,
|
||||
-0.023677248042076826, -0.020044708624482155, -0.01641217083670199, -0.001882016600575298,
|
||||
0.0017505218856967986, 0.0035667913616634905, 0.007199329789727926, 0.010831868276000023,
|
||||
0.02354575227946043, 0.030810829252004623, 0.03262709779664874, 0.03444336913526058,
|
||||
0.03625963814556599, 0.03807590529322624, 0.03807590715587139, 0.047157252207398415,
|
||||
0.06168740428984165, 0.0635036751627922, 0.06895248219370842, 0.07258502021431923, 0.07621755823493004,
|
||||
0.1034616008400917],
|
||||
'bmi': [-0.07626373693346977, -0.060635464265942574, -0.056863121688365936, -0.05578530766069889,
|
||||
-0.054168591275811195, -0.042312657460570335, -0.0374625027179718, -0.03422906715422869,
|
||||
-0.033690162003040314, -0.03261234890669584, -0.02614547684788704, -0.025067666545510292,
|
||||
-0.022373135201632977, -0.016984074376523495, -0.01375063881278038, -0.007822672137990594,
|
||||
-0.004589236050378531, 0.008344509289599955, 0.015889193629845977, 0.016967005096375942,
|
||||
0.024511689320206642, 0.0272062208969146, 0.030978563241660595, 0.032595280557870865,
|
||||
0.033673093654215336, 0.04391230642795563, 0.04552902653813362, 0.05469042807817459,
|
||||
0.06977979838848114, 0.07301323488354683, 0.09349166229367256],
|
||||
's2': [-0.1044962927699089, -0.08649025857448578, -0.07740895450115204, -0.07114598527550697,
|
||||
-0.06378699466586113, -0.05971606448292732, -0.04437179118394852, -0.0398311372846365,
|
||||
-0.03137612994760275, -0.022138250060379505, -0.018067320343106985, -0.017910746857523918,
|
||||
-0.017910745926201344, -0.01618842873722315, -0.007576846517622471, -0.007263698382303119,
|
||||
-0.0010007291566580534, 0.0010347360512241721, 0.006514834007248282, 0.00933317095041275,
|
||||
0.012464655097573996, 0.019197346206055954, 0.020919663831591606, 0.02217225730419159,
|
||||
0.032036433927714825, 0.036420512944459915, 0.04080459102988243, 0.04127431474626064,
|
||||
0.04268348217010498, 0.04424922354519367, 0.04424922540783882, 0.056462014093995094, 0.05928034894168377,
|
||||
0.061315815430134535, 0.06272498145699501, 0.06460387445986271]}, 'categories': {},
|
||||
'untouched': ['s5', 's3', 'bp', 's1', 'sex', 's6', 's4']}
|
||||
|
||||
for key in expexted_generalizations['ranges']:
|
||||
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
|
||||
for key in expexted_generalizations['categories']:
|
||||
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
|
||||
set([frozenset(sl) for sl in gener['categories'][key]]))
|
||||
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
|
||||
assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())
|
||||
|
||||
modified_features = [f for f in features if
|
||||
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
|
||||
'ranges'].keys()]
|
||||
indexes = []
|
||||
for i in range(len(features)):
|
||||
if features[i] in modified_features:
|
||||
indexes.append(i)
|
||||
assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
|
||||
ncp = gen.ncp_
|
||||
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
|
||||
assert (ncp > 0)
|
||||
assert (((transformed[indexes]) != (x_train[indexes])).any())
|
||||
|
||||
|
||||
def test_X_y(data):
    """Minimization with plain numpy X/y inputs and integer feature indexes.

    Trains a wrapped decision tree, minimizes the quasi-identifier (QI)
    features, then checks the produced generalizations and that only the
    generalized feature columns were modified.
    """
    features = [0, 1, 2]
    X = np.array([[23, 165, 70],
                  [45, 158, 67],
                  [56, 123, 65],
                  [67, 154, 90],
                  [45, 149, 67],
                  [42, 166, 58],
                  [73, 172, 68],
                  [94, 168, 69],
                  [69, 175, 80],
                  [24, 181, 95],
                  [18, 190, 102]])
    y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    QI = [0, 2]
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
    model.fit(ArrayDataset(X, y))
    predictions = model.predict(X)
    # Probability vectors (one column per class) are reduced to class labels.
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

    gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
    gen.fit(X=X, y=predictions)
    transformed = gen.transform(X)
    gener = gen.generalizations_
    expected_generalizations = {'ranges': {'0': [], '2': [67.5]}, 'categories': {}, 'untouched': ['1']}
    for key in expected_generalizations['ranges']:
        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
    for key in expected_generalizations['categories']:
        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    # Columns outside the QI set must be left untouched (QI == [0, 2]).
    assert ((np.delete(transformed, QI, axis=1) == np.delete(X, QI, axis=1)).all())
    # Feature indexes are ints but generalization keys are strings, hence str(f).
    modified_features = [f for f in features if
                         str(f) in expected_generalizations['categories'].keys() or
                         str(f) in expected_generalizations['ranges'].keys()]
    indexes = [i for i, f in enumerate(features) if f in modified_features]
    # Columns outside the generalized set must be identical to the input.
    assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
    ncp = gen.ncp_
    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        # BUG FIX: the original asserted transformed[indexes] != X[indexes], which
        # selects *rows* (NumPy integer indexing operates on axis 0); the intent
        # is that the generalized feature *columns* actually changed.
        assert ((transformed[:, indexes] != X[:, indexes]).any())
|
||||
|
||||
|
||||
def test_X_y_features_names(data):
    """Minimization with numpy X/y inputs where features/QI are given by name.

    Same scenario as test_X_y but passing `features_names` to fit/transform and
    naming the quasi-identifiers instead of using column indexes.
    """
    features = ['age', 'height', 'weight']
    X = np.array([[23, 165, 70],
                  [45, 158, 67],
                  [56, 123, 65],
                  [67, 154, 90],
                  [45, 149, 67],
                  [42, 166, 58],
                  [73, 172, 68],
                  [94, 168, 69],
                  [69, 175, 80],
                  [24, 181, 95],
                  [18, 190, 102]])
    y = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    QI = ['age', 'weight']
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
    model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_VECTOR)
    model.fit(ArrayDataset(X, y))
    predictions = model.predict(X)
    # Probability vectors (one column per class) are reduced to class labels.
    if predictions.shape[1] > 1:
        predictions = np.argmax(predictions, axis=1)

    gen = GeneralizeToRepresentative(model, target_accuracy=0.5, features_to_minimize=QI)
    gen.fit(X=X, y=predictions, features_names=features)
    transformed = gen.transform(X=X, features_names=features)
    gener = gen.generalizations_
    expected_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
    for key in expected_generalizations['ranges']:
        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
    for key in expected_generalizations['categories']:
        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    # 'age' and 'weight' are columns 0 and 2; everything else stays untouched.
    assert ((np.delete(transformed, [0, 2], axis=1) == np.delete(X, [0, 2], axis=1)).all())
    modified_features = [f for f in features if
                         f in expected_generalizations['categories'].keys() or
                         f in expected_generalizations['ranges'].keys()]
    indexes = [i for i, f in enumerate(features) if f in modified_features]
    # Columns outside the generalized set must be identical to the input.
    assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
    ncp = gen.ncp_
    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        # BUG FIX: the original asserted transformed[indexes] != X[indexes], which
        # selects *rows*; the intent is that the generalized feature *columns*
        # actually changed.
        assert ((transformed[:, indexes] != X[:, indexes]).any())
|
||||
|
||||
|
||||
def test_BaseEstimator_classification(data):
    """Minimization driven by a raw (non-wrapped) sklearn classifier.

    The classifier is trained on a one-hot/imputed encoding of a mixed
    numeric + categorical DataFrame, while the minimizer operates on the
    original (unencoded) DataFrame.
    """
    features = ['age', 'height', 'weight', 'sex', 'ola']
    X = [[23, 165, 65, 'f', 'aa'],
         [45, 158, 76, 'f', 'aa'],
         [56, 123, 78, 'f', 'bb'],
         [67, 154, 87, 'm', 'aa'],
         [45, 149, 45, 'f', 'bb'],
         [42, 166, 76, 'm', 'bb'],
         [73, 172, 85, 'm', 'bb'],
         [94, 168, 92, 'f', 'aa'],
         [69, 175, 95, 'm', 'aa'],
         [24, 181, 49, 'm', 'bb'],
         [18, 190, 69, 'm', 'bb']]

    y = pd.Series([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
    X = pd.DataFrame(X, columns=features)
    QI = ['age', 'weight', 'ola']

    numeric_features = ["age", "height", "weight"]
    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
    )

    categorical_features = ["sex", "ola"]
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    encoded = preprocessor.fit_transform(X)
    encoded = pd.DataFrame(encoded)
    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
    model = base_est
    model.fit(encoded, y)
    predictions = model.predict(encoded)

    # The minimizer receives the raw base estimator (no wrapper) together with
    # the unencoded data; categorical features are declared explicitly.
    gen = GeneralizeToRepresentative(model, target_accuracy=0.5,
                                     categorical_features=categorical_features, features_to_minimize=QI)
    train_dataset = ArrayDataset(X, predictions)
    gen.fit(dataset=train_dataset)
    transformed = gen.transform(dataset=ArrayDataset(X))
    gener = gen.generalizations_
    expected_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
                                'untouched': ['height', 'sex']}

    for key in expected_generalizations['ranges']:
        assert (set(expected_generalizations['ranges'][key]) == set(gener['ranges'][key]))
    for key in expected_generalizations['categories']:
        assert (set([frozenset(sl) for sl in expected_generalizations['categories'][key]]) ==
                set([frozenset(sl) for sl in gener['categories'][key]]))
    assert (set(expected_generalizations['untouched']) == set(gener['untouched']))
    # Non-QI columns must be left untouched.
    np.testing.assert_array_equal(transformed.drop(QI, axis=1), X.drop(QI, axis=1))
    modified_features = [f for f in features if
                         f in expected_generalizations['categories'].keys() or
                         f in expected_generalizations['ranges'].keys()]
    # Columns outside the generalized set must be identical to the input.
    np.testing.assert_array_equal(transformed.drop(modified_features, axis=1), X.drop(modified_features, axis=1))
    ncp = gen.ncp_
    if len(expected_generalizations['ranges'].keys()) > 0 or len(expected_generalizations['categories'].keys()) > 0:
        assert (ncp > 0)
        # Generalized columns must actually have changed (idiomatic `not ...`
        # replaces the original `... == False` comparison).
        assert not transformed[modified_features].equals(X[modified_features])
|
||||
|
||||
|
||||
def test_BaseEstimator_regression():
    """Smoke test: minimize diabetes features using a raw sklearn regressor.

    Fits a decision-tree regressor, runs the minimizer over the named
    quasi-identifier features, and reports R2 before and after retraining on
    the minimized data.
    """
    diabetes = load_diabetes()
    x_train, x_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target,
                                                        test_size=0.5, random_state=14)

    model = DecisionTreeRegressor(random_state=10, min_samples_split=2)
    model.fit(x_train, y_train)
    train_preds = model.predict(x_train)

    quasi_identifiers = ['age', 'bmi', 's2', 's5']
    feature_names = ['age', 'sex', 'bmi', 'bp',
                     's1', 's2', 's3', 's4', 's5', 's6']

    minimizer = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
                                           features_to_minimize=quasi_identifiers)
    minimizer.fit(dataset=ArrayDataset(x_train, train_preds, features_names=feature_names))
    minimized = minimizer.transform(dataset=ArrayDataset(x_train, features_names=feature_names))

    print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
    model.fit(minimized, y_train)
    print('Base model accuracy (R2 score) after minimization: ', model.score(x_test, y_test))
|
||||
|
|
@ -546,7 +832,7 @@ def test_regression():
|
|||
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
|
||||
set([frozenset(sl) for sl in gener['categories'][key]]))
|
||||
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
|
||||
assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
|
||||
assert ((np.delete(transformed, [0, 2, 5, 8], axis=1) == np.delete(x_train, [0, 2, 5, 8], axis=1)).all())
|
||||
|
||||
modified_features = [f for f in features if
|
||||
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
|
||||
|
|
|
|||
35
tests/test_model.py
Normal file
35
tests/test_model.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
import pytest
|
||||
|
||||
from apt.utils.models import SklearnClassifier, SklearnRegressor, ModelOutputType
|
||||
from apt.utils.datasets import ArrayDataset
|
||||
from apt.utils import dataset_utils
|
||||
|
||||
from sklearn.tree import DecisionTreeRegressor
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
|
||||
|
||||
def test_sklearn_classifier():
    """SklearnClassifier wrapper: fit on iris, check prediction shape and score range."""
    (x_train, y_train), (x_test, y_test) = dataset_utils.get_iris_dataset()
    model = SklearnClassifier(RandomForestClassifier(), ModelOutputType.CLASSIFIER_VECTOR)
    model.fit(ArrayDataset(x_train, y_train))

    predictions = model.predict(x_test)
    # One prediction row per test sample.
    assert predictions.shape[0] == x_test.shape[0]

    accuracy = model.score(ArrayDataset(x_test, y_test))
    assert 0.0 <= accuracy <= 1.0
|
||||
|
||||
|
||||
def test_sklearn_regressor():
|
||||
(x_train, y_train), (x_test, y_test) = dataset_utils.get_diabetes_dataset()
|
||||
underlying_model = DecisionTreeRegressor()
|
||||
model = SklearnRegressor(underlying_model)
|
||||
train = ArrayDataset(x_train, y_train)
|
||||
test = ArrayDataset(x_test, y_test)
|
||||
model.fit(train)
|
||||
pred = model.predict(x_test)
|
||||
assert (pred.shape[0] == x_test.shape[0])
|
||||
|
||||
score = model.score(test)
|
||||
Loading…
Add table
Add a link
Reference in a new issue