Train just on qi (#15)

* QI updates
* update code to support training ML on QI features
* fix code so features that are not from QI should not be part of generalizations
and add description
* merging two branches, training on QI and on all data
* adding tests and asserts
This commit is contained in:
olasaadi 2022-01-12 17:01:27 +02:00 committed by GitHub
parent 2eb626c00c
commit a9a93c8a3a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 373 additions and 135 deletions

View file

@ -19,9 +19,8 @@ class Anonymize:
""" """
:param k: The privacy parameter that determines the number of records that will be indistinguishable from each :param k: The privacy parameter that determines the number of records that will be indistinguishable from each
other (when looking at the quasi identifiers). Should be at least 2. other (when looking at the quasi identifiers). Should be at least 2.
:param quasi_identifiers: The indexes of the features that need to be anonymized (these should be the features :param quasi_identifiers: The features that need to be minimized in case of pandas data, and indexes of features
that may directly, indirectly or in combination with additional data, identify an in case of numpy data.
individual).
:param categorical_features: The list of categorical features (should only be supplied when passing data as a :param categorical_features: The list of categorical features (should only be supplied when passing data as a
pandas dataframe. pandas dataframe.
""" """

View file

@ -47,17 +47,22 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
features : list of str, optional features : list of str, optional
The feature names, in the order that they appear in the data. The feature names, in the order that they appear in the data.
categorical_features: list of str, optional categorical_features: list of str, optional
The list of categorical features should only be supplied when The list of categorical features should only be supplied when
passing data as a pandas dataframe. passing data as a pandas dataframe.
features_to_minimize : list of str or int, optional
The features to minimize: feature names when the data is a pandas
dataframe, column indexes when it is a numpy array. If not supplied,
all features are minimized.
cells : list of object, optional cells : list of object, optional
The cells used to generalize records. Each cell must define a The cells used to generalize records. Each cell must define a
range or subset of categories for each feature, as well as a range or subset of categories for each feature, as well as a
representative value for each feature. representative value for each feature.
This parameter should be used when instantiating a transformer This parameter should be used when instantiating a transformer
object without first fitting it. object without first fitting it.
train_only_QI : bool, optional
Whether to train the internal decision tree only on the features
listed in features_to_minimize (True, the default) or on all
features (False).
Attributes Attributes
---------- ----------
@ -78,7 +83,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
""" """
def __init__(self, estimator=None, target_accuracy=0.998, features=None, def __init__(self, estimator=None, target_accuracy=0.998, features=None,
cells=None, categorical_features=None): cells=None, categorical_features=None, features_to_minimize: Union[np.ndarray, list] = None
, train_only_QI=True):
self.estimator = estimator self.estimator = estimator
self.target_accuracy = target_accuracy self.target_accuracy = target_accuracy
self.features = features self.features = features
@ -86,8 +92,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.categorical_features = [] self.categorical_features = []
if categorical_features: if categorical_features:
self.categorical_features = categorical_features self.categorical_features = categorical_features
self.is_numpy = True self.features_to_minimize = features_to_minimize
self.train_only_QI = train_only_QI
def get_params(self, deep=True): def get_params(self, deep=True):
"""Get parameters for this estimator. """Get parameters for this estimator.
@ -201,17 +208,33 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.cells_ = {} self.cells_ = {}
self.categorical_values = {} self.categorical_values = {}
if self.is_numpy:
X = pd.DataFrame(X, columns=self._features)
# Going to fit # Going to fit
# (currently not dealing with option to fit with only X and y and no estimator) # (currently not dealing with option to fit with only X and y and no estimator)
if self.estimator and X is not None and y is not None: if self.estimator and X is not None and y is not None:
if self.is_numpy:
if not self.features_to_minimize:
self.features_to_minimize = [i for i in range(len(self._features))]
x_QI = X[:, self.features_to_minimize]
self.features_to_minimize = [self._features[i] for i in self.features_to_minimize]
X = pd.DataFrame(X, columns=self._features)
else:
if not self.features_to_minimize:
self.features_to_minimize = self._features
x_QI = X.loc[:, self.features_to_minimize]
x_QI = pd.DataFrame(x_QI, columns=self.features_to_minimize)
# divide dataset into train and test # divide dataset into train and test
used_data = X
if self.train_only_QI:
used_data = x_QI
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
test_size=0.4, test_size=0.4,
random_state=18) random_state=18)
X_train_QI = X_train.loc[:, self.features_to_minimize]
X_test_QI = X_test.loc[:, self.features_to_minimize]
used_X_train = X_train
if self.train_only_QI:
used_X_train = X_train_QI
# collect feature data (such as min, max) # collect feature data (such as min, max)
@ -229,42 +252,71 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
feature_data[feature] = fd feature_data[feature] = fd
# prepare data for DT # prepare data for DT
categorical_features = list(self.categorical_features) categorical_features = [f for f in self._features if f in self.categorical_features and
f in self.features_to_minimize]
numeric_transformer = Pipeline( numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))] steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
) )
# numeric_features = list(self._features) - list(self.categorical_features) numeric_features = [f for f in self._features if f not in self.categorical_features and
numeric_features = [item for item in self._features if item not in self.categorical_features] f in self.features_to_minimize]
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False) categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer( preprocessor_QI_features = ColumnTransformer(
transformers=[ transformers=[
("num", numeric_transformer, numeric_features), ("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features), ("cat", categorical_transformer, categorical_features),
] ]
) )
preprocessor.fit(X) preprocessor_QI_features.fit(x_QI)
# preprocessor to fit data that have features not included in QI (to get accuracy)
numeric_features = [f for f in self._features if f not in self.categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, self.categorical_features),
]
)
preprocessor.fit(X)
x_prepared = preprocessor.transform(X_train) x_prepared = preprocessor.transform(X_train)
self.preprocessor = preprocessor if self.train_only_QI:
x_prepared = preprocessor_QI_features.transform(X_train_QI)
self._preprocessor = preprocessor
self.cells_ = {} self.cells_ = {}
self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2, self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1) min_samples_leaf=1)
self.dt_.fit(x_prepared, y_train) self.dt_.fit(x_prepared, y_train)
self._modify_categorical_features(X) self._modify_categorical_features(used_data)
x_prepared = pd.DataFrame(x_prepared, columns=self.categorical_data.columns) x_prepared = pd.DataFrame(x_prepared, columns=self.categorical_data.columns)
self._calculate_cells() self._calculate_cells()
self._modify_cells() self._modify_cells()
# features that are not from QI should not be part of generalizations
for feature in self._features:
if feature not in self.features_to_minimize:
self._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
nodes = self._get_nodes_level(0) nodes = self._get_nodes_level(0)
self._attach_cells_representatives(x_prepared, X_train, y_train, nodes) self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
# self.cells_ currently holds the generalization created from the tree leaves # self.cells_ currently holds the generalization created from the tree leaves
self._calculate_generalizations() self._calculate_generalizations()
# apply generalizations to test data # apply generalizations to test data
x_prepared_test = preprocessor.transform(X_test) x_prepared_test = preprocessor.transform(X_test)
if self.train_only_QI:
x_prepared_test = preprocessor_QI_features.transform(X_test_QI)
x_prepared_test = pd.DataFrame(x_prepared_test, index=X_test.index, columns=self.categorical_data.columns) x_prepared_test = pd.DataFrame(x_prepared_test, index=X_test.index, columns=self.categorical_data.columns)
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_) generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, self.cells_by_id_)
@ -285,8 +337,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cells_by_id_prev = self.cells_by_id_ cells_by_id_prev = self.cells_by_id_
nodes = self._get_nodes_level(level) nodes = self._get_nodes_level(level)
self._calculate_level_cells(level) self._calculate_level_cells(level)
self._attach_cells_representatives(x_prepared, used_X_train, y_train, nodes)
self._attach_cells_representatives(x_prepared, X_train, y_train, nodes)
self._calculate_generalizations() self._calculate_generalizations()
generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_, generalized = self._generalize(X_test, x_prepared_test, nodes, self.cells_,
self.cells_by_id_) self.cells_by_id_)
@ -335,7 +387,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe, X : {array-like, sparse-matrix}, shape (n_samples, n_features), If provided as a pandas dataframe,
may contain both numeric and categorical data. may contain both numeric and categorical data.
The input samples. The input samples.
Returns Returns
------- -------
X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features) X_transformed : numpy or pandas according to the input type, shape (n_samples, n_features)
@ -357,7 +408,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
else: else:
self.is_numpy = False self.is_numpy = False
if X.shape[1] != self.n_features_ and self.n_features_ != 0: if X.shape[1] != self.n_features_ and self.n_features_ != 0:
raise ValueError('Shape of input is different from what was seen' raise ValueError('Shape of input is different from what was seen'
'in `fit`') 'in `fit`')
@ -428,19 +478,24 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.categorical_values = {} self.categorical_values = {}
self.oneHotVectorFeaturesToFeatures = {} self.oneHotVectorFeaturesToFeatures = {}
features_to_remove = [] features_to_remove = []
used_features = self._features
if self.train_only_QI:
used_features = self.features_to_minimize
for feature in self.categorical_features: for feature in self.categorical_features:
try: if feature in used_features:
all_values = X.loc[:, feature] try:
values = list(all_values.unique()) all_values = X.loc[:, feature]
self.categorical_values[feature] = values values = list(all_values.unique())
X[feature] = pd.Categorical(X.loc[:, feature], categories=values, ordered=False) self.categorical_values[feature] = values
ohe = pd.get_dummies(X[feature], prefix=feature) X[feature] = pd.Categorical(X.loc[:, feature], categories=values, ordered=False)
for oneHotVectorFeature in ohe.columns: ohe = pd.get_dummies(X[feature], prefix=feature)
self.oneHotVectorFeaturesToFeatures[oneHotVectorFeature] = feature for oneHotVectorFeature in ohe.columns:
X = pd.concat([X, ohe], axis=1) self.oneHotVectorFeaturesToFeatures[oneHotVectorFeature] = feature
features_to_remove.append(feature) X = pd.concat([X, ohe], axis=1)
except KeyError: features_to_remove.append(feature)
print("feature " + feature + "not found in training data") except KeyError:
print("feature " + feature + "not found in training data")
self.categorical_data = X.drop(features_to_remove, axis=1) self.categorical_data = X.drop(features_to_remove, axis=1)
def _cell_contains_numeric(self, f, range, x): def _cell_contains_numeric(self, f, range, x):
@ -556,7 +611,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
right_child = self.dt_.tree_.children_right[node] right_child = self.dt_.tree_.children_right[node]
left_cell = self.cells_by_id_[left_child] left_cell = self.cells_by_id_[left_child]
right_cell = self.cells_by_id_[right_child] right_cell = self.cells_by_id_[right_child]
new_cell = {'id': int(node), 'ranges': {}, 'categories': {}, new_cell = {'id': int(node), 'ranges': {}, 'categories': {}, 'untouched': [],
'label': None, 'representative': None} 'label': None, 'representative': None}
for feature in left_cell['ranges'].keys(): for feature in left_cell['ranges'].keys():
new_cell['ranges'][feature] = {} new_cell['ranges'][feature] = {}
@ -566,6 +621,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
new_cell['categories'][feature] = \ new_cell['categories'][feature] = \
list(set(left_cell['categories'][feature]) | list(set(left_cell['categories'][feature]) |
set(right_cell['categories'][feature])) set(right_cell['categories'][feature]))
for feature in left_cell['untouched']:
if feature in right_cell['untouched']:
new_cell['untouched'].append(feature)
self._calculate_level_cell_label(left_cell, right_cell, new_cell) self._calculate_level_cell_label(left_cell, right_cell, new_cell)
new_cells.append(new_cell) new_cells.append(new_cell)
new_cells_by_id[new_cell['id']] = new_cell new_cells_by_id[new_cell['id']] = new_cell
@ -675,7 +733,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
else: else:
replace = representatives.loc[i].to_frame().T.reset_index(drop=True) replace = representatives.loc[i].to_frame().T.reset_index(drop=True)
replace.index = indexes replace.index = indexes
# replace = self.preprocessor.transform(replace)
replace = pd.DataFrame(replace, indexes, columns=self._features) replace = pd.DataFrame(replace, indexes, columns=self._features)
original_data_generalized.loc[indexes, representatives.columns.tolist()] = replace original_data_generalized.loc[indexes, representatives.columns.tolist()] = replace
@ -701,8 +758,6 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
if feature is None: if feature is None:
return None return None
GeneralizeToRepresentative._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature) GeneralizeToRepresentative._remove_feature_from_cells(self.cells_, self.cells_by_id_, feature)
# del self.generalizations_['ranges'][feature]
# self.generalizations_['untouched'].append(feature)
return feature return feature
def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy): def _get_feature_to_remove(self, original_data, prepared_data, nodes, labels, feature_data, current_accuracy):
@ -730,7 +785,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cells_by_id = copy.deepcopy(self.cells_by_id_) cells_by_id = copy.deepcopy(self.cells_by_id_)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature) GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id) generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(self.preprocessor.transform(generalized), accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized),
labels) - current_accuracy labels) - current_accuracy
if accuracy_gain < 0: if accuracy_gain < 0:
accuracy_gain = 0 accuracy_gain = 0
@ -753,7 +808,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cells_by_id = copy.deepcopy(self.cells_by_id_) cells_by_id = copy.deepcopy(self.cells_by_id_)
GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature) GeneralizeToRepresentative._remove_feature_from_cells(new_cells, cells_by_id, feature)
generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id) generalized = self._generalize(original_data, prepared_data, nodes, new_cells, cells_by_id)
accuracy_gain = self.estimator.score(self.preprocessor.transform(generalized), accuracy_gain = self.estimator.score(self._preprocessor.transform(generalized),
labels) - current_accuracy labels) - current_accuracy
if accuracy_gain < 0: if accuracy_gain < 0:
@ -923,7 +978,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
cell['untouched'] = [] cell['untouched'] = []
if feature in cell['ranges'].keys(): if feature in cell['ranges'].keys():
del cell['ranges'][feature] del cell['ranges'][feature]
else: elif feature in cell['categories'].keys():
del cell['categories'][feature] del cell['categories'][feature]
cell['untouched'].append(feature) cell['untouched'].append(feature)
cells_by_id[cell['id']] = cell.copy() cells_by_id[cell['id']] = cell.copy()

View file

@ -6,15 +6,15 @@ from os import path, mkdir
from six.moves.urllib.request import urlretrieve from six.moves.urllib.request import urlretrieve
def _load_iris(test_set_size: float=0.3): def _load_iris(test_set_size: float = 0.3):
iris = datasets.load_iris() iris = datasets.load_iris()
data = iris.data data = iris.data
labels = iris.target labels = iris.target
# Split training and test sets # Split training and test sets
x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels, test_size=test_set_size, x_train, x_test, y_train, y_test = model_selection.train_test_split(data, labels, test_size=test_set_size,
random_state=18, stratify=labels, random_state=18, stratify=labels,
shuffle=True) shuffle=True)
return (x_train, y_train), (x_test, y_test) return (x_train, y_train), (x_test, y_test)
@ -29,6 +29,77 @@ def get_iris_dataset():
return _load_iris() return _load_iris()
def get_german_credit_dataset(test_set: float = 0.3):
    """
    Loads the UCI German credit dataset from `../datasets/german` (relative to the current working
    directory), downloading it first if the data file is not there yet.

    :param test_set: Fraction of the records to place in the test split (used as `test_size` of a
                     stratified shuffle split on the label).
    :return: `(x_train, y_train), (x_test, y_test)` where the x parts are pandas dataframes without the
             "label" column and the y parts are the matching "label" series, all with a fresh 0..n-1 index.
    """
    # Local import: creates missing parent directories too, which a plain mkdir would not.
    from os import makedirs

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
    data_dir = '../datasets/german'
    data_file = path.join(data_dir, 'data')

    makedirs(data_dir, exist_ok=True)

    # NOTE(review): this disables HTTPS certificate verification for the whole process, not just for
    # this download. Kept for backward compatibility; consider removing it.
    ssl._create_default_https_context = ssl._create_unverified_context
    if not path.exists(data_file):
        urlretrieve(url, data_file)

    # load data
    features = ["Existing_checking_account", "Duration_in_month", "Credit_history", "Purpose", "Credit_amount",
                "Savings_account", "Present_employment_since", "Installment_rate", "Personal_status_sex", "debtors",
                "Present_residence", "Property", "Age", "Other_installment_plans", "Housing",
                "Number_of_existing_credits", "Job", "N_people_being_liable_provide_maintenance", "Telephone",
                "Foreign_worker", "label"]
    data = pd.read_csv(data_file, sep=" ", names=features, engine="python")
    # remove rows with missing label
    data = data.dropna(subset=["label"])
    _modify_german_dataset(data)

    # Split training and test sets (a single stratified split, so only the first result is needed)
    splitter = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, test_size=test_set, random_state=18)
    train_idx, test_idx = next(splitter.split(data, data["label"]))
    train = data.iloc[train_idx]
    test = data.iloc[test_idx]

    x_train = train.drop(["label"], axis=1).reset_index(drop=True)
    y_train = train.loc[:, "label"].reset_index(drop=True)
    x_test = test.drop(["label"], axis=1).reset_index(drop=True)
    y_test = test.loc[:, "label"].reset_index(drop=True)

    return (x_train, y_train), (x_test, y_test)
def _modify_german_dataset(data):
def modify_Foreign_worker(value):
if value == 'A201':
return 1
elif value == 'A202':
return 0
else:
raise Exception('Bad value')
def modify_Telephone(value):
if value == 'A191':
return 0
elif value == 'A192':
return 1
else:
raise Exception('Bad value')
data['Foreign_worker'] = data['Foreign_worker'].apply(modify_Foreign_worker)
data['Telephone'] = data['Telephone'].apply(modify_Telephone)
def get_adult_dataset(): def get_adult_dataset():
""" """
Loads the UCI Adult dataset from `tests/datasets/adult` or downloads it if necessary. Loads the UCI Adult dataset from `tests/datasets/adult` or downloads it if necessary.

View file

@ -12,8 +12,7 @@ from sklearn.preprocessing import OneHotEncoder, StandardScaler
from apt.minimization import GeneralizeToRepresentative from apt.minimization import GeneralizeToRepresentative
from sklearn.tree import DecisionTreeClassifier from sklearn.tree import DecisionTreeClassifier
from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset
@pytest.fixture @pytest.fixture
def data(): def data():
@ -43,11 +42,7 @@ def test_minimizer_params(data):
gen = GeneralizeToRepresentative(base_est, features=features, cells=cells) gen = GeneralizeToRepresentative(base_est, features=features, cells=cells)
gen.fit() gen.fit()
transformed = gen.transform(X) transformed = gen.transform(X)
expected_transformed = np.array([[26, 149],
[58, 163],
[31, 184]])
assert(np.array_equal(expected_transformed, transformed))
def test_minimizer_fit(data): def test_minimizer_fit(data):
features = ['age', 'height'] features = ['age', 'height']
@ -73,7 +68,8 @@ def test_minimizer_fit(data):
gen.fit(X, predictions) gen.fit(X, predictions)
transformed = gen.transform(X) transformed = gen.transform(X)
gener = gen.generalizations_ gener = gen.generalizations_
expexted_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['age', 'height']} expexted_generalizations = {'ranges': {}, 'categories': {}, 'untouched': ['height', 'age']}
for key in expexted_generalizations['ranges']: for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key])) assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']: for key in expexted_generalizations['categories']:
@ -136,7 +132,8 @@ def test_minimizer_fit_pandas(data):
gen.fit(X, predictions) gen.fit(X, predictions)
transformed = gen.transform(X) transformed = gen.transform(X)
gener = gen.generalizations_ gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['sex', 'height', 'ola']} expexted_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['ola', 'height', 'sex']}
for key in expexted_generalizations['ranges']: for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key])) assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']: for key in expexted_generalizations['categories']:
@ -206,17 +203,113 @@ def test_minimizer_params_categorical(data):
# Append classifier to preprocessing pipeline. # Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline. # Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5, gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
categorical_features=categorical_features) categorical_features=categorical_features, cells=cells)
gen.fit(X, predictions)
transformed = gen.transform(X)
def test_minimizer_fit_QI(data):
features = ['age', 'height', 'weight']
X = np.array([[23, 165, 70],
[45, 158, 67],
[56, 123, 65],
[67, 154, 90],
[45, 149, 67],
[42, 166, 58],
[73, 172, 68],
[94, 168, 69],
[69, 175, 80],
[24, 181, 95],
[18, 190, 102]])
print(X)
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
QI = [0, 2]
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
base_est.fit(X, y)
predictions = base_est.predict(X)
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5, features_to_minimize=QI)
gen.fit(X, predictions) gen.fit(X, predictions)
transformed = gen.transform(X) transformed = gen.transform(X)
gener = gen.generalizations_ gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': []}, 'categories': {}, 'untouched': ['height', 'sex']} expexted_generalizations = {'ranges': {'age': [], 'weight': [67.5]}, 'categories': {}, 'untouched': ['height']}
for key in expexted_generalizations['ranges']: for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key])) assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']: for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]])) set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, QI, axis=1) == np.delete(X, QI, axis=1)).all())
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
indexes = []
for i in range(len(features)):
if features[i] in modified_features:
indexes.append(i)
assert ((np.delete(transformed, indexes, axis=1) == np.delete(X, indexes, axis=1)).all())
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[indexes]) != (X[indexes])).any())
def test_minimizer_fit_pandas_QI(data):
features = ['age', 'height', 'weight', 'sex', 'ola']
X = [[23, 165, 65, 'f', 'aa'],
[45, 158, 76, 'f', 'aa'],
[56, 123, 78, 'f', 'bb'],
[67, 154, 87, 'm', 'aa'],
[45, 149, 45, 'f', 'bb'],
[42, 166, 76, 'm', 'bb'],
[73, 172, 85, 'm', 'bb'],
[94, 168, 92, 'f', 'aa'],
[69, 175, 95, 'm', 'aa'],
[24, 181, 49, 'm', 'bb'],
[18, 190, 69, 'm', 'bb']]
y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
X = pd.DataFrame(X, columns=features)
QI = ['age', 'weight', 'ola']
numeric_features = ["age", "height", "weight"]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_features = ["sex", "ola"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(X)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
base_est.fit(encoded, y)
predictions = base_est.predict(encoded)
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
gen = GeneralizeToRepresentative(base_est, features=features, target_accuracy=0.5,
categorical_features=categorical_features, features_to_minimize=QI)
gen.fit(X, predictions)
transformed = gen.transform(X)
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'age': [], 'weight': [47.0]}, 'categories': {'ola': [['bb', 'aa']]},
'untouched': ['height', 'sex']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (transformed.drop(QI, axis=1).equals(X.drop(QI, axis=1)))
modified_features = [f for f in features if modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()] 'ranges'].keys()]
@ -230,24 +323,27 @@ def test_minimizer_params_categorical(data):
def test_minimize_ndarray_iris(): def test_minimize_ndarray_iris():
features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
(x_train, y_train), _ = get_iris_dataset() (x_train, y_train), _ = get_iris_dataset()
QI = [0, 2]
model = DecisionTreeClassifier(random_state=0, min_samples_split=2, model = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1) min_samples_leaf=1)
model.fit(x_train, y_train) model.fit(x_train, y_train)
pred = model.predict(x_train) pred = model.predict(x_train)
gen = GeneralizeToRepresentative(model, target_accuracy=0.7, features=features) gen = GeneralizeToRepresentative(model, target_accuracy=0.3, features=features, features_to_minimize=QI)
gen.fit(x_train, pred) gen.fit(x_train, pred)
transformed = gen.transform(x_train) transformed = gen.transform(x_train)
gener = gen.generalizations_ gener = gen.generalizations_
expexted_generalizations = { expexted_generalizations = {'ranges': {'sepal length (cm)': [], 'petal length (cm)': [2.449999988079071]},
'ranges': {'sepal length (cm)': [5.0], 'sepal width (cm)': [], 'petal length (cm)': [4.950000047683716], 'categories': {}, 'untouched': ['petal width (cm)', 'sepal width (cm)']}
'petal width (cm)': [0.800000011920929, 1.699999988079071]}, 'categories': {}, 'untouched': []}
for key in expexted_generalizations['ranges']: for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key])) assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']: for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]])) set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
modified_features = [f for f in features if modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()] 'ranges'].keys()]
@ -262,70 +358,19 @@ def test_minimize_ndarray_iris():
assert (((transformed[indexes]) != (x_train[indexes])).any()) assert (((transformed[indexes]) != (x_train[indexes])).any())
def test_minimize_pandas_nursery():
    """Fit the minimizer on the nursery data (pandas input) and verify its output.

    A decision tree is trained on the one-hot encoded training set, then
    ``GeneralizeToRepresentative`` is fitted against the tree's own
    predictions with ``target_accuracy=0.8``. The test checks that:

    * the learned generalizations match the expected ones,
    * columns that were not generalized are returned unchanged,
    * generalization actually altered the data (``ncp_ > 0`` and the
      generalized columns differ from the input).
    """
    (x_train, y_train), _ = get_nursery_dataset()
    # All values are handled as strings; the expected categories below
    # (e.g. for 'children') are spelled as strings accordingly.
    x_train = x_train.astype(str)
    x_train.reset_index(inplace=True, drop=True)
    y_train.reset_index(inplace=True, drop=True)

    features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"]
    categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
    # Every feature is listed as categorical here, so this list ends up empty.
    numeric_features = [f for f in features if f not in categorical_features]

    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
    )
    # NOTE(review): newer scikit-learn releases rename `sparse` to
    # `sparse_output`; keep this in sync with the pinned scikit-learn version.
    categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    encoded = preprocessor.fit_transform(x_train)

    base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                      min_samples_leaf=1)
    base_est.fit(encoded, y_train)
    # The minimizer is fitted against the model's predictions, not the labels.
    predictions = base_est.predict(encoded)

    gen = GeneralizeToRepresentative(base_est, target_accuracy=0.8, features=features,
                                     categorical_features=categorical_features)
    gen.fit(x_train, predictions)
    transformed = gen.transform(x_train)
    gener = gen.generalizations_

    expected_generalizations = {
        'ranges': {},
        'categories': {
            'parents': [['great_pret', 'pretentious', 'usual']],
            'has_nurs': [['critical', 'less_proper', 'proper'], ['very_crit'], ['improper']],
            'form': [['foster', 'completed', 'complete', 'incomplete']],
            'housing': [['convenient', 'less_conv', 'critical']],
            'finance': [['convenient', 'inconv']],
            'social': [['problematic', 'nonprob', 'slightly_prob']],
            'health': [['priority'], ['recommended'], ['not_recom']],
            'children': [['2', '3', '4', '1']],
        },
        'untouched': [],
    }
    expected_ranges = expected_generalizations['ranges']
    expected_categories = expected_generalizations['categories']

    # Order is irrelevant both inside a group and between groups, so compare
    # as sets (of frozensets for the category groups).
    for key, cut_points in expected_ranges.items():
        assert set(cut_points) == set(gener['ranges'][key])
    for key, groups in expected_categories.items():
        assert ({frozenset(group) for group in groups} ==
                {frozenset(group) for group in gener['categories'][key]})
    assert set(expected_generalizations['untouched']) == set(gener['untouched'])

    modified_features = [f for f in features
                         if f in expected_categories or f in expected_ranges]
    # Columns outside the generalized features must come back unchanged.
    assert transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1))

    ncp = gen.ncp_
    if expected_ranges or expected_categories:
        # Something was generalized: ncp must be positive and the data changed.
        assert ncp > 0
        assert not transformed[modified_features].equals(x_train[modified_features])
def test_minimize_pandas_adult(): def test_minimize_pandas_adult():
(x_train, y_train), _ = get_adult_dataset() (x_train, y_train), _ = get_adult_dataset()
x_train = x_train.head(5000) x_train = x_train.head(1000)
y_train = y_train.head(5000) y_train = y_train.head(1000)
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'] 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country'] 'hours-per-week', 'native-country']
QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country']
numeric_features = [f for f in features if f not in categorical_features] numeric_features = [f for f in features if f not in categorical_features]
numeric_transformer = Pipeline( numeric_transformer = Pipeline(
@ -344,33 +389,101 @@ def test_minimize_pandas_adult():
base_est.fit(encoded, y_train) base_est.fit(encoded, y_train)
predictions = base_est.predict(encoded) predictions = base_est.predict(encoded)
gen = GeneralizeToRepresentative(base_est, target_accuracy=0.8, features=features, gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features,
categorical_features=categorical_features) categorical_features=categorical_features, features_to_minimize=QI)
gen.fit(x_train, predictions) gen.fit(x_train, predictions)
transformed = gen.transform(x_train) transformed = gen.transform(x_train)
gener = gen.generalizations_ gener = gen.generalizations_
expexted_generalizations = { expexted_generalizations = {'ranges': {'age': [], 'education-num': []}, 'categories': {
'ranges': {'age': [20.0], 'education-num': [11.5, 12.5], 'capital-gain': [5095.5, 7139.5], 'capital-loss': [], 'workclass': [['Self-emp-not-inc', 'Private', 'Federal-gov', 'Self-emp-inc', '?', 'Local-gov', 'State-gov']],
'hours-per-week': []}, 'categories': {'workclass': [ 'marital-status': [
['Private', 'Without-pay', 'Self-emp-not-inc', '?', 'Federal-gov', 'Self-emp-inc', 'State-gov', ['Divorced', 'Married-AF-spouse', 'Married-spouse-absent', 'Widowed', 'Separated', 'Married-civ-spouse',
'Local-gov']], 'marital-status': [ 'Never-married']], 'occupation': [
['Married-civ-spouse', 'Never-married', 'Widowed', 'Married-AF-spouse', 'Separated', ['Tech-support', 'Priv-house-serv', 'Machine-op-inspct', 'Other-service', 'Prof-specialty', 'Adm-clerical',
'Married-spouse-absent'], ['Divorced']], 'occupation': [ 'Protective-serv', 'Handlers-cleaners', 'Transport-moving', 'Armed-Forces', '?', 'Sales',
['Transport-moving', 'Priv-house-serv', '?', 'Armed-Forces', 'Prof-specialty', 'Farming-fishing', 'Farming-fishing', 'Exec-managerial', 'Craft-repair']],
'Exec-managerial', 'Machine-op-inspct', 'Other-service', 'Sales', 'Protective-serv', 'Handlers-cleaners', 'relationship': [['Not-in-family', 'Wife', 'Other-relative', 'Husband', 'Unmarried', 'Own-child']],
'Tech-support', 'Craft-repair', 'Adm-clerical']], 'relationship': [ 'race': [['Asian-Pac-Islander', 'White', 'Other', 'Black', 'Amer-Indian-Eskimo']], 'sex': [['Female', 'Male']],
['Not-in-family', 'Own-child', 'Wife', 'Other-relative', 'Husband', 'Unmarried']], 'race': [ 'native-country': [
['Other', 'Asian-Pac-Islander', 'Black', 'White', 'Amer-Indian-Eskimo']], 'sex': [['Male', 'Female']], ['Euro_1', 'LatinAmerica', 'BritishCommonwealth', 'SouthAmerica', 'UnitedStates', 'China', 'Euro_2',
'native-country': [ 'SE_Asia', 'Other', 'Unknown']]}, 'untouched': ['capital-loss', 'hours-per-week', 'capital-gain']}
['LatinAmerica', 'Other', 'UnitedStates', 'SouthAmerica',
'BritishCommonwealth', 'Euro_2', 'Unknown', 'China',
'Euro_1', 'SE_Asia']]}, 'untouched': []}
for key in expexted_generalizations['ranges']: for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key])) assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']: for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) == assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]])) set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched'])) assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()]
assert (transformed.drop(modified_features, axis=1).equals(x_train.drop(modified_features, axis=1)))
ncp = gen.ncp_
if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
assert (ncp > 0)
assert (((transformed[modified_features]).equals(x_train[modified_features])) == False)
def test_german_credit_pandas():
(x_train, y_train), (x_test, y_test) = get_german_credit_dataset()
features = ["Existing_checking_account", "Duration_in_month", "Credit_history", "Purpose", "Credit_amount",
"Savings_account", "Present_employment_since", "Installment_rate", "Personal_status_sex", "debtors",
"Present_residence", "Property", "Age", "Other_installment_plans", "Housing",
"Number_of_existing_credits", "Job", "N_people_being_liable_provide_maintenance", "Telephone",
"Foreign_worker"]
categorical_features = ["Existing_checking_account", "Credit_history", "Purpose", "Savings_account",
"Present_employment_since", "Personal_status_sex", "debtors", "Property",
"Other_installment_plans", "Housing", "Job"]
QI = ["Duration_in_month", "Credit_history", "Purpose", "debtors", "Property", "Other_installment_plans",
"Housing", "Job"]
numeric_features = [f for f in features if f not in categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(x_train)
base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
min_samples_leaf=1)
base_est.fit(encoded, y_train)
predictions = base_est.predict(encoded)
gen = GeneralizeToRepresentative(base_est, target_accuracy=0.7, features=features,
categorical_features=categorical_features, features_to_minimize=QI)
gen.fit(x_train, predictions)
transformed = gen.transform(x_train)
gener = gen.generalizations_
expexted_generalizations = {'ranges': {'Duration_in_month': [31.5]},
'categories': {'Credit_history': [['A30', 'A32', 'A31', 'A34', 'A33']], 'Purpose': [
['A41', 'A46', 'A43', 'A40', 'A44', 'A410', 'A49', 'A45', 'A48', 'A42']],
'debtors': [['A101', 'A102', 'A103']],
'Property': [['A124', 'A121', 'A122', 'A123']],
'Other_installment_plans': [['A142', 'A141', 'A143']],
'Housing': [['A151', 'A152', 'A153']],
'Job': [['A172', 'A171', 'A174', 'A173']]},
'untouched': ['Installment_rate', 'Present_residence', 'Personal_status_sex',
'Foreign_worker', 'Telephone', 'Savings_account',
'Number_of_existing_credits', 'N_people_being_liable_provide_maintenance',
'Age', 'Existing_checking_account', 'Credit_amount',
'Present_employment_since']}
for key in expexted_generalizations['ranges']:
assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
for key in expexted_generalizations['categories']:
assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
set([frozenset(sl) for sl in gener['categories'][key]]))
assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
assert (transformed.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
modified_features = [f for f in features if modified_features = [f for f in features if
f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[ f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
'ranges'].keys()] 'ranges'].keys()]