Regression minimization (#20)

* support regression in minimization and add test

* fix #10
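
For orientation, a minimal usage sketch of the new is_regression option, condensed from the test_regression test added in this commit (same dataset, estimator, and parameters; variable names are local to the example):

    from sklearn.datasets import load_diabetes
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeRegressor
    from apt.minimization import GeneralizeToRepresentative

    # Train a regressor on half of the diabetes dataset (as in the new test).
    data = load_diabetes()
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.5, random_state=14)
    model = DecisionTreeRegressor(random_state=10, min_samples_split=2)
    model.fit(x_train, y_train)

    # Minimize only the quasi-identifier columns, enabling the new regression mode.
    gen = GeneralizeToRepresentative(model, target_accuracy=0.7, is_regression=True,
                                     features=['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'],
                                     features_to_minimize=[0, 2, 5, 8])
    gen.fit(x_train, model.predict(x_train))   # fit on the model's own predictions
    x_train_generalized = gen.transform(x_train)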
olasaadi 2022-01-27 15:57:55 +02:00 committed by GitHub
parent cb9278ddb5
commit 3feebe8973
2 changed files with 98 additions and 19 deletions


@@ -13,7 +13,7 @@ from sklearn.impute import SimpleImputer
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
-from sklearn.tree import DecisionTreeClassifier
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.model_selection import train_test_split
@@ -84,7 +84,7 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
     def __init__(self, estimator=None, target_accuracy=0.998, features=None,
                  cells=None, categorical_features=None, features_to_minimize: Union[np.ndarray, list] = None
-                 , train_only_QI=True):
+                 , train_only_QI=True, is_regression=False):
         self.estimator = estimator
         self.target_accuracy = target_accuracy
         self.features = features
@ -94,7 +94,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerM
self.categorical_features = categorical_features self.categorical_features = categorical_features
self.features_to_minimize = features_to_minimize self.features_to_minimize = features_to_minimize
self.train_only_QI = train_only_QI self.train_only_QI = train_only_QI
self.is_regression = is_regression
def get_params(self, deep=True): def get_params(self, deep=True):
"""Get parameters for this estimator. """Get parameters for this estimator.
@@ -227,9 +228,11 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
         used_data = X
         if self.train_only_QI:
             used_data = x_QI
-        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
-                                                             test_size=0.4,
-                                                             random_state=18)
+        if self.is_regression:
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=14)
+        else:
+            X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.4, random_state=18)
         X_train_QI = X_train.loc[:, self.features_to_minimize]
         X_test_QI = X_test.loc[:, self.features_to_minimize]
         used_X_train = X_train
@@ -292,7 +295,10 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
         self._preprocessor = preprocessor
         self.cells_ = {}
-        self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2,
-                                          min_samples_leaf=1)
+        if self.is_regression:
+            self.dt_ = DecisionTreeRegressor(random_state=10, min_samples_split=2, min_samples_leaf=1)
+        else:
+            self.dt_ = DecisionTreeClassifier(random_state=0, min_samples_split=2,
+                                              min_samples_leaf=1)
         self.dt_.fit(x_prepared, y_train)
         self._modify_categorical_features(used_data)
@@ -528,8 +534,9 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
         feature_index = self.dt_.tree_.feature[node]
         if feature_index == -2:
             # this is a leaf
-            label = self._calculate_cell_label(node)
-            hist = [int(i) for i in self.dt_.tree_.value[node][0]]
+            # if it is a regression problem we do not use label
+            label = self._calculate_cell_label(node) if not self.is_regression else 1
+            hist = [int(i) for i in self.dt_.tree_.value[node][0]] if not self.is_regression else []
             cell = {'label': label, 'hist': hist, 'ranges': {}, 'id': int(node)}
             return [cell]
@@ -632,8 +639,8 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
         # else: nothing to do, stay with previous cells

     def _calculate_level_cell_label(self, left_cell, right_cell, new_cell):
-        new_cell['hist'] = [x + y for x, y in zip(left_cell['hist'], right_cell['hist'])]
-        new_cell['label'] = int(self.dt_.classes_[np.argmax(new_cell['hist'])])
+        new_cell['hist'] = [x + y for x, y in zip(left_cell['hist'], right_cell['hist'])] if not self.is_regression else []
+        new_cell['label'] = int(self.dt_.classes_[np.argmax(new_cell['hist'])]) if not self.is_regression else 1

     def _get_nodes_level(self, level):
         # level = distance from lowest leaf
@@ -674,9 +681,13 @@ class GeneralizeToRepresentative(BaseEstimator, MetaEstimatorMixin, TransformerMixin):
         sample_rows = prepared_data.iloc[indexes]
         sample_labels = labels_df.iloc[indexes]['label'].values.tolist()
         # get rows with matching label
-        indexes = [i for i, label in enumerate(sample_labels) if label == cell['label']]
-        match_samples = sample_rows.iloc[indexes]
-        match_rows = original_rows.iloc[indexes]
+        if self.is_regression:
+            match_samples = sample_rows
+            match_rows = original_rows
+        else:
+            indexes = [i for i, label in enumerate(sample_labels) if label == cell['label']]
+            match_samples = sample_rows.iloc[indexes]
+            match_rows = original_rows.iloc[indexes]
         # find the "middle" of the cluster
         array = match_samples.values
         # Only works with numpy 1.9.0 and higher!!!
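
The last hunk above changes how the representative row for a cell is chosen: in regression mode there is no class label to filter on, so every sample that reaches the leaf is kept and the representative is taken from the "middle" of that cluster. A rough, self-contained sketch of that idea follows; picking the row closest to the per-feature median is an assumption for illustration only, not necessarily the library's exact computation:

    import numpy as np

    def middle_of_cluster(match_samples: np.ndarray) -> np.ndarray:
        # Illustrative stand-in for the "find the middle of the cluster" step:
        # return the actual sample closest to the per-feature median.
        median = np.median(match_samples, axis=0)
        distances = np.linalg.norm(match_samples - median, axis=1)
        return match_samples[np.argmin(distances)]

    # Example: three rows reaching one leaf; the middle row is returned.
    rows = np.array([[1.0, 2.0], [1.1, 2.1], [5.0, 9.0]])
    print(middle_of_cluster(rows))   # -> [1.1 2.1]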


@@ -3,7 +3,7 @@ import numpy as np
 import pandas as pd
 from sklearn.compose import ColumnTransformer
-from sklearn.datasets import load_boston
+from sklearn.datasets import load_boston, load_diabetes
 from sklearn.impute import SimpleImputer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
@@ -11,9 +11,10 @@ from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from apt.minimization import GeneralizeToRepresentative
-from sklearn.tree import DecisionTreeClassifier
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset, get_german_credit_dataset


 @pytest.fixture
 def data():
     return load_boston(return_X_y=True)
@@ -42,7 +43,7 @@ def test_minimizer_params(data):
     gen = GeneralizeToRepresentative(base_est, features=features, cells=cells)
     gen.fit()
     transformed = gen.transform(X)


 def test_minimizer_fit(data):
     features = ['age', 'height']
@@ -57,7 +58,6 @@ def test_minimizer_fit(data):
                   [69, 175],
                   [24, 181],
                   [18, 190]])
-    print(X)
     y = [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
     base_est = DecisionTreeClassifier(random_state=0, min_samples_split=2,
                                       min_samples_leaf=1)
@@ -439,7 +439,6 @@ def test_german_credit_pandas():
     QI = ["Duration_in_month", "Credit_history", "Purpose", "debtors", "Property", "Other_installment_plans",
           "Housing", "Job"]
     numeric_features = [f for f in features if f not in categorical_features]
     numeric_transformer = Pipeline(
         steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
@@ -492,3 +491,72 @@ def test_german_credit_pandas():
     if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
         assert (ncp > 0)
         assert (((transformed[modified_features]).equals(x_train[modified_features])) == False)
+
+
+def test_regression():
+    dataset = load_diabetes()
+    x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.5, random_state=14)
+    model = DecisionTreeRegressor(random_state=10, min_samples_split=2)
+    model.fit(x_train, y_train)
+    pred = model.predict(x_train)
+
+    QI = [0, 2, 5, 8]
+    features = ['age', 'sex', 'bmi', 'bp',
+                's1', 's2', 's3', 's4', 's5', 's6']
+    gen = GeneralizeToRepresentative(model, target_accuracy=0.7, features=features, is_regression=True,
+                                     features_to_minimize=QI)
+    gen.fit(x_train, pred)
+    transformed = gen.transform(x_train)
+    print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
+    model.fit(transformed, y_train)
+    print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test))
+
+    gener = gen.generalizations_
+    expexted_generalizations = {'ranges': {
+        'age': [-0.07816532626748085, -0.07090024650096893, -0.05637009255588055, -0.05092128552496433,
+                -0.04728874587453902, -0.04547247663140297, -0.04183994047343731, -0.027309784665703773,
+                -0.023677248042076826, -0.020044708624482155, -0.01641217083670199, -0.001882016600575298,
+                0.0017505218856967986, 0.0035667913616634905, 0.007199329789727926, 0.010831868276000023,
+                0.02354575227946043, 0.030810829252004623, 0.03262709779664874, 0.03444336913526058,
+                0.03625963814556599, 0.03807590529322624, 0.03807590715587139, 0.047157252207398415,
+                0.06168740428984165, 0.0635036751627922, 0.06895248219370842, 0.07258502021431923, 0.07621755823493004,
+                0.1034616008400917],
+        'bmi': [-0.07626373693346977, -0.060635464265942574, -0.056863121688365936, -0.05578530766069889,
+                -0.054168591275811195, -0.042312657460570335, -0.0374625027179718, -0.03422906715422869,
+                -0.033690162003040314, -0.03261234890669584, -0.02614547684788704, -0.025067666545510292,
+                -0.022373135201632977, -0.016984074376523495, -0.01375063881278038, -0.007822672137990594,
+                -0.004589236050378531, 0.008344509289599955, 0.015889193629845977, 0.016967005096375942,
+                0.024511689320206642, 0.0272062208969146, 0.030978563241660595, 0.032595280557870865,
+                0.033673093654215336, 0.04391230642795563, 0.04552902653813362, 0.05469042807817459,
+                0.06977979838848114, 0.07301323488354683, 0.09349166229367256],
+        's2': [-0.1044962927699089, -0.08649025857448578, -0.07740895450115204, -0.07114598527550697,
+               -0.06378699466586113, -0.05971606448292732, -0.04437179118394852, -0.0398311372846365,
+               -0.03137612994760275, -0.022138250060379505, -0.018067320343106985, -0.017910746857523918,
+               -0.017910745926201344, -0.01618842873722315, -0.007576846517622471, -0.007263698382303119,
+               -0.0010007291566580534, 0.0010347360512241721, 0.006514834007248282, 0.00933317095041275,
+               0.012464655097573996, 0.019197346206055954, 0.020919663831591606, 0.02217225730419159,
+               0.032036433927714825, 0.036420512944459915, 0.04080459102988243, 0.04127431474626064,
+               0.04268348217010498, 0.04424922354519367, 0.04424922540783882, 0.056462014093995094, 0.05928034894168377,
+               0.061315815430134535, 0.06272498145699501, 0.06460387445986271]}, 'categories': {},
+        'untouched': ['s5', 's3', 'bp', 's1', 'sex', 's6', 's4']}
+
+    for key in expexted_generalizations['ranges']:
+        assert (set(expexted_generalizations['ranges'][key]) == set(gener['ranges'][key]))
+    for key in expexted_generalizations['categories']:
+        assert (set([frozenset(sl) for sl in expexted_generalizations['categories'][key]]) ==
+                set([frozenset(sl) for sl in gener['categories'][key]]))
+    assert (set(expexted_generalizations['untouched']) == set(gener['untouched']))
+
+    assert ((np.delete(transformed, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
+    modified_features = [f for f in features if
+                         f in expexted_generalizations['categories'].keys() or f in expexted_generalizations[
+                             'ranges'].keys()]
+    indexes = []
+    for i in range(len(features)):
+        if features[i] in modified_features:
+            indexes.append(i)
+    assert ((np.delete(transformed, indexes, axis=1) == np.delete(x_train, indexes, axis=1)).all())
+
+    ncp = gen.ncp_
+    if len(expexted_generalizations['ranges'].keys()) > 0 or len(expexted_generalizations['categories'].keys()) > 0:
+        assert (ncp > 0)
+        assert (((transformed[indexes]) != (x_train[indexes])).any())