Build the dt on all features anon (#23)

* add param to build the DT on all features and not just on QI
* one-hot encoding only for categorical features
This commit is contained in:
olasaadi 2022-03-07 20:12:55 +02:00 committed by GitHub
parent c47819a031
commit d53818644e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 79 additions and 17 deletions

View file

@ -1,5 +1,8 @@
import pytest
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
@ -17,7 +20,7 @@ def test_anonymize_ndarray_iris():
k = 10
QI = [0, 2]
anonymizer = Anonymize(k, QI)
anonymizer = Anonymize(k, QI, train_only_QI=True)
anon = anonymizer.anonymize(x_train, pred)
assert(len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
_, counts_elements = np.unique(anon[:, QI], return_counts=True)
@ -27,16 +30,31 @@ def test_anonymize_ndarray_iris():
def test_anonymize_pandas_adult():
(x_train, y_train), _ = get_adult_dataset()
encoded = OneHotEncoder().fit_transform(x_train)
model = DecisionTreeClassifier()
model.fit(encoded, y_train)
pred = model.predict(encoded)
k = 100
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation',
'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country']
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country']
# prepare data for DT
numeric_features = [f for f in features if f not in categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(x_train)
model = DecisionTreeClassifier()
model.fit(encoded, y_train)
pred = model.predict(encoded)
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
anon = anonymizer.anonymize(x_train, pred)
@ -48,15 +66,29 @@ def test_anonymize_pandas_adult():
def test_anonymize_pandas_nursery():
(x_train, y_train), _ = get_nursery_dataset()
x_train = x_train.astype(str)
encoded = OneHotEncoder().fit_transform(x_train)
k = 100
features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"]
QI = ["finance", "social", "health"]
categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
# prepare data for DT
numeric_features = [f for f in features if f not in categorical_features]
numeric_transformer = Pipeline(
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
encoded = preprocessor.fit_transform(x_train)
model = DecisionTreeClassifier()
model.fit(encoded, y_train)
pred = model.predict(encoded)
k = 100
QI = ["finance", "social", "health"]
categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
anonymizer = Anonymize(k, QI, categorical_features=categorical_features, train_only_QI=True)
anon = anonymizer.anonymize(x_train, pred)
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
@ -74,7 +106,7 @@ def test_regression():
pred = model.predict(x_train)
k = 10
QI = [0, 2, 5, 8]
anonymizer = Anonymize(k, QI, is_regression=True)
anonymizer = Anonymize(k, QI, is_regression=True, train_only_QI=True)
anon = anonymizer.anonymize(x_train, pred)
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
model.fit(anon, y_train)