mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-05-06 10:32:38 +02:00
Build the dt on all features anon (#23)
* add param to build the DT on all features and not just on QI * one-hot encoding only for categorical features
This commit is contained in:
parent
c47819a031
commit
d53818644e
2 changed files with 79 additions and 17 deletions
|
|
@ -1,5 +1,8 @@
|
|||
import pytest
|
||||
import numpy as np
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
|
||||
|
|
@ -17,7 +20,7 @@ def test_anonymize_ndarray_iris():
|
|||
|
||||
k = 10
|
||||
QI = [0, 2]
|
||||
anonymizer = Anonymize(k, QI)
|
||||
anonymizer = Anonymize(k, QI, train_only_QI=True)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
assert(len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
|
||||
_, counts_elements = np.unique(anon[:, QI], return_counts=True)
|
||||
|
|
@ -27,16 +30,31 @@ def test_anonymize_ndarray_iris():
|
|||
|
||||
def test_anonymize_pandas_adult():
|
||||
(x_train, y_train), _ = get_adult_dataset()
|
||||
encoded = OneHotEncoder().fit_transform(x_train)
|
||||
model = DecisionTreeClassifier()
|
||||
model.fit(encoded, y_train)
|
||||
pred = model.predict(encoded)
|
||||
|
||||
k = 100
|
||||
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation',
|
||||
'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
|
||||
QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
|
||||
'native-country']
|
||||
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
|
||||
'native-country']
|
||||
# prepare data for DT
|
||||
numeric_features = [f for f in features if f not in categorical_features]
|
||||
numeric_transformer = Pipeline(
|
||||
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
|
||||
)
|
||||
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
|
||||
preprocessor = ColumnTransformer(
|
||||
transformers=[
|
||||
("num", numeric_transformer, numeric_features),
|
||||
("cat", categorical_transformer, categorical_features),
|
||||
]
|
||||
)
|
||||
encoded = preprocessor.fit_transform(x_train)
|
||||
model = DecisionTreeClassifier()
|
||||
model.fit(encoded, y_train)
|
||||
pred = model.predict(encoded)
|
||||
|
||||
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
|
||||
|
|
@ -48,15 +66,29 @@ def test_anonymize_pandas_adult():
|
|||
def test_anonymize_pandas_nursery():
|
||||
(x_train, y_train), _ = get_nursery_dataset()
|
||||
x_train = x_train.astype(str)
|
||||
encoded = OneHotEncoder().fit_transform(x_train)
|
||||
|
||||
k = 100
|
||||
features = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health"]
|
||||
QI = ["finance", "social", "health"]
|
||||
categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
|
||||
# prepare data for DT
|
||||
numeric_features = [f for f in features if f not in categorical_features]
|
||||
numeric_transformer = Pipeline(
|
||||
steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
|
||||
)
|
||||
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)
|
||||
preprocessor = ColumnTransformer(
|
||||
transformers=[
|
||||
("num", numeric_transformer, numeric_features),
|
||||
("cat", categorical_transformer, categorical_features),
|
||||
]
|
||||
)
|
||||
encoded = preprocessor.fit_transform(x_train)
|
||||
model = DecisionTreeClassifier()
|
||||
model.fit(encoded, y_train)
|
||||
pred = model.predict(encoded)
|
||||
|
||||
k = 100
|
||||
QI = ["finance", "social", "health"]
|
||||
categorical_features = ["parents", "has_nurs", "form", "housing", "finance", "social", "health", 'children']
|
||||
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
|
||||
anonymizer = Anonymize(k, QI, categorical_features=categorical_features, train_only_QI=True)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
|
||||
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
|
||||
|
|
@ -74,7 +106,7 @@ def test_regression():
|
|||
pred = model.predict(x_train)
|
||||
k = 10
|
||||
QI = [0, 2, 5, 8]
|
||||
anonymizer = Anonymize(k, QI, is_regression=True)
|
||||
anonymizer = Anonymize(k, QI, is_regression=True, train_only_QI=True)
|
||||
anon = anonymizer.anonymize(x_train, pred)
|
||||
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
|
||||
model.fit(anon, y_train)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue