Data and Model wrappers (#26)

* Squashed commit of wrappers:

    Wrapper minimizer

    * apply dataset wrapper on minimizer
    * apply changes on minimization notebook
    * add black_box_access and unlimited_queries params

    Dataset wrapper anonymizer

    Add features_names to ArrayDataset
    and allow providing features names in QI and Cat features not just indexes

    update notebooks

    categorical features and QI passed by indexes
    dataset include feature names and is_pandas param

    add pytorch Dataset

    Remove redundant code.
    Use data wrappers in model wrapper APIs.

    add generic dataset components 

    Create initial version of wrappers for models

* Fix handling of categorical features
This commit is contained in:
abigailgold 2022-04-27 12:33:27 +03:00 committed by GitHub
parent d53818644e
commit 2b2dab6bef
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 1340 additions and 752 deletions

View file

@ -7,13 +7,15 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from apt.anonymization import Anonymize
from apt.utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
from apt.utils.dataset_utils import get_iris_dataset, get_adult_dataset, get_nursery_dataset
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from apt.utils.datasets import ArrayDataset, DATA_PANDAS_NUMPY_TYPE
def test_anonymize_ndarray_iris():
(x_train, y_train), _ = get_iris_dataset()
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
pred = model.predict(x_train)
@ -21,7 +23,7 @@ def test_anonymize_ndarray_iris():
k = 10
QI = [0, 2]
anonymizer = Anonymize(k, QI, train_only_QI=True)
anon = anonymizer.anonymize(x_train, pred)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
assert(len(np.unique(anon[:, QI], axis=0)) < len(np.unique(x_train[:, QI], axis=0)))
_, counts_elements = np.unique(anon[:, QI], return_counts=True)
assert (np.min(counts_elements) >= k)
@ -30,10 +32,14 @@ def test_anonymize_ndarray_iris():
def test_anonymize_pandas_adult():
(x_train, y_train), _ = get_adult_dataset()
encoded = OneHotEncoder().fit_transform(x_train)
model = DecisionTreeClassifier()
model.fit(encoded, y_train)
pred = model.predict(encoded)
k = 100
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation',
'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
features = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
QI = ['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
'native-country']
categorical_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
@ -56,12 +62,11 @@ def test_anonymize_pandas_adult():
pred = model.predict(encoded)
anonymizer = Anonymize(k, QI, categorical_features=categorical_features)
anon = anonymizer.anonymize(x_train, pred)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred, features))
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
assert (anon.loc[:, QI].value_counts().min() >= k)
assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
def test_anonymize_pandas_nursery():
(x_train, y_train), _ = get_nursery_dataset()
@ -89,11 +94,11 @@ def test_anonymize_pandas_nursery():
pred = model.predict(encoded)
anonymizer = Anonymize(k, QI, categorical_features=categorical_features, train_only_QI=True)
anon = anonymizer.anonymize(x_train, pred)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
assert(anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
assert (anon.loc[:, QI].value_counts().min() >= k)
assert (anon.drop(QI, axis=1).equals(x_train.drop(QI, axis=1)))
np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
def test_regression():
@ -107,7 +112,7 @@ def test_regression():
k = 10
QI = [0, 2, 5, 8]
anonymizer = Anonymize(k, QI, is_regression=True, train_only_QI=True)
anon = anonymizer.anonymize(x_train, pred)
anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
print('Base model accuracy (R2 score): ', model.score(x_test, y_test))
model.fit(anon, y_train)
print('Base model accuracy (R2 score) after anonymization: ', model.score(x_test, y_test))
@ -127,7 +132,7 @@ def test_errors():
anonymizer = Anonymize(10, [0, 2])
(x_train, y_train), (x_test, y_test) = get_iris_dataset()
with pytest.raises(ValueError):
anonymizer.anonymize(x_train, y_test)
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))
(x_train, y_train), _ = get_adult_dataset()
with pytest.raises(ValueError):
anonymizer.anonymize(x_train, y_train)
anonymizer.anonymize(dataset=ArrayDataset(x_train, y_test))