From 364ebf68ebc06d169a3cf9a3938f3f75909374af Mon Sep 17 00:00:00 2001
From: abigailt
Date: Thu, 21 Sep 2023 19:10:05 +0300
Subject: [PATCH] Add test for pandas

Signed-off-by: abigailt
---
Reviewer notes (this area after the three-dash line is ignored by `git am`
and is not part of the commit message):

  * Adds `import pandas as pd` and a new test, test_anonymize_pandas_one_hot.
  * The test wraps an 11-row array in a DataFrame with columns
    age / gender_M / gender_F / height, fits a DecisionTreeClassifier on it,
    and anonymizes the frame (paired with the model's own predictions) using
    k = 10, quasi-identifiers age + gender_M + gender_F, and the two gender
    columns declared together as one slice.
  * It then asserts that (1) the anonymized data has fewer distinct
    quasi-identifier combinations than the input, (2) every remaining
    combination occurs at least k times, and (3) the non-quasi-identifier
    columns are unchanged.
  * NOTE(review): the keyword `quasi_identifer_slices` is spelled without the
    second "i" of "identifier"; presumably this matches the Anonymize
    signature -- confirm before merging.
  * NOTE(review): line breaks and indentation in this patch were reconstructed
    from a whitespace-collapsed copy; if a context line does not match the
    target file exactly, apply with `git apply --ignore-whitespace`.

 tests/test_anonymizer.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tests/test_anonymizer.py b/tests/test_anonymizer.py
index 633a7b0..503ae6b 100644
--- a/tests/test_anonymizer.py
+++ b/tests/test_anonymizer.py
@@ -1,5 +1,6 @@
 import pytest
 import numpy as np
+import pandas as pd
 from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer
 from sklearn.pipeline import Pipeline
@@ -147,6 +148,37 @@ def test_anonymize_ndarray_one_hot():
     assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
 
 
+def test_anonymize_pandas_one_hot():
+    feature_names = ["age", "gender_M", "gender_F", "height"]
+    x_train = np.array([[23, 0, 1, 165],
+                        [45, 0, 1, 158],
+                        [56, 1, 0, 123],
+                        [67, 0, 1, 154],
+                        [45, 1, 0, 149],
+                        [42, 1, 0, 166],
+                        [73, 0, 1, 172],
+                        [94, 0, 1, 168],
+                        [69, 0, 1, 175],
+                        [24, 1, 0, 181],
+                        [18, 1, 0, 190]])
+    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+    x_train = pd.DataFrame(x_train, columns=feature_names)
+    y_train = pd.Series(y_train)
+
+    model = DecisionTreeClassifier()
+    model.fit(x_train, y_train)
+    pred = model.predict(x_train)
+
+    k = 10
+    QI = ["age", "gender_M", "gender_F"]
+    QI_slices = [["gender_M", "gender_F"]]
+    anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
+    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
+    assert (anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
+    assert (anon.loc[:, QI].value_counts().min() >= k)
+    np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
+
+
 def test_errors():
     with pytest.raises(ValueError):
         Anonymize(1, [0, 2])