From 364ebf68ebc06d169a3cf9a3938f3f75909374af Mon Sep 17 00:00:00 2001
From: abigailt <abigailt@il.ibm.com>
Date: Thu, 21 Sep 2023 19:10:05 +0300
Subject: [PATCH] Add test for pandas

Signed-off-by: abigailt <abigailt@il.ibm.com>
---
 tests/test_anonymizer.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tests/test_anonymizer.py b/tests/test_anonymizer.py
index 633a7b0..503ae6b 100644
--- a/tests/test_anonymizer.py
+++ b/tests/test_anonymizer.py
@@ -1,5 +1,6 @@
 import pytest
 import numpy as np
+import pandas as pd
 from sklearn.compose import ColumnTransformer
 from sklearn.impute import SimpleImputer
 from sklearn.pipeline import Pipeline
@@ -147,6 +148,37 @@ def test_anonymize_ndarray_one_hot():
     assert ((np.delete(anon, QI, axis=1) == np.delete(x_train, QI, axis=1)).all())
 
 
+def test_anonymize_pandas_one_hot():
+    feature_names = ["age", "gender_M", "gender_F", "height"]
+    x_train = np.array([[23, 0, 1, 165],
+                        [45, 0, 1, 158],
+                        [56, 1, 0, 123],
+                        [67, 0, 1, 154],
+                        [45, 1, 0, 149],
+                        [42, 1, 0, 166],
+                        [73, 0, 1, 172],
+                        [94, 0, 1, 168],
+                        [69, 0, 1, 175],
+                        [24, 1, 0, 181],
+                        [18, 1, 0, 190]])
+    y_train = np.array([1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])
+    x_train = pd.DataFrame(x_train, columns=feature_names)
+    y_train = pd.Series(y_train)
+
+    model = DecisionTreeClassifier()
+    model.fit(x_train, y_train)
+    pred = model.predict(x_train)
+
+    k = 10
+    QI = ["age", "gender_M", "gender_F"]
+    QI_slices = [["gender_M", "gender_F"]]
+    anonymizer = Anonymize(k, QI, train_only_QI=True, quasi_identifer_slices=QI_slices)
+    anon = anonymizer.anonymize(ArrayDataset(x_train, pred))
+    assert (anon.loc[:, QI].drop_duplicates().shape[0] < x_train.loc[:, QI].drop_duplicates().shape[0])
+    assert (anon.loc[:, QI].value_counts().min() >= k)
+    np.testing.assert_array_equal(anon.drop(QI, axis=1), x_train.drop(QI, axis=1))
+
+
 def test_errors():
     with pytest.raises(ValueError):
         Anonymize(1, [0, 2])