From 570c6f89666f8c382480fb70335bb87123b894eb Mon Sep 17 00:00:00 2001
From: abigailt <abigailt@il.ibm.com>
Date: Tue, 17 Oct 2023 11:25:03 +0300
Subject: [PATCH] Fix anonymization adult notebook + new notebook to
 demonstrate anonymization on 1-hot encoded data

Signed-off-by: abigailt <abigailt@il.ibm.com>
---
 notebooks/anonymization_one_hot_adult.ipynb   | 303 ++++++++++++++++++
 ...ership_inference_anonymization_adult.ipynb | 196 ++++++-----
 2 files changed, 393 insertions(+), 106 deletions(-)
 create mode 100644 notebooks/anonymization_one_hot_adult.ipynb

diff --git a/notebooks/anonymization_one_hot_adult.ipynb b/notebooks/anonymization_one_hot_adult.ipynb
new file mode 100644
index 0000000..dd6352e
--- /dev/null
+++ b/notebooks/anonymization_one_hot_adult.ipynb
@@ -0,0 +1,303 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using ML anonymization on one-hot encoded data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this tutorial we will show how to anonymize models using the ML anonymization module, specifically when the input data is already one-hot encoded. \n",
+    "\n",
+    "This will be demonstrated using the Adult dataset (original dataset can be found here: https://archive.ics.uci.edu/ml/datasets/adult). "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['State-gov' 'Never-married' 'Adm-clerical' ... 'White' 'Male'\n",
+      "  'UnitedStates']\n",
+      " ['Self-emp-not-inc' 'Married-civ-spouse' 'Exec-managerial' ... 'White'\n",
+      "  'Male' 'UnitedStates']\n",
+      " ['Private' 'Divorced' 'Handlers-cleaners' ... 'White' 'Male'\n",
+      "  'UnitedStates']\n",
+      " ...\n",
+      " ['Private' 'Never-married' 'Sales' ... 'White' 'Female' 'UnitedStates']\n",
+      " ['Private' 'Never-married' 'Craft-repair' ... 'White' 'Male'\n",
+      "  'UnitedStates']\n",
+      " ['Private' 'Never-married' 'Handlers-cleaners' ... 'White' 'Male'\n",
+      "  'UnitedStates']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "import os\n",
+    "import sys\n",
+    "sys.path.insert(0, os.path.abspath('..'))\n",
+    "from apt.utils.dataset_utils import get_adult_dataset_pd\n",
+    "\n",
+    "# column indexes of: 'workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'\n",
+    "categorical_features = [1, 3, 4, 5, 6, 7, 11]\n",
+    "\n",
+    "# requires a folder called 'datasets' in the current directory\n",
+    "(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()\n",
+    "x_train = x_train.to_numpy()[:, categorical_features]  # keep only the categorical columns\n",
+    "y_train = y_train.to_numpy().astype(int)\n",
+    "x_test = x_test.to_numpy()[:, categorical_features]\n",
+    "y_test = y_test.to_numpy().astype(int)\n",
+    "\n",
+    "# get balanced dataset (same number of training and test samples)\n",
+    "x_train = x_train[:x_test.shape[0]]\n",
+    "y_train = y_train[:y_test.shape[0]]\n",
+    "\n",
+    "print(x_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Encode data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[0 0 0 ... 0 1 0]\n",
+      " [0 0 0 ... 0 1 0]\n",
+      " [0 0 0 ... 0 1 0]\n",
+      " ...\n",
+      " [0 0 0 ... 0 1 0]\n",
+      " [0 0 0 ... 0 1 0]\n",
+      " [0 0 0 ... 0 1 0]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "import scipy.sparse\n",
+    "\n",
+    "preprocessor = OneHotEncoder(handle_unknown=\"ignore\")\n",
+    "# fit the encoder on the training data only, then densify its output if it is a sparse matrix\n",
+    "x_train = preprocessor.fit_transform(x_train)\n",
+    "x_test = preprocessor.transform(x_test)\n",
+    "if scipy.sparse.issparse(x_train):\n",
+    "    x_train = x_train.toarray().astype(int)\n",
+    "if scipy.sparse.issparse(x_test):\n",
+    "    x_test = x_test.toarray().astype(int)\n",
+    "\n",
+    "print(x_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train decision tree model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Base model accuracy:  0.814446287083103\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
+      "  warnings.warn(msg, category=FutureWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from art.estimators.classification.scikitlearn import ScikitlearnDecisionTreeClassifier\n",
+    "\n",
+    "model = DecisionTreeClassifier()\n",
+    "model.fit(x_train, y_train)\n",
+    "\n",
+    "art_classifier = ScikitlearnDecisionTreeClassifier(model)\n",
+    "\n",
+    "print('Base model accuracy: ', model.score(x_test, y_test))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Anonymize data\n",
+    "## k=100\n",
+    "\n",
+    "The data is anonymized on the quasi-identifiers: race, sex (given as the indexes of their one-hot encoded columns) and with a privacy parameter k=100.\n",
+    "\n",
+    "This means that each record in the anonymized dataset is identical to at least 99 others on the quasi-identifier values (i.e., when looking only at those features, the records are indistinguishable)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[0 0 0 ... 0 1 0]\n",
+      " [0 0 0 ... 0 1 0]\n",
+      " [0 0 0 ... 0 1 0]\n",
+      " ...\n",
+      " [0 0 0 ... 0 1 0]\n",
+      " [0 0 0 ... 0 1 0]\n",
+      " [0 0 0 ... 0 1 0]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from apt.utils.datasets import ArrayDataset\n",
+    "from apt.anonymization import Anonymize\n",
+    "\n",
+    "x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)])\n",
+    "\n",
+    "# QI = (race, sex); each inner list of QI_slices holds the one-hot columns of one original feature\n",
+    "QI = [53, 52, 51, 50, 49, 48, 47]\n",
+    "QI_slices = [[47, 48, 49, 50, 51], [52, 53]]\n",
+    "anonymizer = Anonymize(100, QI, quasi_identifer_slices=QI_slices)\n",
+    "anon = anonymizer.anonymize(ArrayDataset(x_train, x_train_predictions))\n",
+    "print(anon)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2711"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# number of distinct rows in original data\n",
+    "len(np.unique(x_train, axis=0))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2476"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# number of distinct rows in anonymized data\n",
+    "len(np.unique(anon, axis=0))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train decision tree model on anonymized data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Anonymized model accuracy:  0.8135863890424421\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
+      "  warnings.warn(msg, category=FutureWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "anon_model = DecisionTreeClassifier()\n",
+    "anon_model.fit(anon, y_train)\n",
+    "\n",
+    "anon_art_classifier = ScikitlearnDecisionTreeClassifier(anon_model)\n",
+    "\n",
+    "print('Anonymized model accuracy: ', anon_model.score(x_test, y_test))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/membership_inference_anonymization_adult.ipynb b/notebooks/membership_inference_anonymization_adult.ipynb
index 7d8bbb3..1bab5a7 100644
--- a/notebooks/membership_inference_anonymization_adult.ipynb
+++ b/notebooks/membership_inference_anonymization_adult.ipynb
@@ -1,7 +1,6 @@
 {
  "cells": [
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -9,7 +8,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -23,13 +21,72 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Load data"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/abigailt/Library/Python/3.9/lib/python/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[  39   13 2174    0   40]\n",
+      " [  50   13    0    0   13]\n",
+      " [  38    9    0    0   40]\n",
+      " ...\n",
+      " [  27   13    0    0   40]\n",
+      " [  26   11    0    0   48]\n",
+      " [  27    9    0    0   40]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "import os\n",
+    "import sys\n",
+    "sys.path.insert(0, os.path.abspath('..'))\n",
+    "from apt.utils.dataset_utils import get_adult_dataset_pd\n",
+    "\n",
+    "# requires a folder called 'datasets' in the current directory\n",
+    "(x_train, y_train), (x_test, y_test) = get_adult_dataset_pd()\n",
+    "x_train = x_train.to_numpy()\n",
+    "y_train = y_train.to_numpy().astype(int)\n",
+    "x_test = x_test.to_numpy()\n",
+    "y_test = y_test.to_numpy().astype(int)\n",
+    "\n",
+    "# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
+    "x_train = x_train[:, [0, 2, 8, 9, 10]].astype(int)\n",
+    "x_test = x_test[:, [0, 2, 8, 9, 10]].astype(int)\n",
+    "\n",
+    "# get balanced dataset\n",
+    "x_train = x_train[:x_test.shape[0]]\n",
+    "y_train = y_train[:y_test.shape[0]]\n",
+    "\n",
+    "print(x_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train decision tree model"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 3,
@@ -39,76 +96,14 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[[  39.   13. 2174.    0.   40.]\n",
-      " [  50.   13.    0.    0.   13.]\n",
-      " [  38.    9.    0.    0.   40.]\n",
-      " ...\n",
-      " [  27.   13.    0.    0.   40.]\n",
-      " [  26.   11.    0.    0.   48.]\n",
-      " [  27.    9.    0.    0.   40.]]\n"
-     ]
-    }
-   ],
-   "source": [
-    "import numpy as np\n",
-    "\n",
-    "# Use only numeric features (age, education-num, capital-gain, capital-loss, hours-per-week)\n",
-    "x_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
-    "                        usecols=(0, 4, 10, 11, 12), delimiter=\", \")\n",
-    "\n",
-    "y_train = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
-    "                        usecols=14, dtype=str, delimiter=\", \")\n",
-    "\n",
-    "\n",
-    "x_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
-    "                        usecols=(0, 4, 10, 11, 12), delimiter=\", \", skiprows=1)\n",
-    "\n",
-    "y_test = np.loadtxt(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test\",\n",
-    "                        usecols=14, dtype=str, delimiter=\", \", skiprows=1)\n",
-    "\n",
-    "# Trim trailing period \".\" from label\n",
-    "y_test = np.array([a[:-1] for a in y_test])\n",
-    "\n",
-    "y_train[y_train == '<=50K'] = 0\n",
-    "y_train[y_train == '>50K'] = 1\n",
-    "y_train = y_train.astype(int)\n",
-    "\n",
-    "y_test[y_test == '<=50K'] = 0\n",
-    "y_test[y_test == '>50K'] = 1\n",
-    "y_test = y_test.astype(int)\n",
-    "\n",
-    "# get balanced dataset\n",
-    "x_train = x_train[:x_test.shape[0]]\n",
-    "y_train = y_train[:y_test.shape[0]]\n",
-    "\n",
-    "print(x_train)"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Train decision tree model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Base model accuracy:  0.8076285240464345\n"
+      "Base model accuracy:  0.8087341072415699\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/home/mayaa/Development/GitHub/aiprivacy/ai-privacy-toolkit/venv1/lib/python3.8/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
+      "/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
       "  warnings.warn(msg, category=FutureWarning)\n"
      ]
     }
@@ -122,13 +117,10 @@
     "\n",
     "art_classifier = ScikitlearnDecisionTreeClassifier(model)\n",
     "\n",
-    "print('Base model accuracy: ', model.score(x_test, y_test))\n",
-    "\n",
-    "x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)]).reshape(-1,1)"
+    "print('Base model accuracy: ', model.score(x_test, y_test))"
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -139,7 +131,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -159,7 +151,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -168,14 +159,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "0.5460017196904557\n"
+      "0.5434836015231544\n"
      ]
     }
    ],
@@ -191,7 +182,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -199,7 +189,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -213,30 +202,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[[38. 13.  0.  0. 40.]\n",
-      " [46. 13.  0.  0. 35.]\n",
-      " [28.  9.  0.  0. 40.]\n",
+      "[[38 13  0  0 40]\n",
+      " [46 13  0  0 35]\n",
+      " [28  9  0  0 40]\n",
       " ...\n",
-      " [26. 13.  0.  0. 40.]\n",
-      " [27. 10.  0.  0. 50.]\n",
-      " [28.  9.  0.  0. 40.]]\n"
+      " [26 13  0  0 40]\n",
+      " [27 10  0  0 50]\n",
+      " [28  9  0  0 40]]\n"
      ]
     }
    ],
    "source": [
-    "import os\n",
-    "import sys\n",
-    "sys.path.insert(0, os.path.abspath('..'))\n",
     "from apt.utils.datasets import ArrayDataset\n",
     "from apt.anonymization import Anonymize\n",
     "\n",
+    "x_train_predictions = np.array([np.argmax(arr) for arr in art_classifier.predict(x_train)])\n",
+    "\n",
     "# QI = (age, education-num, capital-gain, hours-per-week)\n",
     "QI = [0, 1, 2, 4]\n",
     "anonymizer = Anonymize(100, QI)\n",
@@ -246,7 +234,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -255,7 +243,7 @@
        "6739"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -267,7 +255,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -276,7 +264,7 @@
        "401"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -287,7 +275,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -296,21 +283,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Anonymized model accuracy:  0.826914808672686\n"
+      "Anonymized model accuracy:  0.8308457711442786\n"
      ]
     },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/home/mayaa/Development/GitHub/aiprivacy/ai-privacy-toolkit/venv1/lib/python3.8/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
+      "/Users/abigailt/Library/Python/3.9/lib/python/site-packages/sklearn/utils/deprecation.py:103: FutureWarning: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead.\n",
       "  warnings.warn(msg, category=FutureWarning)\n"
      ]
     }
@@ -325,7 +312,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -335,14 +321,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "0.49692912418621793\n"
+      "0.4944724235351923\n"
      ]
     }
    ],
@@ -364,7 +350,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -380,8 +365,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "(0.5316007088009451, 0.7738607050730868)\n",
-      "(0.4971184877823882, 0.5297874953936863)\n"
+      "without anonymization: (0.5303914835164835, 0.7588748311018303)\n",
+      "with anonymization: (0.49255952380952384, 0.3659255619702739)\n"
      ]
     }
    ],
@@ -411,15 +396,14 @@
     "    return precision, recall\n",
     "\n",
     "# regular\n",
-    "print(calc_precision_recall(np.concatenate((inferred_train_bb, inferred_test_bb)), \n",
+    "print('without anonymization:', calc_precision_recall(np.concatenate((inferred_train_bb, inferred_test_bb)), \n",
     "                            np.concatenate((np.ones(len(inferred_train_bb)), np.zeros(len(inferred_test_bb))))))\n",
     "# anon\n",
-    "print(calc_precision_recall(np.concatenate((anon_inferred_train_bb, anon_inferred_test_bb)), \n",
+    "print('with anonymization:', calc_precision_recall(np.concatenate((anon_inferred_train_bb, anon_inferred_test_bb)), \n",
     "                            np.concatenate((np.ones(len(anon_inferred_train_bb)), np.zeros(len(anon_inferred_test_bb))))))"
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -429,7 +413,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -443,7 +427,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,