diff --git a/notebooks/minimization_loan.ipynb b/notebooks/minimization_loan.ipynb
new file mode 100644
index 0000000..4f11475
--- /dev/null
+++ b/notebooks/minimization_loan.ipynb
@@ -0,0 +1,829 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Applying data minimization to loans data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n",
+    "\n",
+    "This will be demonstrated using the Loans dataset."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load and preprocess data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "sys.path.insert(0, os.path.abspath('..'))\n",
+    "\n",
+    "def modify_specific_features(data):\n",
+    "    data = data.rename(\n",
+    "        columns={\"loan_amnt\": \"loan_amount\", \"funded_amnt\": \"funded_amount\", \"funded_amnt_inv\": \"investor_funds\",\n",
+    "                 \"int_rate\": \"interest_rate\", \"annual_inc\": \"annual_income\"})\n",
+    "\n",
+    "    date_format = '%b-%Y'\n",
+    "    dt_series = pd.to_datetime(data['issue_d'], format=date_format)\n",
+    "    data['year'] = dt_series.dt.year\n",
+    "\n",
+    "    dt_series = pd.to_datetime(data['earliest_cr_line'], format=date_format)\n",
+    "    data['earliest_cr_year'] = dt_series.dt.year\n",
+    "\n",
+    "    dt_series = pd.to_datetime(data['last_credit_pull_d'], format=date_format)\n",
+    "    data['last_credit_pull_year'] = dt_series.dt.year\n",
+    "\n",
+    "    # TODO: Maybe year is not enough, we may want time since last payment? Or some other diff (time between last payment and X)\n",
+    "    dt_series = pd.to_datetime(data['last_pymnt_d'], format=date_format)\n",
+    "    data['last_pymnt_year'] = dt_series.dt.year\n",
+    "\n",
+    "    data['interest_rate'] = data['interest_rate'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n",
+    "\n",
+    "    data['revol_util'] = data['revol_util'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n",
+    "\n",
+    "    data['term'] = data['term'].apply(lambda x: x if type(x) == int else int(x[:-7]))\n",
+    "\n",
+    "    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x + 's' if x.endswith('year') else x)\n",
+    "    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x[:-6])\n",
+    "    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 0.5 if '<' in x else x)\n",
+    "    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 10 if '+' in x else x)\n",
+    "    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: int(x) if x.isnumeric() else 0)\n",
+    "\n",
+    "    data['zip_code'] = data['zip_code'].apply(lambda x: x if type(x) == int else int(x[:-2]))\n",
+    "\n",
+    "    west = ['CA', 'OR', 'UT', 'WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']\n",
+    "    south_west = ['AZ', 'TX', 'NM', 'OK']\n",
+    "    south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN']\n",
+    "    mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']\n",
+    "    north_east = ['CT', 'NY', 'PA', 'NJ', 'RI', 'MA', 'MD', 'VT', 'NH', 'ME']\n",
+    "\n",
+    "    data['region'] = np.nan\n",
+    "\n",
+    "    def finding_regions(state):\n",
+    "        if state in west:\n",
+    "            return 'West'\n",
+    "        elif state in south_west:\n",
+    "            return 'SouthWest'\n",
+    "        elif state in south_east:\n",
+    "            return 'SouthEast'\n",
+    "        elif state in mid_west:\n",
+    "            return 'MidWest'\n",
+    "        elif state in north_east:\n",
+    "            return 'NorthEast'\n",
+    "\n",
+    "    data['region'] = data['addr_state'].apply(finding_regions)\n",
+    "    return data\n",
+    "\n",
+    "def fill_missing(data):\n",
+    "    for col in ('dti_joint', 'annual_inc_joint', 'il_util', 'mths_since_rcnt_il', 'open_acc_6m', 'open_il_12m',\n",
+    "                'open_il_24m', 'inq_last_12m', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'inq_fi',\n",
+    "                'total_cu_tl', 'loan_amount', 'funded_amount', 'investor_funds', 'term', 'interest_rate',\n",
+    "                'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq', 'total_bal_il',\n",
+    "                'tot_coll_amt', 'installment', 'emp_length', 'annual_income', 'zip_code', 'delinq_2yrs',\n",
+    "                'tot_cur_bal', 'total_rev_hi_lim', 'revol_util', 'collections_12_mths_ex_med', 'open_acc',\n",
+    "                'inq_last_6mths', 'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_rec_int',\n",
+    "                'verification_status_joint', 'acc_now_delinq', 'settlement_amount', 'settlement_percentage',\n",
+    "                'settlement_term', 'dti', 'total_rec_late_fee', 'policy_code', 'chargeoff_within_12_mths', 'total_rec_int',\n",
+    "                'last_credit_pull_year', 'last_pymnt_year', 'delinq_amnt', 'tax_liens', 'year', 'earliest_cr_year'):\n",
+    "        try:\n",
+    "            data[col] = data[col].fillna(0)\n",
+    "        except KeyError:\n",
+    "            print('missing column ' + col)\n",
+    "\n",
+    "    for col in ('settlement_status', 'emp_title', 'region'):\n",
+    "        try:\n",
+    "            data[col] = data[col].fillna('NA')\n",
+    "        except KeyError:\n",
+    "            print('missing column ' + col)\n",
+    "            \n",
+    "    return data\n",
+    "\n",
+    "def modify_label(data):\n",
+    "    bad_loan = [\"Charged Off\", \"Default\", \"Does not meet the credit policy. Status:Charged Off\", \"In Grace Period\",\n",
+    "                \"Late (16-30 days)\", \"Late (31-120 days)\"]\n",
+    "\n",
+    "    data['label'] = np.nan\n",
+    "\n",
+    "    def loan_condition(status):\n",
+    "        if status in bad_loan:\n",
+    "            return 0\n",
+    "        else:\n",
+    "            return 1\n",
+    "\n",
+    "    data['label'] = data['loan_status'].apply(loan_condition)\n",
+    "    return data\n",
+    "\n",
+    "def remove_unwanted_features(data):\n",
+    "    features_to_remove = ['loan_status', 'id', 'member_id', 'url', 'next_pymnt_d', 'mths_since_last_major_derog', 'annual_inc_joint',\n",
+    "              'dti_joint', 'verification_status_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il',\n",
+    "              'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',\n",
+    "              'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',\n",
+    "              'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct',\n",
+    "              'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',\n",
+    "              'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',\n",
+    "              'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',\n",
+    "              'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',\n",
+    "              'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tot_hi_cred_lim',\n",
+    "              'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'revol_bal_joint', 'sec_app_earliest_cr_line',\n",
+    "              'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il',\n",
+    "              'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med',\n",
+    "              'sec_app_mths_since_last_major_derog', 'hardship_type', 'hardship_reason', 'hardship_status',\n",
+    "              'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date',\n",
+    "              'hardship_length', 'hardship_dpd', 'hardship_loan_status', 'orig_projected_additional_accrued_interest',\n",
+    "              'hardship_payoff_balance_amount', 'hardship_last_payment_amount', 'mths_since_recent_bc','desc', 'emp_title', 'title',\n",
+    "              'issue_d', 'earliest_cr_line', 'last_credit_pull_d',\n",
+    "              'debt_settlement_flag_date', 'settlement_date', 'last_pymnt_d', 'recoveries', 'collection_recovery_fee', 'total_rec_prncp',\n",
+    "              'last_pymnt_year', 'last_credit_pull_year', 'total_pymnt', 'total_pymnt_inv', 'debt_settlement_flag', 'settlement_status',\n",
+    "              'settlement_amount', 'settlement_percentage', 'settlement_term', 'addr_state']\n",
+    "    return data.drop(features_to_remove, axis=1)\n",
+    "\n",
+    "def split_data(data, create_validation):\n",
+    "    # divide into 3 datasets for training, validation and test\n",
+    "    stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
+    "    for train_set, test_set in stratified.split(data, data['label']):\n",
+    "        train = data.iloc[train_set]\n",
+    "        test = data.iloc[test_set] # 20% of data\n",
+    "\n",
+    "    stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.9, random_state=42)\n",
+    "    for train_set, test_set in stratified.split(train, train['label']):\n",
+    "        model_train = data.iloc[train_set] # 40% of data\n",
+    "        generalizer_train = data.iloc[test_set] # 40% of data\n",
+    "\n",
+    "    validation = None\n",
+    "    if create_validation:\n",
+    "        stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)\n",
+    "        for train_set, test_set in stratified.split(generalizer_train, generalizer_train['label']):\n",
+    "            generalizer_train = data.iloc[train_set]  # 30% of data\n",
+    "            validation = data.iloc[test_set]  # 10% of data\n",
+    "    return train, test, generalizer_train, validation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>loan_amount</th>\n",
+       "      <th>funded_amount</th>\n",
+       "      <th>investor_funds</th>\n",
+       "      <th>term</th>\n",
+       "      <th>interest_rate</th>\n",
+       "      <th>installment</th>\n",
+       "      <th>grade</th>\n",
+       "      <th>sub_grade</th>\n",
+       "      <th>emp_length</th>\n",
+       "      <th>home_ownership</th>\n",
+       "      <th>...</th>\n",
+       "      <th>application_type</th>\n",
+       "      <th>acc_now_delinq</th>\n",
+       "      <th>chargeoff_within_12_mths</th>\n",
+       "      <th>delinq_amnt</th>\n",
+       "      <th>tax_liens</th>\n",
+       "      <th>hardship_flag</th>\n",
+       "      <th>disbursement_method</th>\n",
+       "      <th>year</th>\n",
+       "      <th>earliest_cr_year</th>\n",
+       "      <th>region</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>371269</th>\n",
+       "      <td>28000</td>\n",
+       "      <td>28000</td>\n",
+       "      <td>28000.0</td>\n",
+       "      <td>60</td>\n",
+       "      <td>16.55</td>\n",
+       "      <td>689.12</td>\n",
+       "      <td>D</td>\n",
+       "      <td>D2</td>\n",
+       "      <td>0</td>\n",
+       "      <td>OWN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>N</td>\n",
+       "      <td>Cash</td>\n",
+       "      <td>2015</td>\n",
+       "      <td>1997</td>\n",
+       "      <td>West</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>413904</th>\n",
+       "      <td>15000</td>\n",
+       "      <td>15000</td>\n",
+       "      <td>15000.0</td>\n",
+       "      <td>36</td>\n",
+       "      <td>6.03</td>\n",
+       "      <td>456.54</td>\n",
+       "      <td>A</td>\n",
+       "      <td>A1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MORTGAGE</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>N</td>\n",
+       "      <td>Cash</td>\n",
+       "      <td>2015</td>\n",
+       "      <td>2003</td>\n",
+       "      <td>NorthEast</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>390183</th>\n",
+       "      <td>8000</td>\n",
+       "      <td>8000</td>\n",
+       "      <td>8000.0</td>\n",
+       "      <td>36</td>\n",
+       "      <td>17.86</td>\n",
+       "      <td>288.66</td>\n",
+       "      <td>D</td>\n",
+       "      <td>D5</td>\n",
+       "      <td>0</td>\n",
+       "      <td>RENT</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>N</td>\n",
+       "      <td>Cash</td>\n",
+       "      <td>2015</td>\n",
+       "      <td>2000</td>\n",
+       "      <td>West</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>171520</th>\n",
+       "      <td>4750</td>\n",
+       "      <td>4750</td>\n",
+       "      <td>4750.0</td>\n",
+       "      <td>36</td>\n",
+       "      <td>16.99</td>\n",
+       "      <td>169.33</td>\n",
+       "      <td>D</td>\n",
+       "      <td>D3</td>\n",
+       "      <td>2</td>\n",
+       "      <td>OWN</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>N</td>\n",
+       "      <td>Cash</td>\n",
+       "      <td>2015</td>\n",
+       "      <td>1992</td>\n",
+       "      <td>SouthEast</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>79658</th>\n",
+       "      <td>3950</td>\n",
+       "      <td>3950</td>\n",
+       "      <td>3950.0</td>\n",
+       "      <td>36</td>\n",
+       "      <td>10.99</td>\n",
+       "      <td>129.30</td>\n",
+       "      <td>B</td>\n",
+       "      <td>B4</td>\n",
+       "      <td>4</td>\n",
+       "      <td>RENT</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>N</td>\n",
+       "      <td>Cash</td>\n",
+       "      <td>2015</td>\n",
+       "      <td>2010</td>\n",
+       "      <td>MidWest</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>397229</th>\n",
+       "      <td>26500</td>\n",
+       "      <td>26500</td>\n",
+       "      <td>26500.0</td>\n",
+       "      <td>60</td>\n",
+       "      <td>8.67</td>\n",
+       "      <td>545.87</td>\n",
+       "      <td>B</td>\n",
+       "      <td>B1</td>\n",
+       "      <td>10</td>\n",
+       "      <td>MORTGAGE</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>N</td>\n",
+       "      <td>Cash</td>\n",
+       "      <td>2015</td>\n",
+       "      <td>1994</td>\n",
+       "      <td>SouthEast</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>110958</th>\n",
+       "      <td>18900</td>\n",
+       "      <td>18900</td>\n",
+       "      <td>18900.0</td>\n",
+       "      <td>36</td>\n",
+       "      <td>5.32</td>\n",
+       "      <td>569.17</td>\n",
+       "      <td>A</td>\n",
+       "      <td>A1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>RENT</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>N</td>\n",
+       "      <td>Cash</td>\n",
+       "      <td>2015</td>\n",
+       "      <td>1999</td>\n",
+       "      <td>SouthEast</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>91253</th>\n",
+       "      <td>18000</td>\n",
+       "      <td>18000</td>\n",
+       "      <td>18000.0</td>\n",
+       "      <td>60</td>\n",
+       "      <td>11.53</td>\n",
+       "      <td>396.14</td>\n",
+       "      <td>B</td>\n",
+       "      <td>B5</td>\n",
+       "      <td>0</td>\n",
+       "      <td>RENT</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>N</td>\n",
+       "      <td>Cash</td>\n",
+       "      <td>2015</td>\n",
+       "      <td>1987</td>\n",
+       "      <td>NorthEast</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>61757</th>\n",
+       "      <td>14450</td>\n",
+       "      <td>14450</td>\n",
+       "      <td>14375.0</td>\n",
+       "      <td>60</td>\n",
+       "      <td>23.99</td>\n",
+       "      <td>415.62</td>\n",
+       "      <td>F</td>\n",
+       "      <td>F3</td>\n",
+       "      <td>1</td>\n",
+       "      <td>MORTGAGE</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>N</td>\n",
+       "      <td>Cash</td>\n",
+       "      <td>2015</td>\n",
+       "      <td>2006</td>\n",
+       "      <td>MidWest</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>113397</th>\n",
+       "      <td>12600</td>\n",
+       "      <td>12600</td>\n",
+       "      <td>12600.0</td>\n",
+       "      <td>36</td>\n",
+       "      <td>9.99</td>\n",
+       "      <td>406.51</td>\n",
+       "      <td>B</td>\n",
+       "      <td>B3</td>\n",
+       "      <td>10</td>\n",
+       "      <td>MORTGAGE</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Individual</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>N</td>\n",
+       "      <td>Cash</td>\n",
+       "      <td>2015</td>\n",
+       "      <td>1993</td>\n",
+       "      <td>NorthEast</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>336876 rows × 43 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        loan_amount  funded_amount  investor_funds  term  interest_rate  \\\n",
+       "371269        28000          28000         28000.0    60          16.55   \n",
+       "413904        15000          15000         15000.0    36           6.03   \n",
+       "390183         8000           8000          8000.0    36          17.86   \n",
+       "171520         4750           4750          4750.0    36          16.99   \n",
+       "79658          3950           3950          3950.0    36          10.99   \n",
+       "...             ...            ...             ...   ...            ...   \n",
+       "397229        26500          26500         26500.0    60           8.67   \n",
+       "110958        18900          18900         18900.0    36           5.32   \n",
+       "91253         18000          18000         18000.0    60          11.53   \n",
+       "61757         14450          14450         14375.0    60          23.99   \n",
+       "113397        12600          12600         12600.0    36           9.99   \n",
+       "\n",
+       "        installment grade sub_grade  emp_length home_ownership  ...  \\\n",
+       "371269       689.12     D        D2           0            OWN  ...   \n",
+       "413904       456.54     A        A1           1       MORTGAGE  ...   \n",
+       "390183       288.66     D        D5           0           RENT  ...   \n",
+       "171520       169.33     D        D3           2            OWN  ...   \n",
+       "79658        129.30     B        B4           4           RENT  ...   \n",
+       "...             ...   ...       ...         ...            ...  ...   \n",
+       "397229       545.87     B        B1          10       MORTGAGE  ...   \n",
+       "110958       569.17     A        A1           0           RENT  ...   \n",
+       "91253        396.14     B        B5           0           RENT  ...   \n",
+       "61757        415.62     F        F3           1       MORTGAGE  ...   \n",
+       "113397       406.51     B        B3          10       MORTGAGE  ...   \n",
+       "\n",
+       "        application_type acc_now_delinq chargeoff_within_12_mths delinq_amnt  \\\n",
+       "371269        Individual              0                        0           0   \n",
+       "413904        Individual              0                        0           0   \n",
+       "390183        Individual              0                        0           0   \n",
+       "171520        Individual              0                        0           0   \n",
+       "79658         Individual              0                        0           0   \n",
+       "...                  ...            ...                      ...         ...   \n",
+       "397229        Individual              0                        0           0   \n",
+       "110958        Individual              0                        0           0   \n",
+       "91253         Individual              0                        0           0   \n",
+       "61757         Individual              0                        0           0   \n",
+       "113397        Individual              0                        0           0   \n",
+       "\n",
+       "        tax_liens  hardship_flag  disbursement_method  year  earliest_cr_year  \\\n",
+       "371269          0              N                 Cash  2015              1997   \n",
+       "413904          0              N                 Cash  2015              2003   \n",
+       "390183          0              N                 Cash  2015              2000   \n",
+       "171520          0              N                 Cash  2015              1992   \n",
+       "79658           0              N                 Cash  2015              2010   \n",
+       "...           ...            ...                  ...   ...               ...   \n",
+       "397229          0              N                 Cash  2015              1994   \n",
+       "110958          0              N                 Cash  2015              1999   \n",
+       "91253           0              N                 Cash  2015              1987   \n",
+       "61757           0              N                 Cash  2015              2006   \n",
+       "113397          0              N                 Cash  2015              1993   \n",
+       "\n",
+       "           region  \n",
+       "371269       West  \n",
+       "413904  NorthEast  \n",
+       "390183       West  \n",
+       "171520  SouthEast  \n",
+       "79658     MidWest  \n",
+       "...           ...  \n",
+       "397229  SouthEast  \n",
+       "110958  SouthEast  \n",
+       "91253   NorthEast  \n",
+       "61757     MidWest  \n",
+       "113397  NorthEast  \n",
+       "\n",
+       "[336876 rows x 43 columns]"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.utils import shuffle\n",
+    "from sklearn.model_selection import StratifiedShuffleSplit\n",
+    "\n",
+    "input_file_path = \"/Users/abigailt/Desktop/Projects/mlPrivacy Projects/Minimization-Patent-DT/income/data/loan.csv\"\n",
+    "dataset = pd.read_csv(input_file_path, low_memory=False)\n",
+    "dataset = shuffle(dataset, random_state=14)\n",
+    "\n",
+    "dataset = modify_specific_features(dataset)\n",
+    "dataset = fill_missing(dataset)\n",
+    "dataset = modify_label(dataset)\n",
+    "dataset = remove_unwanted_features(dataset)\n",
+    "        \n",
+    "train, test, generalizer_train, _ = split_data(dataset, False)\n",
+    "\n",
+    "x_train = train.drop('label', axis=1)\n",
+    "y_train = train['label']\n",
+    "x_test = test.drop('label', axis=1)\n",
+    "y_test = test['label']\n",
+    "\n",
+    "x_train"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Train decision tree model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Base model accuracy:  0.9442762322041345\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "\n",
+    "features = x_train.columns\n",
+    "categorical_features = ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose',\n",
+    "                        'initial_list_status', 'application_type', 'hardship_flag', 'disbursement_method', 'region']\n",
+    "# QI parameter determines which features will be minimized.\n",
+    "QI = [\"annual_income\", \"zip_code\", \"dti\", \"last_pymnt_amnt\", \"total_rec_int\"]\n",
+    "\n",
+    "numeric_features = [f for f in features if f not in categorical_features]\n",
+    "numeric_transformer = Pipeline(\n",
+    "    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n",
+    ")\n",
+    "categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n",
+    "preprocessor = ColumnTransformer(\n",
+    "    transformers=[\n",
+    "        (\"num\", numeric_transformer, numeric_features),\n",
+    "        (\"cat\", categorical_transformer, categorical_features),\n",
+    "    ]\n",
+    ")\n",
+    "encoded_train = preprocessor.fit_transform(x_train)\n",
+    "model = DecisionTreeClassifier()\n",
+    "model.fit(encoded_train, y_train)\n",
+    "\n",
+    "encoded_test = preprocessor.transform(x_test)\n",
+    "print('Base model accuracy: ', model.score(encoded_test, y_test))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Run minimization\n",
+    "We will run minimization on data that includes categorical features, minimizing only a subset of the features (those listed in `QI`), and try different values of target accuracy (how close to the original model's accuracy we want to get; 1 means the same accuracy as on the original data)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250\n",
+      "Improving accuracy\n",
+      "feature to remove: zip_code\n",
+      "Removed feature: zip_code, new relative accuracy: 0.861250\n",
+      "feature to remove: total_rec_int\n",
+      "Removed feature: total_rec_int, new relative accuracy: 0.912500\n",
+      "feature to remove: last_pymnt_amnt\n",
+      "Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500\n",
+      "feature to remove: dti\n",
+      "Removed feature: dti, new relative accuracy: 0.995000\n",
+      "feature to remove: annual_income\n",
+      "Removed feature: annual_income, new relative accuracy: 1.000000\n",
+      "Accuracy on minimized data:  0.9425\n"
+     ]
+    }
+   ],
+   "source": [
+    "from apt.minimization import GeneralizeToRepresentative\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# default target_accuracy is 0.998\n",
+    "minimizer = GeneralizeToRepresentative(model, \n",
+    "                                       categorical_features=categorical_features, \n",
+    "                                       features_to_minimize=QI,\n",
+    "                                       encoder=preprocessor)\n",
+    "\n",
+    "# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
+    "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
+    "# data it could result in a larger gap)\n",
+    "# Don't forget to leave a hold-out set for final validation!\n",
+    "generalizer_train_small = generalizer_train[:2000]\n",
+    "x_test_small = x_test[:2000]\n",
+    "y_test_small = y_test[:2000]\n",
+    "X_generalizer_train = generalizer_train_small.drop('label', axis=1)\n",
+    "features_names = features.tolist()\n",
+    "\n",
+    "encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n",
+    "x_train_predictions = model.predict(encoded_generalizer_train)\n",
+    "minimizer.fit(X_generalizer_train, x_train_predictions, features_names=features_names)\n",
+    "transformed = minimizer.transform(x_test_small, features_names=features_names)\n",
+    "\n",
+    "encoded_transformed = preprocessor.transform(transformed)\n",
+    "print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test_small))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Let's see what features were generalized"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'ranges': {}, 'categories': {}, 'untouched': ['region', 'interest_rate', 'total_rec_late_fee', 'acc_now_delinq', 'home_ownership', 'dti', 'earliest_cr_year', 'last_pymnt_amnt', 'revol_bal', 'pymnt_plan', 'pub_rec', 'out_prncp', 'loan_amount', 'delinq_2yrs', 'mths_since_last_delinq', 'verification_status', 'investor_funds', 'purpose', 'mths_since_last_record', 'tax_liens', 'grade', 'annual_income', 'sub_grade', 'inq_last_6mths', 'total_acc', 'initial_list_status', 'zip_code', 'total_rec_int', 'application_type', 'emp_length', 'hardship_flag', 'revol_util', 'policy_code', 'term', 'installment', 'open_acc', 'out_prncp_inv', 'chargeoff_within_12_mths', 'delinq_amnt', 'funded_amount', 'collections_12_mths_ex_med', 'disbursement_method', 'year']}\n"
+     ]
+    }
+   ],
+   "source": [
+    "generalizations = minimizer.generalizations\n",
+    "print(generalizations)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can see that with the default target accuracy (0.998 relative to the original model's accuracy), no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
+    "\n",
+    "Let's change to a slightly lower target accuracy."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250\n",
+      "Improving accuracy\n",
+      "feature to remove: zip_code\n",
+      "Removed feature: zip_code, new relative accuracy: 0.861250\n",
+      "feature to remove: total_rec_int\n",
+      "Removed feature: total_rec_int, new relative accuracy: 0.912500\n",
+      "feature to remove: last_pymnt_amnt\n",
+      "Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500\n",
+      "Accuracy on minimized data:  0.933\n",
+      "{'ranges': {'annual_income': [10500.0, 18000.0, 28000.0, 35250.0, 36500.0, 37500.0, 40500.0, 43750.0, 48600.0, 49650.0, 50800.0, 51000.0, 53000.0, 54500.0, 55280.0, 56500.0, 56712.5, 61000.0, 66994.5, 69494.5, 70500.0, 75000.0, 82500.0, 84500.0, 90000.0, 91000.0, 95000.0, 127500.0, 135000.0, 141500.0, 179500.0, 297679.5], 'dti': [4.054999828338623, 8.869999885559082, 12.130000114440918, 14.735000133514404, 15.625, 15.84000015258789, 15.984999656677246, 17.34500026702881, 17.664999961853027, 18.954999923706055, 19.020000457763672, 19.3700008392334, 19.545000076293945, 20.65999984741211, 20.78499984741211, 21.260000228881836, 22.40000057220459, 22.984999656677246, 23.179999351501465, 23.1850004196167, 26.139999389648438, 26.460000038146973, 28.050000190734863, 28.375, 28.894999504089355, 30.414999961853027, 33.85000038146973, 34.46500015258789, 36.720001220703125]}, 'categories': {}, 'untouched': ['region', 'interest_rate', 'total_rec_late_fee', 'acc_now_delinq', 'home_ownership', 'earliest_cr_year', 'last_pymnt_amnt', 'revol_bal', 'pymnt_plan', 'pub_rec', 'out_prncp', 'loan_amount', 'delinq_2yrs', 'mths_since_last_delinq', 'verification_status', 'investor_funds', 'purpose', 'mths_since_last_record', 'tax_liens', 'grade', 'sub_grade', 'inq_last_6mths', 'total_acc', 'initial_list_status', 'zip_code', 'total_rec_int', 'application_type', 'emp_length', 'hardship_flag', 'revol_util', 'policy_code', 'term', 'installment', 'open_acc', 'out_prncp_inv', 'chargeoff_within_12_mths', 'delinq_amnt', 'funded_amount', 'collections_12_mths_ex_med', 'disbursement_method', 'year']}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# We allow a 2% deviation in accuracy from the original model accuracy\n",
+    "minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.98, \n",
+    "                                        categorical_features=categorical_features, \n",
+    "                                        features_to_minimize=QI,\n",
+    "                                        encoder=preprocessor)\n",
+    "\n",
+    "minimizer2.fit(X_generalizer_train, x_train_predictions, features_names=features_names)\n",
+    "transformed2 = minimizer2.transform(x_test_small, features_names=features_names)\n",
+    "\n",
+    "encoded_transformed2 = preprocessor.transform(transformed2)\n",
+    "print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test_small))\n",
+    "generalizations2 = minimizer2.generalizations\n",
+    "print(generalizations2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Two features are generalized: annual income (`annual_income`) and debt-to-income ratio (`dti`)."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}