ai-privacy-toolkit/notebooks/minimization_loan.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Applying data minimization to loans data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n",
    "\n",
    "This will be demonstarted using the Loans dataset."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and preprocess data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "sys.path.insert(0, os.path.abspath('..'))\n",
    "\n",
    "def modify_specific_features(data):\n",
    "    data = data.rename(\n",
    "        columns={\"loan_amnt\": \"loan_amount\", \"funded_amnt\": \"funded_amount\", \"funded_amnt_inv\": \"investor_funds\",\n",
    "                 \"int_rate\": \"interest_rate\", \"annual_inc\": \"annual_income\"})\n",
    "\n",
    "    date_format = '%b-%Y'\n",
    "    dt_series = pd.to_datetime(data['issue_d'], format=date_format)\n",
    "    data['year'] = dt_series.dt.year\n",
    "\n",
    "    dt_series = pd.to_datetime(data['earliest_cr_line'], format=date_format)\n",
    "    data['earliest_cr_year'] = dt_series.dt.year\n",
    "\n",
    "    dt_series = pd.to_datetime(data['last_credit_pull_d'], format=date_format)\n",
    "    data['last_credit_pull_year'] = dt_series.dt.year\n",
    "\n",
    "    # TODO: Maybe year is not enough, we may want time since last payment? Or some other diff (time between last payment and X)\n",
    "    dt_series = pd.to_datetime(data['last_pymnt_d'], format=date_format)\n",
    "    data['last_pymnt_year'] = dt_series.dt.year\n",
    "\n",
    "    data['interest_rate'] = data['interest_rate'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n",
    "\n",
    "    data['revol_util'] = data['revol_util'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n",
    "\n",
    "    data['term'] = data['term'].apply(lambda x: x if type(x) == int else int(x[:-7]))\n",
    "\n",
    "    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x + 's' if x.endswith('year') else x)\n",
    "    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x[:-6])\n",
    "    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 0.5 if '<' in x else x)\n",
    "    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 10 if '+' in x else x)\n",
    "    data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: int(x) if x.isnumeric() else 0)\n",
    "\n",
    "    data['zip_code'] = data['zip_code'].apply(lambda x: x if type(x) == int else int(x[:-2]))\n",
    "\n",
    "    west = ['CA', 'OR', 'UT', 'WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']\n",
    "    south_west = ['AZ', 'TX', 'NM', 'OK']\n",
    "    south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN']\n",
    "    mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']\n",
    "    north_east = ['CT', 'NY', 'PA', 'NJ', 'RI', 'MA', 'MD', 'VT', 'NH', 'ME']\n",
    "\n",
    "    data['region'] = np.nan\n",
    "\n",
    "    def finding_regions(state):\n",
    "        if state in west:\n",
    "            return 'West'\n",
    "        elif state in south_west:\n",
    "            return 'SouthWest'\n",
    "        elif state in south_east:\n",
    "            return 'SouthEast'\n",
    "        elif state in mid_west:\n",
    "            return 'MidWest'\n",
    "        elif state in north_east:\n",
    "            return 'NorthEast'\n",
    "\n",
    "    data['region'] = data['addr_state'].apply(finding_regions)\n",
    "    return data\n",
    "\n",
    "def fill_missing(data):\n",
    "    for col in ('dti_joint', 'annual_inc_joint', 'il_util', 'mths_since_rcnt_il', 'open_acc_6m', 'open_il_12m',\n",
    "                'open_il_24m', 'inq_last_12m', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'inq_fi',\n",
    "                'total_cu_tl', 'loan_amount', 'funded_amount', 'investor_funds', 'term', 'interest_rate',\n",
    "                'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq', 'total_bal_il',\n",
    "                'tot_coll_amt', 'installment', 'emp_length', 'annual_income', 'zip_code', 'delinq_2yrs',\n",
    "                'tot_cur_bal', 'total_rev_hi_lim', 'revol_util', 'collections_12_mths_ex_med', 'open_acc',\n",
    "                'inq_last_6mths', 'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_rec_int',\n",
    "                'verification_status_joint', 'acc_now_delinq', 'settlement_amount', 'settlement_percentage',\n",
    "                'settlement_term', 'dti', 'total_rec_late_fee', 'policy_code', 'chargeoff_within_12_mths', 'total_rec_int',\n",
    "                'last_credit_pull_year', 'last_pymnt_year', 'delinq_amnt', 'tax_liens', 'year', 'earliest_cr_year'):\n",
    "        try:\n",
    "            data[col] = data[col].fillna(0)\n",
    "        except KeyError:\n",
    "            print('missing column ' + col)\n",
    "\n",
    "    for col in ('settlement_status', 'emp_title', 'region'):\n",
    "        try:\n",
    "            data[col] = data[col].fillna('NA')\n",
    "        except KeyError:\n",
    "            print('missing column ' + col)\n",
    "            \n",
    "    return data\n",
    "\n",
    "def modify_label(data):\n",
    "    bad_loan = [\"Charged Off\", \"Default\", \"Does not meet the credit policy. Status:Charged Off\", \"In Grace Period\",\n",
    "                \"Late (16-30 days)\", \"Late (31-120 days)\"]\n",
    "\n",
    "    data['label'] = np.nan\n",
    "\n",
    "    def loan_condition(status):\n",
    "        if status in bad_loan:\n",
    "            return 0\n",
    "        else:\n",
    "            return 1\n",
    "\n",
    "    data['label'] = data['loan_status'].apply(loan_condition)\n",
    "    return data\n",
    "\n",
    "def remove_unwanted_features(data):\n",
    "    features_to_remove = ['loan_status', 'id', 'member_id', 'url', 'next_pymnt_d', 'mths_since_last_major_derog', 'annual_inc_joint',\n",
    "              'dti_joint', 'verification_status_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il',\n",
    "              'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',\n",
    "              'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',\n",
    "              'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct',\n",
    "              'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',\n",
    "              'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',\n",
    "              'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',\n",
    "              'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',\n",
    "              'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tot_hi_cred_lim',\n",
    "              'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'revol_bal_joint', 'sec_app_earliest_cr_line',\n",
    "              'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il',\n",
    "              'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med',\n",
    "              'sec_app_mths_since_last_major_derog', 'hardship_type', 'hardship_reason', 'hardship_status',\n",
    "              'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date',\n",
    "              'hardship_length', 'hardship_dpd', 'hardship_loan_status', 'orig_projected_additional_accrued_interest',\n",
    "              'hardship_payoff_balance_amount', 'hardship_last_payment_amount', 'mths_since_recent_bc','desc', 'emp_title', 'title',\n",
    "              'issue_d', 'earliest_cr_line', 'last_credit_pull_d',\n",
    "              'debt_settlement_flag_date', 'settlement_date', 'last_pymnt_d', 'recoveries', 'collection_recovery_fee', 'total_rec_prncp',\n",
    "              'last_pymnt_year', 'last_credit_pull_year', 'total_pymnt', 'total_pymnt_inv', 'debt_settlement_flag', 'settlement_status',\n",
    "              'settlement_amount', 'settlement_percentage', 'settlement_term', 'addr_state']\n",
    "    return data.drop(features_to_remove, axis=1)\n",
    "\n",
    "def split_data(data, create_validation):\n",
    "    # divide into 3 datasets for training, validation and test\n",
    "    stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
    "    for train_set, test_set in stratified.split(data, data['label']):\n",
    "        train = data.iloc[train_set]\n",
    "        test = data.iloc[test_set] # 20% of data\n",
    "\n",
    "    stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.9, random_state=42)\n",
    "    for train_set, test_set in stratified.split(train, train['label']):\n",
    "        model_train = data.iloc[train_set] # 40% of data\n",
    "        generalizer_train = data.iloc[test_set] # 40% of data\n",
    "\n",
    "    validation = None\n",
    "    if create_validation:\n",
    "        stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)\n",
    "        for train_set, test_set in stratified.split(generalizer_train, generalizer_train['label']):\n",
    "            generalizer_train = data.iloc[train_set]  # 30% of data\n",
    "            validation = data.iloc[test_set]  # 10% of data\n",
    "    return train, test, generalizer_train, validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>loan_amount</th>\n",
       "      <th>funded_amount</th>\n",
       "      <th>investor_funds</th>\n",
       "      <th>term</th>\n",
       "      <th>interest_rate</th>\n",
       "      <th>installment</th>\n",
       "      <th>grade</th>\n",
       "      <th>sub_grade</th>\n",
       "      <th>emp_length</th>\n",
       "      <th>home_ownership</th>\n",
       "      <th>...</th>\n",
       "      <th>application_type</th>\n",
       "      <th>acc_now_delinq</th>\n",
       "      <th>chargeoff_within_12_mths</th>\n",
       "      <th>delinq_amnt</th>\n",
       "      <th>tax_liens</th>\n",
       "      <th>hardship_flag</th>\n",
       "      <th>disbursement_method</th>\n",
       "      <th>year</th>\n",
       "      <th>earliest_cr_year</th>\n",
       "      <th>region</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>371269</th>\n",
       "      <td>28000</td>\n",
       "      <td>28000</td>\n",
       "      <td>28000.0</td>\n",
       "      <td>60</td>\n",
       "      <td>16.55</td>\n",
       "      <td>689.12</td>\n",
       "      <td>D</td>\n",
       "      <td>D2</td>\n",
       "      <td>0</td>\n",
       "      <td>OWN</td>\n",
       "      <td>...</td>\n",
       "      <td>Individual</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>N</td>\n",
       "      <td>Cash</td>\n",
       "      <td>2015</td>\n",
       "      <td>1997</td>\n",
       "      <td>West</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>413904</th>\n",
       "      <td>15000</td>\n",
       "      <td>15000</td>\n",
       "      <td>15000.0</td>\n",
       "      <td>36</td>\n",
       "      <td>6.03</td>\n",
       "      <td>456.54</td>\n",
       "      <td>A</td>\n",
       "      <td>A1</td>\n",
       "      <td>1</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>...</td>\n",
       "      <td>Individual</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>N</td>\n",
       "      <td>Cash</td>\n",
       "      <td>2015</td>\n",
       "      <td>2003</td>\n",
       "      <td>NorthEast</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>390183</th>\n",
       "      <td>8000</td>\n",
       "      <td>8000</td>\n",
       "      <td>8000.0</td>\n",
       "      <td>36</td>\n",
       "      <td>17.86</td>\n",
       "      <td>288.66</td>\n",
       "      <td>D</td>\n",
       "      <td>D5</td>\n",
       "      <td>0</td>\n",
       "      <td>RENT</td>\n",
       "      <td>...</td>\n",
       "      <td>Individual</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>N</td>\n",
       "      <td>Cash</td>\n",
       "      <td>2015</td>\n",
       "      <td>2000</td>\n",
       "      <td>West</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>171520</th>\n",
       "      <td>4750</td>\n",
       "      <td>4750</td>\n",
       "      <td>4750.0</td>\n",
       "      <td>36</td>\n",
       "      <td>16.99</td>\n",
       "      <td>169.33</td>\n",
       "      <td>D</td>\n",
       "      <td>D3</td>\n",
       "      <td>2</td>\n",
       "      <td>OWN</td>\n",
       "      <td>...</td>\n",
       "      <td>Individual</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>N</td>\n",
       "      <td>Cash</td>\n",
       "      <td>2015</td>\n",
       "      <td>1992</td>\n",
       "      <td>SouthEast</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79658</th>\n",
       "      <td>3950</td>\n",
       "      <td>3950</td>\n",
       "      <td>3950.0</td>\n",
       "      <td>36</td>\n",
       "      <td>10.99</td>\n",
       "      <td>129.30</td>\n",
       "      <td>B</td>\n",
       "      <td>B4</td>\n",
       "      <td>4</td>\n",
       "      <td>RENT</td>\n",
       "      <td>...</td>\n",
       "      <td>Individual</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>N</td>\n",
       "      <td>Cash</td>\n",
       "      <td>2015</td>\n",
       "      <td>2010</td>\n",
       "      <td>MidWest</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>397229</th>\n",
       "      <td>26500</td>\n",
       "      <td>26500</td>\n",
       "      <td>26500.0</td>\n",
       "      <td>60</td>\n",
       "      <td>8.67</td>\n",
       "      <td>545.87</td>\n",
       "      <td>B</td>\n",
       "      <td>B1</td>\n",
       "      <td>10</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>...</td>\n",
       "      <td>Individual</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>N</td>\n",
       "      <td>Cash</td>\n",
       "      <td>2015</td>\n",
       "      <td>1994</td>\n",
       "      <td>SouthEast</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110958</th>\n",
       "      <td>18900</td>\n",
       "      <td>18900</td>\n",
       "      <td>18900.0</td>\n",
       "      <td>36</td>\n",
       "      <td>5.32</td>\n",
       "      <td>569.17</td>\n",
       "      <td>A</td>\n",
       "      <td>A1</td>\n",
       "      <td>0</td>\n",
       "      <td>RENT</td>\n",
       "      <td>...</td>\n",
       "      <td>Individual</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>N</td>\n",
       "      <td>Cash</td>\n",
       "      <td>2015</td>\n",
       "      <td>1999</td>\n",
       "      <td>SouthEast</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91253</th>\n",
       "      <td>18000</td>\n",
       "      <td>18000</td>\n",
       "      <td>18000.0</td>\n",
       "      <td>60</td>\n",
       "      <td>11.53</td>\n",
       "      <td>396.14</td>\n",
       "      <td>B</td>\n",
       "      <td>B5</td>\n",
       "      <td>0</td>\n",
       "      <td>RENT</td>\n",
       "      <td>...</td>\n",
       "      <td>Individual</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>N</td>\n",
       "      <td>Cash</td>\n",
       "      <td>2015</td>\n",
       "      <td>1987</td>\n",
       "      <td>NorthEast</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61757</th>\n",
       "      <td>14450</td>\n",
       "      <td>14450</td>\n",
       "      <td>14375.0</td>\n",
       "      <td>60</td>\n",
       "      <td>23.99</td>\n",
       "      <td>415.62</td>\n",
       "      <td>F</td>\n",
       "      <td>F3</td>\n",
       "      <td>1</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>...</td>\n",
       "      <td>Individual</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>N</td>\n",
       "      <td>Cash</td>\n",
       "      <td>2015</td>\n",
       "      <td>2006</td>\n",
       "      <td>MidWest</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113397</th>\n",
       "      <td>12600</td>\n",
       "      <td>12600</td>\n",
       "      <td>12600.0</td>\n",
       "      <td>36</td>\n",
       "      <td>9.99</td>\n",
       "      <td>406.51</td>\n",
       "      <td>B</td>\n",
       "      <td>B3</td>\n",
       "      <td>10</td>\n",
       "      <td>MORTGAGE</td>\n",
       "      <td>...</td>\n",
       "      <td>Individual</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>N</td>\n",
       "      <td>Cash</td>\n",
       "      <td>2015</td>\n",
       "      <td>1993</td>\n",
       "      <td>NorthEast</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>336876 rows × 43 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        loan_amount  funded_amount  investor_funds  term  interest_rate  \\\n",
       "371269        28000          28000         28000.0    60          16.55   \n",
       "413904        15000          15000         15000.0    36           6.03   \n",
       "390183         8000           8000          8000.0    36          17.86   \n",
       "171520         4750           4750          4750.0    36          16.99   \n",
       "79658          3950           3950          3950.0    36          10.99   \n",
       "...             ...            ...             ...   ...            ...   \n",
       "397229        26500          26500         26500.0    60           8.67   \n",
       "110958        18900          18900         18900.0    36           5.32   \n",
       "91253         18000          18000         18000.0    60          11.53   \n",
       "61757         14450          14450         14375.0    60          23.99   \n",
       "113397        12600          12600         12600.0    36           9.99   \n",
       "\n",
       "        installment grade sub_grade  emp_length home_ownership  ...  \\\n",
       "371269       689.12     D        D2           0            OWN  ...   \n",
       "413904       456.54     A        A1           1       MORTGAGE  ...   \n",
       "390183       288.66     D        D5           0           RENT  ...   \n",
       "171520       169.33     D        D3           2            OWN  ...   \n",
       "79658        129.30     B        B4           4           RENT  ...   \n",
       "...             ...   ...       ...         ...            ...  ...   \n",
       "397229       545.87     B        B1          10       MORTGAGE  ...   \n",
       "110958       569.17     A        A1           0           RENT  ...   \n",
       "91253        396.14     B        B5           0           RENT  ...   \n",
       "61757        415.62     F        F3           1       MORTGAGE  ...   \n",
       "113397       406.51     B        B3          10       MORTGAGE  ...   \n",
       "\n",
       "        application_type acc_now_delinq chargeoff_within_12_mths delinq_amnt  \\\n",
       "371269        Individual              0                        0           0   \n",
       "413904        Individual              0                        0           0   \n",
       "390183        Individual              0                        0           0   \n",
       "171520        Individual              0                        0           0   \n",
       "79658         Individual              0                        0           0   \n",
       "...                  ...            ...                      ...         ...   \n",
       "397229        Individual              0                        0           0   \n",
       "110958        Individual              0                        0           0   \n",
       "91253         Individual              0                        0           0   \n",
       "61757         Individual              0                        0           0   \n",
       "113397        Individual              0                        0           0   \n",
       "\n",
       "        tax_liens  hardship_flag  disbursement_method  year  earliest_cr_year  \\\n",
       "371269          0              N                 Cash  2015              1997   \n",
       "413904          0              N                 Cash  2015              2003   \n",
       "390183          0              N                 Cash  2015              2000   \n",
       "171520          0              N                 Cash  2015              1992   \n",
       "79658           0              N                 Cash  2015              2010   \n",
       "...           ...            ...                  ...   ...               ...   \n",
       "397229          0              N                 Cash  2015              1994   \n",
       "110958          0              N                 Cash  2015              1999   \n",
       "91253           0              N                 Cash  2015              1987   \n",
       "61757           0              N                 Cash  2015              2006   \n",
       "113397          0              N                 Cash  2015              1993   \n",
       "\n",
       "           region  \n",
       "371269       West  \n",
       "413904  NorthEast  \n",
       "390183       West  \n",
       "171520  SouthEast  \n",
       "79658     MidWest  \n",
       "...           ...  \n",
       "397229  SouthEast  \n",
       "110958  SouthEast  \n",
       "91253   NorthEast  \n",
       "61757     MidWest  \n",
       "113397  NorthEast  \n",
       "\n",
       "[336876 rows x 43 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.utils import shuffle\n",
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
    "\n",
    "input_file_path = \"/Users/abigailt/Desktop/Projects/mlPrivacy Projects/Minimization-Patent-DT/income/data/loan.csv\"\n",
    "dataset = pd.read_csv(input_file_path, low_memory=False)\n",
    "dataset = shuffle(dataset, random_state=14)\n",
    "\n",
    "dataset = modify_specific_features(dataset)\n",
    "dataset = fill_missing(dataset)\n",
    "dataset = modify_label(dataset)\n",
    "dataset = remove_unwanted_features(dataset)\n",
    "        \n",
    "train, test, generalizer_train, _ = split_data(dataset, False)\n",
    "\n",
    "x_train = train.drop('label', axis=1)\n",
    "y_train = train['label']\n",
    "x_test = test.drop('label', axis=1)\n",
    "y_test = test['label']\n",
    "\n",
    "x_train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train decision tree model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Base model accuracy:  0.9442762322041345\n"
     ]
    }
   ],
   "source": [
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "features = x_train.columns\n",
    "categorical_features = ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose',\n",
    "                        'initial_list_status', 'application_type', 'hardship_flag', 'disbursement_method', 'region']\n",
    "# QI parameter determines which features will be minimized.\n",
    "QI = [\"annual_income\", \"zip_code\", \"dti\", \"last_pymnt_amnt\", \"total_rec_int\"]\n",
    "\n",
    "numeric_features = [f for f in features if f not in categorical_features]\n",
    "numeric_transformer = Pipeline(\n",
    "    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n",
    ")\n",
    "categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        (\"num\", numeric_transformer, numeric_features),\n",
    "        (\"cat\", categorical_transformer, categorical_features),\n",
    "    ]\n",
    ")\n",
    "encoded_train = preprocessor.fit_transform(x_train)\n",
    "model = DecisionTreeClassifier()\n",
    "model.fit(encoded_train, y_train)\n",
    "\n",
    "encoded_test = preprocessor.transform(x_test)\n",
    "print('Base model accuracy: ', model.score(encoded_test, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run minimization\n",
    "We will try to run minimization with categorical features and only a subset of the features with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250\n",
      "Improving accuracy\n",
      "feature to remove: zip_code\n",
      "Removed feature: zip_code, new relative accuracy: 0.861250\n",
      "feature to remove: total_rec_int\n",
      "Removed feature: total_rec_int, new relative accuracy: 0.912500\n",
      "feature to remove: last_pymnt_amnt\n",
      "Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500\n",
      "feature to remove: dti\n",
      "Removed feature: dti, new relative accuracy: 0.995000\n",
      "feature to remove: annual_income\n",
      "Removed feature: annual_income, new relative accuracy: 1.000000\n",
      "Accuracy on minimized data:  0.9425\n"
     ]
    }
   ],
   "source": [
    "from apt.minimization import GeneralizeToRepresentative\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# default target_accuracy is 0.998\n",
    "minimizer = GeneralizeToRepresentative(model, \n",
    "                                       categorical_features=categorical_features, \n",
    "                                       features_to_minimize=QI,\n",
    "                                       encoder=preprocessor)\n",
    "\n",
    "# Fitting the minimizar can be done either on training or test data. Doing it with test data is better as the\n",
    "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
    "# data it could result in a larger gap)\n",
    "# Don't forget to leave a hold-out set for final validation!\n",
    "generalizer_train_small = generalizer_train[:2000]\n",
    "x_test_small = x_test[:2000]\n",
    "y_test_small = y_test[:2000]\n",
    "X_generalizer_train = generalizer_train_small.drop('label', axis=1)\n",
    "features_names = features.tolist()\n",
    "\n",
    "encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n",
    "x_train_predictions = model.predict(encoded_generalizer_train)\n",
    "minimizer.fit(X_generalizer_train, x_train_predictions, features_names=features_names)\n",
    "transformed = minimizer.transform(x_test_small, features_names=features_names)\n",
    "\n",
    "encoded_transformed = preprocessor.transform(transformed)\n",
    "print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test_small))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Let's see what features were generalized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'ranges': {}, 'categories': {}, 'untouched': ['region', 'interest_rate', 'total_rec_late_fee', 'acc_now_delinq', 'home_ownership', 'dti', 'earliest_cr_year', 'last_pymnt_amnt', 'revol_bal', 'pymnt_plan', 'pub_rec', 'out_prncp', 'loan_amount', 'delinq_2yrs', 'mths_since_last_delinq', 'verification_status', 'investor_funds', 'purpose', 'mths_since_last_record', 'tax_liens', 'grade', 'annual_income', 'sub_grade', 'inq_last_6mths', 'total_acc', 'initial_list_status', 'zip_code', 'total_rec_int', 'application_type', 'emp_length', 'hardship_flag', 'revol_util', 'policy_code', 'term', 'installment', 'open_acc', 'out_prncp_inv', 'chargeoff_within_12_mths', 'delinq_amnt', 'funded_amount', 'collections_12_mths_ex_med', 'disbursement_method', 'year']}\n"
     ]
    }
   ],
   "source": [
    "generalizations = minimizer.generalizations\n",
    "print(generalizations)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
    "\n",
    "Let's change to a slightly lower target accuracy."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250\n",
      "Improving accuracy\n",
      "feature to remove: zip_code\n",
      "Removed feature: zip_code, new relative accuracy: 0.861250\n",
      "feature to remove: total_rec_int\n",
      "Removed feature: total_rec_int, new relative accuracy: 0.912500\n",
      "feature to remove: last_pymnt_amnt\n",
      "Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500\n",
      "Accuracy on minimized data:  0.933\n",
      "{'ranges': {'annual_income': [10500.0, 18000.0, 28000.0, 35250.0, 36500.0, 37500.0, 40500.0, 43750.0, 48600.0, 49650.0, 50800.0, 51000.0, 53000.0, 54500.0, 55280.0, 56500.0, 56712.5, 61000.0, 66994.5, 69494.5, 70500.0, 75000.0, 82500.0, 84500.0, 90000.0, 91000.0, 95000.0, 127500.0, 135000.0, 141500.0, 179500.0, 297679.5], 'dti': [4.054999828338623, 8.869999885559082, 12.130000114440918, 14.735000133514404, 15.625, 15.84000015258789, 15.984999656677246, 17.34500026702881, 17.664999961853027, 18.954999923706055, 19.020000457763672, 19.3700008392334, 19.545000076293945, 20.65999984741211, 20.78499984741211, 21.260000228881836, 22.40000057220459, 22.984999656677246, 23.179999351501465, 23.1850004196167, 26.139999389648438, 26.460000038146973, 28.050000190734863, 28.375, 28.894999504089355, 30.414999961853027, 33.85000038146973, 34.46500015258789, 36.720001220703125]}, 'categories': {}, 'untouched': ['region', 'interest_rate', 'total_rec_late_fee', 'acc_now_delinq', 'home_ownership', 'earliest_cr_year', 'last_pymnt_amnt', 'revol_bal', 'pymnt_plan', 'pub_rec', 'out_prncp', 'loan_amount', 'delinq_2yrs', 'mths_since_last_delinq', 'verification_status', 'investor_funds', 'purpose', 'mths_since_last_record', 'tax_liens', 'grade', 'sub_grade', 'inq_last_6mths', 'total_acc', 'initial_list_status', 'zip_code', 'total_rec_int', 'application_type', 'emp_length', 'hardship_flag', 'revol_util', 'policy_code', 'term', 'installment', 'open_acc', 'out_prncp_inv', 'chargeoff_within_12_mths', 'delinq_amnt', 'funded_amount', 'collections_12_mths_ex_med', 'disbursement_method', 'year']}\n"
     ]
    }
   ],
   "source": [
    "# We allow a 2% deviation in accuracy from the original model accuracy\n",
    "minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.98, \n",
    "                                        categorical_features=categorical_features, \n",
    "                                        features_to_minimize=QI,\n",
    "                                        encoder=preprocessor)\n",
    "\n",
    "minimizer2.fit(X_generalizer_train, x_train_predictions, features_names=features_names)\n",
    "transformed2 = minimizer2.transform(x_test_small, features_names=features_names)\n",
    "\n",
    "encoded_transformed2 = preprocessor.transform(transformed2)\n",
    "print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test_small))\n",
    "generalizations2 = minimizer2.generalizations\n",
    "print(generalizations2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Two features are generalized: annual income and debt to income ratio"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}