ai-privacy-toolkit/notebooks/minimization_loan.ipynb
abigailt 0672cd04b7 loan demo for paper
Signed-off-by: abigailt <abigailt@il.ibm.com>
2022-11-09 09:36:34 +02:00

829 lines
35 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Applying data minimization to loans data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n",
"\n",
"This will be demonstarted using the Loans dataset."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and preprocess data"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"sys.path.insert(0, os.path.abspath('..'))\n",
"\n",
"def modify_specific_features(data):\n",
" data = data.rename(\n",
" columns={\"loan_amnt\": \"loan_amount\", \"funded_amnt\": \"funded_amount\", \"funded_amnt_inv\": \"investor_funds\",\n",
" \"int_rate\": \"interest_rate\", \"annual_inc\": \"annual_income\"})\n",
"\n",
" date_format = '%b-%Y'\n",
" dt_series = pd.to_datetime(data['issue_d'], format=date_format)\n",
" data['year'] = dt_series.dt.year\n",
"\n",
" dt_series = pd.to_datetime(data['earliest_cr_line'], format=date_format)\n",
" data['earliest_cr_year'] = dt_series.dt.year\n",
"\n",
" dt_series = pd.to_datetime(data['last_credit_pull_d'], format=date_format)\n",
" data['last_credit_pull_year'] = dt_series.dt.year\n",
"\n",
" # TODO: Maybe year is not enough, we may want time since last payment? Or some other diff (time between last payment and X)\n",
" dt_series = pd.to_datetime(data['last_pymnt_d'], format=date_format)\n",
" data['last_pymnt_year'] = dt_series.dt.year\n",
"\n",
" data['interest_rate'] = data['interest_rate'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n",
"\n",
" data['revol_util'] = data['revol_util'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n",
"\n",
" data['term'] = data['term'].apply(lambda x: x if type(x) == int else int(x[:-7]))\n",
"\n",
" data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x + 's' if x.endswith('year') else x)\n",
" data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x[:-6])\n",
" data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 0.5 if '<' in x else x)\n",
" data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 10 if '+' in x else x)\n",
" data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: int(x) if x.isnumeric() else 0)\n",
"\n",
" data['zip_code'] = data['zip_code'].apply(lambda x: x if type(x) == int else int(x[:-2]))\n",
"\n",
" west = ['CA', 'OR', 'UT', 'WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']\n",
" south_west = ['AZ', 'TX', 'NM', 'OK']\n",
" south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN']\n",
" mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']\n",
" north_east = ['CT', 'NY', 'PA', 'NJ', 'RI', 'MA', 'MD', 'VT', 'NH', 'ME']\n",
"\n",
" data['region'] = np.nan\n",
"\n",
" def finding_regions(state):\n",
" if state in west:\n",
" return 'West'\n",
" elif state in south_west:\n",
" return 'SouthWest'\n",
" elif state in south_east:\n",
" return 'SouthEast'\n",
" elif state in mid_west:\n",
" return 'MidWest'\n",
" elif state in north_east:\n",
" return 'NorthEast'\n",
"\n",
" data['region'] = data['addr_state'].apply(finding_regions)\n",
" return data\n",
"\n",
"def fill_missing(data):\n",
" for col in ('dti_joint', 'annual_inc_joint', 'il_util', 'mths_since_rcnt_il', 'open_acc_6m', 'open_il_12m',\n",
" 'open_il_24m', 'inq_last_12m', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'inq_fi',\n",
" 'total_cu_tl', 'loan_amount', 'funded_amount', 'investor_funds', 'term', 'interest_rate',\n",
" 'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq', 'total_bal_il',\n",
" 'tot_coll_amt', 'installment', 'emp_length', 'annual_income', 'zip_code', 'delinq_2yrs',\n",
" 'tot_cur_bal', 'total_rev_hi_lim', 'revol_util', 'collections_12_mths_ex_med', 'open_acc',\n",
" 'inq_last_6mths', 'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_rec_int',\n",
" 'verification_status_joint', 'acc_now_delinq', 'settlement_amount', 'settlement_percentage',\n",
" 'settlement_term', 'dti', 'total_rec_late_fee', 'policy_code', 'chargeoff_within_12_mths', 'total_rec_int',\n",
" 'last_credit_pull_year', 'last_pymnt_year', 'delinq_amnt', 'tax_liens', 'year', 'earliest_cr_year'):\n",
" try:\n",
" data[col] = data[col].fillna(0)\n",
" except KeyError:\n",
" print('missing column ' + col)\n",
"\n",
" for col in ('settlement_status', 'emp_title', 'region'):\n",
" try:\n",
" data[col] = data[col].fillna('NA')\n",
" except KeyError:\n",
" print('missing column ' + col)\n",
" \n",
" return data\n",
"\n",
"def modify_label(data):\n",
" bad_loan = [\"Charged Off\", \"Default\", \"Does not meet the credit policy. Status:Charged Off\", \"In Grace Period\",\n",
" \"Late (16-30 days)\", \"Late (31-120 days)\"]\n",
"\n",
" data['label'] = np.nan\n",
"\n",
" def loan_condition(status):\n",
" if status in bad_loan:\n",
" return 0\n",
" else:\n",
" return 1\n",
"\n",
" data['label'] = data['loan_status'].apply(loan_condition)\n",
" return data\n",
"\n",
"def remove_unwanted_features(data):\n",
" features_to_remove = ['loan_status', 'id', 'member_id', 'url', 'next_pymnt_d', 'mths_since_last_major_derog', 'annual_inc_joint',\n",
" 'dti_joint', 'verification_status_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il',\n",
" 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',\n",
" 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',\n",
" 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct',\n",
" 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',\n",
" 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',\n",
" 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',\n",
" 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',\n",
" 'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tot_hi_cred_lim',\n",
" 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'revol_bal_joint', 'sec_app_earliest_cr_line',\n",
" 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il',\n",
" 'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med',\n",
" 'sec_app_mths_since_last_major_derog', 'hardship_type', 'hardship_reason', 'hardship_status',\n",
" 'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date',\n",
" 'hardship_length', 'hardship_dpd', 'hardship_loan_status', 'orig_projected_additional_accrued_interest',\n",
" 'hardship_payoff_balance_amount', 'hardship_last_payment_amount', 'mths_since_recent_bc','desc', 'emp_title', 'title',\n",
" 'issue_d', 'earliest_cr_line', 'last_credit_pull_d',\n",
" 'debt_settlement_flag_date', 'settlement_date', 'last_pymnt_d', 'recoveries', 'collection_recovery_fee', 'total_rec_prncp',\n",
" 'last_pymnt_year', 'last_credit_pull_year', 'total_pymnt', 'total_pymnt_inv', 'debt_settlement_flag', 'settlement_status',\n",
" 'settlement_amount', 'settlement_percentage', 'settlement_term', 'addr_state']\n",
" return data.drop(features_to_remove, axis=1)\n",
"\n",
"def split_data(data, create_validation):\n",
" # divide into 3 datasets for training, validation and test\n",
" stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
" for train_set, test_set in stratified.split(data, data['label']):\n",
" train = data.iloc[train_set]\n",
" test = data.iloc[test_set] # 20% of data\n",
"\n",
" stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.9, random_state=42)\n",
" for train_set, test_set in stratified.split(train, train['label']):\n",
" model_train = data.iloc[train_set] # 40% of data\n",
" generalizer_train = data.iloc[test_set] # 40% of data\n",
"\n",
" validation = None\n",
" if create_validation:\n",
" stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)\n",
" for train_set, test_set in stratified.split(generalizer_train, generalizer_train['label']):\n",
" generalizer_train = data.iloc[train_set] # 30% of data\n",
" validation = data.iloc[test_set] # 10% of data\n",
" return train, test, generalizer_train, validation"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>loan_amount</th>\n",
" <th>funded_amount</th>\n",
" <th>investor_funds</th>\n",
" <th>term</th>\n",
" <th>interest_rate</th>\n",
" <th>installment</th>\n",
" <th>grade</th>\n",
" <th>sub_grade</th>\n",
" <th>emp_length</th>\n",
" <th>home_ownership</th>\n",
" <th>...</th>\n",
" <th>application_type</th>\n",
" <th>acc_now_delinq</th>\n",
" <th>chargeoff_within_12_mths</th>\n",
" <th>delinq_amnt</th>\n",
" <th>tax_liens</th>\n",
" <th>hardship_flag</th>\n",
" <th>disbursement_method</th>\n",
" <th>year</th>\n",
" <th>earliest_cr_year</th>\n",
" <th>region</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>371269</th>\n",
" <td>28000</td>\n",
" <td>28000</td>\n",
" <td>28000.0</td>\n",
" <td>60</td>\n",
" <td>16.55</td>\n",
" <td>689.12</td>\n",
" <td>D</td>\n",
" <td>D2</td>\n",
" <td>0</td>\n",
" <td>OWN</td>\n",
" <td>...</td>\n",
" <td>Individual</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Cash</td>\n",
" <td>2015</td>\n",
" <td>1997</td>\n",
" <td>West</td>\n",
" </tr>\n",
" <tr>\n",
" <th>413904</th>\n",
" <td>15000</td>\n",
" <td>15000</td>\n",
" <td>15000.0</td>\n",
" <td>36</td>\n",
" <td>6.03</td>\n",
" <td>456.54</td>\n",
" <td>A</td>\n",
" <td>A1</td>\n",
" <td>1</td>\n",
" <td>MORTGAGE</td>\n",
" <td>...</td>\n",
" <td>Individual</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Cash</td>\n",
" <td>2015</td>\n",
" <td>2003</td>\n",
" <td>NorthEast</td>\n",
" </tr>\n",
" <tr>\n",
" <th>390183</th>\n",
" <td>8000</td>\n",
" <td>8000</td>\n",
" <td>8000.0</td>\n",
" <td>36</td>\n",
" <td>17.86</td>\n",
" <td>288.66</td>\n",
" <td>D</td>\n",
" <td>D5</td>\n",
" <td>0</td>\n",
" <td>RENT</td>\n",
" <td>...</td>\n",
" <td>Individual</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Cash</td>\n",
" <td>2015</td>\n",
" <td>2000</td>\n",
" <td>West</td>\n",
" </tr>\n",
" <tr>\n",
" <th>171520</th>\n",
" <td>4750</td>\n",
" <td>4750</td>\n",
" <td>4750.0</td>\n",
" <td>36</td>\n",
" <td>16.99</td>\n",
" <td>169.33</td>\n",
" <td>D</td>\n",
" <td>D3</td>\n",
" <td>2</td>\n",
" <td>OWN</td>\n",
" <td>...</td>\n",
" <td>Individual</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Cash</td>\n",
" <td>2015</td>\n",
" <td>1992</td>\n",
" <td>SouthEast</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79658</th>\n",
" <td>3950</td>\n",
" <td>3950</td>\n",
" <td>3950.0</td>\n",
" <td>36</td>\n",
" <td>10.99</td>\n",
" <td>129.30</td>\n",
" <td>B</td>\n",
" <td>B4</td>\n",
" <td>4</td>\n",
" <td>RENT</td>\n",
" <td>...</td>\n",
" <td>Individual</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Cash</td>\n",
" <td>2015</td>\n",
" <td>2010</td>\n",
" <td>MidWest</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>397229</th>\n",
" <td>26500</td>\n",
" <td>26500</td>\n",
" <td>26500.0</td>\n",
" <td>60</td>\n",
" <td>8.67</td>\n",
" <td>545.87</td>\n",
" <td>B</td>\n",
" <td>B1</td>\n",
" <td>10</td>\n",
" <td>MORTGAGE</td>\n",
" <td>...</td>\n",
" <td>Individual</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Cash</td>\n",
" <td>2015</td>\n",
" <td>1994</td>\n",
" <td>SouthEast</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110958</th>\n",
" <td>18900</td>\n",
" <td>18900</td>\n",
" <td>18900.0</td>\n",
" <td>36</td>\n",
" <td>5.32</td>\n",
" <td>569.17</td>\n",
" <td>A</td>\n",
" <td>A1</td>\n",
" <td>0</td>\n",
" <td>RENT</td>\n",
" <td>...</td>\n",
" <td>Individual</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Cash</td>\n",
" <td>2015</td>\n",
" <td>1999</td>\n",
" <td>SouthEast</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91253</th>\n",
" <td>18000</td>\n",
" <td>18000</td>\n",
" <td>18000.0</td>\n",
" <td>60</td>\n",
" <td>11.53</td>\n",
" <td>396.14</td>\n",
" <td>B</td>\n",
" <td>B5</td>\n",
" <td>0</td>\n",
" <td>RENT</td>\n",
" <td>...</td>\n",
" <td>Individual</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Cash</td>\n",
" <td>2015</td>\n",
" <td>1987</td>\n",
" <td>NorthEast</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61757</th>\n",
" <td>14450</td>\n",
" <td>14450</td>\n",
" <td>14375.0</td>\n",
" <td>60</td>\n",
" <td>23.99</td>\n",
" <td>415.62</td>\n",
" <td>F</td>\n",
" <td>F3</td>\n",
" <td>1</td>\n",
" <td>MORTGAGE</td>\n",
" <td>...</td>\n",
" <td>Individual</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Cash</td>\n",
" <td>2015</td>\n",
" <td>2006</td>\n",
" <td>MidWest</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113397</th>\n",
" <td>12600</td>\n",
" <td>12600</td>\n",
" <td>12600.0</td>\n",
" <td>36</td>\n",
" <td>9.99</td>\n",
" <td>406.51</td>\n",
" <td>B</td>\n",
" <td>B3</td>\n",
" <td>10</td>\n",
" <td>MORTGAGE</td>\n",
" <td>...</td>\n",
" <td>Individual</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>N</td>\n",
" <td>Cash</td>\n",
" <td>2015</td>\n",
" <td>1993</td>\n",
" <td>NorthEast</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>336876 rows × 43 columns</p>\n",
"</div>"
],
"text/plain": [
" loan_amount funded_amount investor_funds term interest_rate \\\n",
"371269 28000 28000 28000.0 60 16.55 \n",
"413904 15000 15000 15000.0 36 6.03 \n",
"390183 8000 8000 8000.0 36 17.86 \n",
"171520 4750 4750 4750.0 36 16.99 \n",
"79658 3950 3950 3950.0 36 10.99 \n",
"... ... ... ... ... ... \n",
"397229 26500 26500 26500.0 60 8.67 \n",
"110958 18900 18900 18900.0 36 5.32 \n",
"91253 18000 18000 18000.0 60 11.53 \n",
"61757 14450 14450 14375.0 60 23.99 \n",
"113397 12600 12600 12600.0 36 9.99 \n",
"\n",
" installment grade sub_grade emp_length home_ownership ... \\\n",
"371269 689.12 D D2 0 OWN ... \n",
"413904 456.54 A A1 1 MORTGAGE ... \n",
"390183 288.66 D D5 0 RENT ... \n",
"171520 169.33 D D3 2 OWN ... \n",
"79658 129.30 B B4 4 RENT ... \n",
"... ... ... ... ... ... ... \n",
"397229 545.87 B B1 10 MORTGAGE ... \n",
"110958 569.17 A A1 0 RENT ... \n",
"91253 396.14 B B5 0 RENT ... \n",
"61757 415.62 F F3 1 MORTGAGE ... \n",
"113397 406.51 B B3 10 MORTGAGE ... \n",
"\n",
" application_type acc_now_delinq chargeoff_within_12_mths delinq_amnt \\\n",
"371269 Individual 0 0 0 \n",
"413904 Individual 0 0 0 \n",
"390183 Individual 0 0 0 \n",
"171520 Individual 0 0 0 \n",
"79658 Individual 0 0 0 \n",
"... ... ... ... ... \n",
"397229 Individual 0 0 0 \n",
"110958 Individual 0 0 0 \n",
"91253 Individual 0 0 0 \n",
"61757 Individual 0 0 0 \n",
"113397 Individual 0 0 0 \n",
"\n",
" tax_liens hardship_flag disbursement_method year earliest_cr_year \\\n",
"371269 0 N Cash 2015 1997 \n",
"413904 0 N Cash 2015 2003 \n",
"390183 0 N Cash 2015 2000 \n",
"171520 0 N Cash 2015 1992 \n",
"79658 0 N Cash 2015 2010 \n",
"... ... ... ... ... ... \n",
"397229 0 N Cash 2015 1994 \n",
"110958 0 N Cash 2015 1999 \n",
"91253 0 N Cash 2015 1987 \n",
"61757 0 N Cash 2015 2006 \n",
"113397 0 N Cash 2015 1993 \n",
"\n",
" region \n",
"371269 West \n",
"413904 NorthEast \n",
"390183 West \n",
"171520 SouthEast \n",
"79658 MidWest \n",
"... ... \n",
"397229 SouthEast \n",
"110958 SouthEast \n",
"91253 NorthEast \n",
"61757 MidWest \n",
"113397 NorthEast \n",
"\n",
"[336876 rows x 43 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.utils import shuffle\n",
"from sklearn.model_selection import StratifiedShuffleSplit\n",
"\n",
"input_file_path = \"/Users/abigailt/Desktop/Projects/mlPrivacy Projects/Minimization-Patent-DT/income/data/loan.csv\"\n",
"dataset = pd.read_csv(input_file_path, low_memory=False)\n",
"dataset = shuffle(dataset, random_state=14)\n",
"\n",
"dataset = modify_specific_features(dataset)\n",
"dataset = fill_missing(dataset)\n",
"dataset = modify_label(dataset)\n",
"dataset = remove_unwanted_features(dataset)\n",
" \n",
"train, test, generalizer_train, _ = split_data(dataset, False)\n",
"\n",
"x_train = train.drop('label', axis=1)\n",
"y_train = train['label']\n",
"x_test = test.drop('label', axis=1)\n",
"y_test = test['label']\n",
"\n",
"x_train"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train decision tree model"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Base model accuracy: 0.9442762322041345\n"
]
}
],
"source": [
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"features = x_train.columns\n",
"categorical_features = ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose',\n",
" 'initial_list_status', 'application_type', 'hardship_flag', 'disbursement_method', 'region']\n",
"# QI parameter determines which features will be minimized.\n",
"QI = [\"annual_income\", \"zip_code\", \"dti\", \"last_pymnt_amnt\", \"total_rec_int\"]\n",
"\n",
"numeric_features = [f for f in features if f not in categorical_features]\n",
"numeric_transformer = Pipeline(\n",
" steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n",
")\n",
"categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" (\"num\", numeric_transformer, numeric_features),\n",
" (\"cat\", categorical_transformer, categorical_features),\n",
" ]\n",
")\n",
"encoded_train = preprocessor.fit_transform(x_train)\n",
"model = DecisionTreeClassifier()\n",
"model.fit(encoded_train, y_train)\n",
"\n",
"encoded_test = preprocessor.transform(x_test)\n",
"print('Base model accuracy: ', model.score(encoded_test, y_test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run minimization\n",
"We will try to run minimization with categorical features and only a subset of the features with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250\n",
"Improving accuracy\n",
"feature to remove: zip_code\n",
"Removed feature: zip_code, new relative accuracy: 0.861250\n",
"feature to remove: total_rec_int\n",
"Removed feature: total_rec_int, new relative accuracy: 0.912500\n",
"feature to remove: last_pymnt_amnt\n",
"Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500\n",
"feature to remove: dti\n",
"Removed feature: dti, new relative accuracy: 0.995000\n",
"feature to remove: annual_income\n",
"Removed feature: annual_income, new relative accuracy: 1.000000\n",
"Accuracy on minimized data: 0.9425\n"
]
}
],
"source": [
"from apt.minimization import GeneralizeToRepresentative\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# default target_accuracy is 0.998\n",
"minimizer = GeneralizeToRepresentative(model, \n",
" categorical_features=categorical_features, \n",
" features_to_minimize=QI,\n",
" encoder=preprocessor)\n",
"\n",
"# Fitting the minimizar can be done either on training or test data. Doing it with test data is better as the\n",
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
"# data it could result in a larger gap)\n",
"# Don't forget to leave a hold-out set for final validation!\n",
"generalizer_train_small = generalizer_train[:2000]\n",
"x_test_small = x_test[:2000]\n",
"y_test_small = y_test[:2000]\n",
"X_generalizer_train = generalizer_train_small.drop('label', axis=1)\n",
"features_names = features.tolist()\n",
"\n",
"encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n",
"x_train_predictions = model.predict(encoded_generalizer_train)\n",
"minimizer.fit(X_generalizer_train, x_train_predictions, features_names=features_names)\n",
"transformed = minimizer.transform(x_test_small, features_names=features_names)\n",
"\n",
"encoded_transformed = preprocessor.transform(transformed)\n",
"print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test_small))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Let's see what features were generalized"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'ranges': {}, 'categories': {}, 'untouched': ['region', 'interest_rate', 'total_rec_late_fee', 'acc_now_delinq', 'home_ownership', 'dti', 'earliest_cr_year', 'last_pymnt_amnt', 'revol_bal', 'pymnt_plan', 'pub_rec', 'out_prncp', 'loan_amount', 'delinq_2yrs', 'mths_since_last_delinq', 'verification_status', 'investor_funds', 'purpose', 'mths_since_last_record', 'tax_liens', 'grade', 'annual_income', 'sub_grade', 'inq_last_6mths', 'total_acc', 'initial_list_status', 'zip_code', 'total_rec_int', 'application_type', 'emp_length', 'hardship_flag', 'revol_util', 'policy_code', 'term', 'installment', 'open_acc', 'out_prncp_inv', 'chargeoff_within_12_mths', 'delinq_amnt', 'funded_amount', 'collections_12_mths_ex_med', 'disbursement_method', 'year']}\n"
]
}
],
"source": [
"generalizations = minimizer.generalizations\n",
"print(generalizations)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
"\n",
"Let's change to a slightly lower target accuracy."
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250\n",
"Improving accuracy\n",
"feature to remove: zip_code\n",
"Removed feature: zip_code, new relative accuracy: 0.861250\n",
"feature to remove: total_rec_int\n",
"Removed feature: total_rec_int, new relative accuracy: 0.912500\n",
"feature to remove: last_pymnt_amnt\n",
"Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500\n",
"Accuracy on minimized data: 0.933\n",
"{'ranges': {'annual_income': [10500.0, 18000.0, 28000.0, 35250.0, 36500.0, 37500.0, 40500.0, 43750.0, 48600.0, 49650.0, 50800.0, 51000.0, 53000.0, 54500.0, 55280.0, 56500.0, 56712.5, 61000.0, 66994.5, 69494.5, 70500.0, 75000.0, 82500.0, 84500.0, 90000.0, 91000.0, 95000.0, 127500.0, 135000.0, 141500.0, 179500.0, 297679.5], 'dti': [4.054999828338623, 8.869999885559082, 12.130000114440918, 14.735000133514404, 15.625, 15.84000015258789, 15.984999656677246, 17.34500026702881, 17.664999961853027, 18.954999923706055, 19.020000457763672, 19.3700008392334, 19.545000076293945, 20.65999984741211, 20.78499984741211, 21.260000228881836, 22.40000057220459, 22.984999656677246, 23.179999351501465, 23.1850004196167, 26.139999389648438, 26.460000038146973, 28.050000190734863, 28.375, 28.894999504089355, 30.414999961853027, 33.85000038146973, 34.46500015258789, 36.720001220703125]}, 'categories': {}, 'untouched': ['region', 'interest_rate', 'total_rec_late_fee', 'acc_now_delinq', 'home_ownership', 'earliest_cr_year', 'last_pymnt_amnt', 'revol_bal', 'pymnt_plan', 'pub_rec', 'out_prncp', 'loan_amount', 'delinq_2yrs', 'mths_since_last_delinq', 'verification_status', 'investor_funds', 'purpose', 'mths_since_last_record', 'tax_liens', 'grade', 'sub_grade', 'inq_last_6mths', 'total_acc', 'initial_list_status', 'zip_code', 'total_rec_int', 'application_type', 'emp_length', 'hardship_flag', 'revol_util', 'policy_code', 'term', 'installment', 'open_acc', 'out_prncp_inv', 'chargeoff_within_12_mths', 'delinq_amnt', 'funded_amount', 'collections_12_mths_ex_med', 'disbursement_method', 'year']}\n"
]
}
],
"source": [
"# We allow a 2% deviation in accuracy from the original model accuracy\n",
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.98, \n",
" categorical_features=categorical_features, \n",
" features_to_minimize=QI,\n",
" encoder=preprocessor)\n",
"\n",
"minimizer2.fit(X_generalizer_train, x_train_predictions, features_names=features_names)\n",
"transformed2 = minimizer2.transform(x_test_small, features_names=features_names)\n",
"\n",
"encoded_transformed2 = preprocessor.transform(transformed2)\n",
"print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test_small))\n",
"generalizations2 = minimizer2.generalizations\n",
"print(generalizations2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two features are generalized: annual income and debt to income ratio"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 1
}