mirror of
https://github.com/IBM/ai-privacy-toolkit.git
synced 2026-04-25 04:46:21 +02:00
loan demo for paper
Signed-off-by: abigailt <abigailt@il.ibm.com>
This commit is contained in:
parent
44d012857f
commit
0672cd04b7
1 changed files with 829 additions and 0 deletions
829
notebooks/minimization_loan.ipynb
Normal file
829
notebooks/minimization_loan.ipynb
Normal file
|
|
@ -0,0 +1,829 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Applying data minimization to loans data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n",
|
||||
"\n",
|
||||
"This will be demonstrated using the Loans dataset."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load and preprocess data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"sys.path.insert(0, os.path.abspath('..'))\n",
|
||||
"\n",
|
||||
"def modify_specific_features(data):\n",
|
||||
" data = data.rename(\n",
|
||||
" columns={\"loan_amnt\": \"loan_amount\", \"funded_amnt\": \"funded_amount\", \"funded_amnt_inv\": \"investor_funds\",\n",
|
||||
" \"int_rate\": \"interest_rate\", \"annual_inc\": \"annual_income\"})\n",
|
||||
"\n",
|
||||
" date_format = '%b-%Y'\n",
|
||||
" dt_series = pd.to_datetime(data['issue_d'], format=date_format)\n",
|
||||
" data['year'] = dt_series.dt.year\n",
|
||||
"\n",
|
||||
" dt_series = pd.to_datetime(data['earliest_cr_line'], format=date_format)\n",
|
||||
" data['earliest_cr_year'] = dt_series.dt.year\n",
|
||||
"\n",
|
||||
" dt_series = pd.to_datetime(data['last_credit_pull_d'], format=date_format)\n",
|
||||
" data['last_credit_pull_year'] = dt_series.dt.year\n",
|
||||
"\n",
|
||||
" # TODO: Maybe year is not enough, we may want time since last payment? Or some other diff (time between last payment and X)\n",
|
||||
" dt_series = pd.to_datetime(data['last_pymnt_d'], format=date_format)\n",
|
||||
" data['last_pymnt_year'] = dt_series.dt.year\n",
|
||||
"\n",
|
||||
" data['interest_rate'] = data['interest_rate'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n",
|
||||
"\n",
|
||||
" data['revol_util'] = data['revol_util'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n",
|
||||
"\n",
|
||||
" data['term'] = data['term'].apply(lambda x: x if type(x) == int else int(x[:-7]))\n",
|
||||
"\n",
|
||||
" data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x + 's' if x.endswith('year') else x)\n",
|
||||
" data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x[:-6])\n",
|
||||
" data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 0.5 if '<' in x else x)\n",
|
||||
" data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 10 if '+' in x else x)\n",
|
||||
" data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: int(x) if x.isnumeric() else 0)\n",
|
||||
"\n",
|
||||
" data['zip_code'] = data['zip_code'].apply(lambda x: x if type(x) == int else int(x[:-2]))\n",
|
||||
"\n",
|
||||
" west = ['CA', 'OR', 'UT', 'WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']\n",
|
||||
" south_west = ['AZ', 'TX', 'NM', 'OK']\n",
|
||||
" south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN']\n",
|
||||
" mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']\n",
|
||||
" north_east = ['CT', 'NY', 'PA', 'NJ', 'RI', 'MA', 'MD', 'VT', 'NH', 'ME']\n",
|
||||
"\n",
|
||||
" data['region'] = np.nan\n",
|
||||
"\n",
|
||||
" def finding_regions(state):\n",
|
||||
" if state in west:\n",
|
||||
" return 'West'\n",
|
||||
" elif state in south_west:\n",
|
||||
" return 'SouthWest'\n",
|
||||
" elif state in south_east:\n",
|
||||
" return 'SouthEast'\n",
|
||||
" elif state in mid_west:\n",
|
||||
" return 'MidWest'\n",
|
||||
" elif state in north_east:\n",
|
||||
" return 'NorthEast'\n",
|
||||
"\n",
|
||||
" data['region'] = data['addr_state'].apply(finding_regions)\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"def fill_missing(data):\n",
|
||||
" for col in ('dti_joint', 'annual_inc_joint', 'il_util', 'mths_since_rcnt_il', 'open_acc_6m', 'open_il_12m',\n",
|
||||
" 'open_il_24m', 'inq_last_12m', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'inq_fi',\n",
|
||||
" 'total_cu_tl', 'loan_amount', 'funded_amount', 'investor_funds', 'term', 'interest_rate',\n",
|
||||
" 'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq', 'total_bal_il',\n",
|
||||
" 'tot_coll_amt', 'installment', 'emp_length', 'annual_income', 'zip_code', 'delinq_2yrs',\n",
|
||||
" 'tot_cur_bal', 'total_rev_hi_lim', 'revol_util', 'collections_12_mths_ex_med', 'open_acc',\n",
|
||||
" 'inq_last_6mths', 'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_rec_int',\n",
|
||||
" 'verification_status_joint', 'acc_now_delinq', 'settlement_amount', 'settlement_percentage',\n",
|
||||
" 'settlement_term', 'dti', 'total_rec_late_fee', 'policy_code', 'chargeoff_within_12_mths', 'total_rec_int',\n",
|
||||
" 'last_credit_pull_year', 'last_pymnt_year', 'delinq_amnt', 'tax_liens', 'year', 'earliest_cr_year'):\n",
|
||||
" try:\n",
|
||||
" data[col] = data[col].fillna(0)\n",
|
||||
" except KeyError:\n",
|
||||
" print('missing column ' + col)\n",
|
||||
"\n",
|
||||
" for col in ('settlement_status', 'emp_title', 'region'):\n",
|
||||
" try:\n",
|
||||
" data[col] = data[col].fillna('NA')\n",
|
||||
" except KeyError:\n",
|
||||
" print('missing column ' + col)\n",
|
||||
" \n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"def modify_label(data):\n",
|
||||
" bad_loan = [\"Charged Off\", \"Default\", \"Does not meet the credit policy. Status:Charged Off\", \"In Grace Period\",\n",
|
||||
" \"Late (16-30 days)\", \"Late (31-120 days)\"]\n",
|
||||
"\n",
|
||||
" data['label'] = np.nan\n",
|
||||
"\n",
|
||||
" def loan_condition(status):\n",
|
||||
" if status in bad_loan:\n",
|
||||
" return 0\n",
|
||||
" else:\n",
|
||||
" return 1\n",
|
||||
"\n",
|
||||
" data['label'] = data['loan_status'].apply(loan_condition)\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"def remove_unwanted_features(data):\n",
|
||||
" features_to_remove = ['loan_status', 'id', 'member_id', 'url', 'next_pymnt_d', 'mths_since_last_major_derog', 'annual_inc_joint',\n",
|
||||
" 'dti_joint', 'verification_status_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il',\n",
|
||||
" 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',\n",
|
||||
" 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',\n",
|
||||
" 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct',\n",
|
||||
" 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',\n",
|
||||
" 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',\n",
|
||||
" 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',\n",
|
||||
" 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',\n",
|
||||
" 'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tot_hi_cred_lim',\n",
|
||||
" 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'revol_bal_joint', 'sec_app_earliest_cr_line',\n",
|
||||
" 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il',\n",
|
||||
" 'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med',\n",
|
||||
" 'sec_app_mths_since_last_major_derog', 'hardship_type', 'hardship_reason', 'hardship_status',\n",
|
||||
" 'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date',\n",
|
||||
" 'hardship_length', 'hardship_dpd', 'hardship_loan_status', 'orig_projected_additional_accrued_interest',\n",
|
||||
" 'hardship_payoff_balance_amount', 'hardship_last_payment_amount', 'mths_since_recent_bc','desc', 'emp_title', 'title',\n",
|
||||
" 'issue_d', 'earliest_cr_line', 'last_credit_pull_d',\n",
|
||||
" 'debt_settlement_flag_date', 'settlement_date', 'last_pymnt_d', 'recoveries', 'collection_recovery_fee', 'total_rec_prncp',\n",
|
||||
" 'last_pymnt_year', 'last_credit_pull_year', 'total_pymnt', 'total_pymnt_inv', 'debt_settlement_flag', 'settlement_status',\n",
|
||||
" 'settlement_amount', 'settlement_percentage', 'settlement_term', 'addr_state']\n",
|
||||
" return data.drop(features_to_remove, axis=1)\n",
|
||||
"\n",
|
||||
"def split_data(data, create_validation):\n",
|
||||
" # divide into 3 datasets for training, validation and test\n",
|
||||
" stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n",
|
||||
" for train_set, test_set in stratified.split(data, data['label']):\n",
|
||||
" train = data.iloc[train_set]\n",
|
||||
" test = data.iloc[test_set] # 20% of data\n",
|
||||
"\n",
|
||||
" stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.9, random_state=42)\n",
|
||||
" for train_set, test_set in stratified.split(train, train['label']):\n",
|
||||
" model_train = data.iloc[train_set] # 40% of data\n",
|
||||
" generalizer_train = data.iloc[test_set] # 40% of data\n",
|
||||
"\n",
|
||||
" validation = None\n",
|
||||
" if create_validation:\n",
|
||||
" stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)\n",
|
||||
" for train_set, test_set in stratified.split(generalizer_train, generalizer_train['label']):\n",
|
||||
" generalizer_train = data.iloc[train_set] # 30% of data\n",
|
||||
" validation = data.iloc[test_set] # 10% of data\n",
|
||||
" return train, test, generalizer_train, validation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>loan_amount</th>\n",
|
||||
" <th>funded_amount</th>\n",
|
||||
" <th>investor_funds</th>\n",
|
||||
" <th>term</th>\n",
|
||||
" <th>interest_rate</th>\n",
|
||||
" <th>installment</th>\n",
|
||||
" <th>grade</th>\n",
|
||||
" <th>sub_grade</th>\n",
|
||||
" <th>emp_length</th>\n",
|
||||
" <th>home_ownership</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>application_type</th>\n",
|
||||
" <th>acc_now_delinq</th>\n",
|
||||
" <th>chargeoff_within_12_mths</th>\n",
|
||||
" <th>delinq_amnt</th>\n",
|
||||
" <th>tax_liens</th>\n",
|
||||
" <th>hardship_flag</th>\n",
|
||||
" <th>disbursement_method</th>\n",
|
||||
" <th>year</th>\n",
|
||||
" <th>earliest_cr_year</th>\n",
|
||||
" <th>region</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>371269</th>\n",
|
||||
" <td>28000</td>\n",
|
||||
" <td>28000</td>\n",
|
||||
" <td>28000.0</td>\n",
|
||||
" <td>60</td>\n",
|
||||
" <td>16.55</td>\n",
|
||||
" <td>689.12</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" <td>D2</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>OWN</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Individual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>Cash</td>\n",
|
||||
" <td>2015</td>\n",
|
||||
" <td>1997</td>\n",
|
||||
" <td>West</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>413904</th>\n",
|
||||
" <td>15000</td>\n",
|
||||
" <td>15000</td>\n",
|
||||
" <td>15000.0</td>\n",
|
||||
" <td>36</td>\n",
|
||||
" <td>6.03</td>\n",
|
||||
" <td>456.54</td>\n",
|
||||
" <td>A</td>\n",
|
||||
" <td>A1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>MORTGAGE</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Individual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>Cash</td>\n",
|
||||
" <td>2015</td>\n",
|
||||
" <td>2003</td>\n",
|
||||
" <td>NorthEast</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>390183</th>\n",
|
||||
" <td>8000</td>\n",
|
||||
" <td>8000</td>\n",
|
||||
" <td>8000.0</td>\n",
|
||||
" <td>36</td>\n",
|
||||
" <td>17.86</td>\n",
|
||||
" <td>288.66</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" <td>D5</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>RENT</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Individual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>Cash</td>\n",
|
||||
" <td>2015</td>\n",
|
||||
" <td>2000</td>\n",
|
||||
" <td>West</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>171520</th>\n",
|
||||
" <td>4750</td>\n",
|
||||
" <td>4750</td>\n",
|
||||
" <td>4750.0</td>\n",
|
||||
" <td>36</td>\n",
|
||||
" <td>16.99</td>\n",
|
||||
" <td>169.33</td>\n",
|
||||
" <td>D</td>\n",
|
||||
" <td>D3</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>OWN</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Individual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>Cash</td>\n",
|
||||
" <td>2015</td>\n",
|
||||
" <td>1992</td>\n",
|
||||
" <td>SouthEast</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>79658</th>\n",
|
||||
" <td>3950</td>\n",
|
||||
" <td>3950</td>\n",
|
||||
" <td>3950.0</td>\n",
|
||||
" <td>36</td>\n",
|
||||
" <td>10.99</td>\n",
|
||||
" <td>129.30</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" <td>B4</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>RENT</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Individual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>Cash</td>\n",
|
||||
" <td>2015</td>\n",
|
||||
" <td>2010</td>\n",
|
||||
" <td>MidWest</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>397229</th>\n",
|
||||
" <td>26500</td>\n",
|
||||
" <td>26500</td>\n",
|
||||
" <td>26500.0</td>\n",
|
||||
" <td>60</td>\n",
|
||||
" <td>8.67</td>\n",
|
||||
" <td>545.87</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" <td>B1</td>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td>MORTGAGE</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Individual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>Cash</td>\n",
|
||||
" <td>2015</td>\n",
|
||||
" <td>1994</td>\n",
|
||||
" <td>SouthEast</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>110958</th>\n",
|
||||
" <td>18900</td>\n",
|
||||
" <td>18900</td>\n",
|
||||
" <td>18900.0</td>\n",
|
||||
" <td>36</td>\n",
|
||||
" <td>5.32</td>\n",
|
||||
" <td>569.17</td>\n",
|
||||
" <td>A</td>\n",
|
||||
" <td>A1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>RENT</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Individual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>Cash</td>\n",
|
||||
" <td>2015</td>\n",
|
||||
" <td>1999</td>\n",
|
||||
" <td>SouthEast</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>91253</th>\n",
|
||||
" <td>18000</td>\n",
|
||||
" <td>18000</td>\n",
|
||||
" <td>18000.0</td>\n",
|
||||
" <td>60</td>\n",
|
||||
" <td>11.53</td>\n",
|
||||
" <td>396.14</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" <td>B5</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>RENT</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Individual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>Cash</td>\n",
|
||||
" <td>2015</td>\n",
|
||||
" <td>1987</td>\n",
|
||||
" <td>NorthEast</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>61757</th>\n",
|
||||
" <td>14450</td>\n",
|
||||
" <td>14450</td>\n",
|
||||
" <td>14375.0</td>\n",
|
||||
" <td>60</td>\n",
|
||||
" <td>23.99</td>\n",
|
||||
" <td>415.62</td>\n",
|
||||
" <td>F</td>\n",
|
||||
" <td>F3</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>MORTGAGE</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Individual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>Cash</td>\n",
|
||||
" <td>2015</td>\n",
|
||||
" <td>2006</td>\n",
|
||||
" <td>MidWest</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>113397</th>\n",
|
||||
" <td>12600</td>\n",
|
||||
" <td>12600</td>\n",
|
||||
" <td>12600.0</td>\n",
|
||||
" <td>36</td>\n",
|
||||
" <td>9.99</td>\n",
|
||||
" <td>406.51</td>\n",
|
||||
" <td>B</td>\n",
|
||||
" <td>B3</td>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td>MORTGAGE</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>Individual</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>N</td>\n",
|
||||
" <td>Cash</td>\n",
|
||||
" <td>2015</td>\n",
|
||||
" <td>1993</td>\n",
|
||||
" <td>NorthEast</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>336876 rows × 43 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" loan_amount funded_amount investor_funds term interest_rate \\\n",
|
||||
"371269 28000 28000 28000.0 60 16.55 \n",
|
||||
"413904 15000 15000 15000.0 36 6.03 \n",
|
||||
"390183 8000 8000 8000.0 36 17.86 \n",
|
||||
"171520 4750 4750 4750.0 36 16.99 \n",
|
||||
"79658 3950 3950 3950.0 36 10.99 \n",
|
||||
"... ... ... ... ... ... \n",
|
||||
"397229 26500 26500 26500.0 60 8.67 \n",
|
||||
"110958 18900 18900 18900.0 36 5.32 \n",
|
||||
"91253 18000 18000 18000.0 60 11.53 \n",
|
||||
"61757 14450 14450 14375.0 60 23.99 \n",
|
||||
"113397 12600 12600 12600.0 36 9.99 \n",
|
||||
"\n",
|
||||
" installment grade sub_grade emp_length home_ownership ... \\\n",
|
||||
"371269 689.12 D D2 0 OWN ... \n",
|
||||
"413904 456.54 A A1 1 MORTGAGE ... \n",
|
||||
"390183 288.66 D D5 0 RENT ... \n",
|
||||
"171520 169.33 D D3 2 OWN ... \n",
|
||||
"79658 129.30 B B4 4 RENT ... \n",
|
||||
"... ... ... ... ... ... ... \n",
|
||||
"397229 545.87 B B1 10 MORTGAGE ... \n",
|
||||
"110958 569.17 A A1 0 RENT ... \n",
|
||||
"91253 396.14 B B5 0 RENT ... \n",
|
||||
"61757 415.62 F F3 1 MORTGAGE ... \n",
|
||||
"113397 406.51 B B3 10 MORTGAGE ... \n",
|
||||
"\n",
|
||||
" application_type acc_now_delinq chargeoff_within_12_mths delinq_amnt \\\n",
|
||||
"371269 Individual 0 0 0 \n",
|
||||
"413904 Individual 0 0 0 \n",
|
||||
"390183 Individual 0 0 0 \n",
|
||||
"171520 Individual 0 0 0 \n",
|
||||
"79658 Individual 0 0 0 \n",
|
||||
"... ... ... ... ... \n",
|
||||
"397229 Individual 0 0 0 \n",
|
||||
"110958 Individual 0 0 0 \n",
|
||||
"91253 Individual 0 0 0 \n",
|
||||
"61757 Individual 0 0 0 \n",
|
||||
"113397 Individual 0 0 0 \n",
|
||||
"\n",
|
||||
" tax_liens hardship_flag disbursement_method year earliest_cr_year \\\n",
|
||||
"371269 0 N Cash 2015 1997 \n",
|
||||
"413904 0 N Cash 2015 2003 \n",
|
||||
"390183 0 N Cash 2015 2000 \n",
|
||||
"171520 0 N Cash 2015 1992 \n",
|
||||
"79658 0 N Cash 2015 2010 \n",
|
||||
"... ... ... ... ... ... \n",
|
||||
"397229 0 N Cash 2015 1994 \n",
|
||||
"110958 0 N Cash 2015 1999 \n",
|
||||
"91253 0 N Cash 2015 1987 \n",
|
||||
"61757 0 N Cash 2015 2006 \n",
|
||||
"113397 0 N Cash 2015 1993 \n",
|
||||
"\n",
|
||||
" region \n",
|
||||
"371269 West \n",
|
||||
"413904 NorthEast \n",
|
||||
"390183 West \n",
|
||||
"171520 SouthEast \n",
|
||||
"79658 MidWest \n",
|
||||
"... ... \n",
|
||||
"397229 SouthEast \n",
|
||||
"110958 SouthEast \n",
|
||||
"91253 NorthEast \n",
|
||||
"61757 MidWest \n",
|
||||
"113397 NorthEast \n",
|
||||
"\n",
|
||||
"[336876 rows x 43 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.utils import shuffle\n",
|
||||
"from sklearn.model_selection import StratifiedShuffleSplit\n",
|
||||
"\n",
|
||||
"input_file_path = \"/Users/abigailt/Desktop/Projects/mlPrivacy Projects/Minimization-Patent-DT/income/data/loan.csv\"\n",
|
||||
"dataset = pd.read_csv(input_file_path, low_memory=False)\n",
|
||||
"dataset = shuffle(dataset, random_state=14)\n",
|
||||
"\n",
|
||||
"dataset = modify_specific_features(dataset)\n",
|
||||
"dataset = fill_missing(dataset)\n",
|
||||
"dataset = modify_label(dataset)\n",
|
||||
"dataset = remove_unwanted_features(dataset)\n",
|
||||
" \n",
|
||||
"train, test, generalizer_train, _ = split_data(dataset, False)\n",
|
||||
"\n",
|
||||
"x_train = train.drop('label', axis=1)\n",
|
||||
"y_train = train['label']\n",
|
||||
"x_test = test.drop('label', axis=1)\n",
|
||||
"y_test = test['label']\n",
|
||||
"\n",
|
||||
"x_train"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Train decision tree model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Base model accuracy: 0.9442762322041345\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.compose import ColumnTransformer\n",
|
||||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||
"from sklearn.impute import SimpleImputer\n",
|
||||
"from sklearn.pipeline import Pipeline\n",
|
||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||
"\n",
|
||||
"features = x_train.columns\n",
|
||||
"categorical_features = ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose',\n",
|
||||
" 'initial_list_status', 'application_type', 'hardship_flag', 'disbursement_method', 'region']\n",
|
||||
"# QI parameter determines which features will be minimized.\n",
|
||||
"QI = [\"annual_income\", \"zip_code\", \"dti\", \"last_pymnt_amnt\", \"total_rec_int\"]\n",
|
||||
"\n",
|
||||
"numeric_features = [f for f in features if f not in categorical_features]\n",
|
||||
"numeric_transformer = Pipeline(\n",
|
||||
" steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n",
|
||||
")\n",
|
||||
"categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n",
|
||||
"preprocessor = ColumnTransformer(\n",
|
||||
" transformers=[\n",
|
||||
" (\"num\", numeric_transformer, numeric_features),\n",
|
||||
" (\"cat\", categorical_transformer, categorical_features),\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"encoded_train = preprocessor.fit_transform(x_train)\n",
|
||||
"model = DecisionTreeClassifier()\n",
|
||||
"model.fit(encoded_train, y_train)\n",
|
||||
"\n",
|
||||
"encoded_test = preprocessor.transform(x_test)\n",
|
||||
"print('Base model accuracy: ', model.score(encoded_test, y_test))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Run minimization\n",
|
||||
"We will run minimization on data that includes categorical features, minimizing only a subset of the features (those listed in `QI`), and try different values of target accuracy (how close to the original model's accuracy we want to stay, with 1 meaning the same accuracy as on the original data)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250\n",
|
||||
"Improving accuracy\n",
|
||||
"feature to remove: zip_code\n",
|
||||
"Removed feature: zip_code, new relative accuracy: 0.861250\n",
|
||||
"feature to remove: total_rec_int\n",
|
||||
"Removed feature: total_rec_int, new relative accuracy: 0.912500\n",
|
||||
"feature to remove: last_pymnt_amnt\n",
|
||||
"Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500\n",
|
||||
"feature to remove: dti\n",
|
||||
"Removed feature: dti, new relative accuracy: 0.995000\n",
|
||||
"feature to remove: annual_income\n",
|
||||
"Removed feature: annual_income, new relative accuracy: 1.000000\n",
|
||||
"Accuracy on minimized data: 0.9425\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from apt.minimization import GeneralizeToRepresentative\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"# default target_accuracy is 0.998\n",
|
||||
"minimizer = GeneralizeToRepresentative(model, \n",
|
||||
" categorical_features=categorical_features, \n",
|
||||
" features_to_minimize=QI,\n",
|
||||
" encoder=preprocessor)\n",
|
||||
"\n",
|
||||
"# Fitting the minimizer can be done either on training or test data. Doing it with test data is better as the\n",
|
||||
"# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n",
|
||||
"# data it could result in a larger gap)\n",
|
||||
"# Don't forget to leave a hold-out set for final validation!\n",
|
||||
"generalizer_train_small = generalizer_train[:2000]\n",
|
||||
"x_test_small = x_test[:2000]\n",
|
||||
"y_test_small = y_test[:2000]\n",
|
||||
"X_generalizer_train = generalizer_train_small.drop('label', axis=1)\n",
|
||||
"features_names = features.tolist()\n",
|
||||
"\n",
|
||||
"encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n",
|
||||
"x_train_predictions = model.predict(encoded_generalizer_train)\n",
|
||||
"minimizer.fit(X_generalizer_train, x_train_predictions, features_names=features_names)\n",
|
||||
"transformed = minimizer.transform(x_test_small, features_names=features_names)\n",
|
||||
"\n",
|
||||
"encoded_transformed = preprocessor.transform(transformed)\n",
|
||||
"print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test_small))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Let's see what features were generalized"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'ranges': {}, 'categories': {}, 'untouched': ['region', 'interest_rate', 'total_rec_late_fee', 'acc_now_delinq', 'home_ownership', 'dti', 'earliest_cr_year', 'last_pymnt_amnt', 'revol_bal', 'pymnt_plan', 'pub_rec', 'out_prncp', 'loan_amount', 'delinq_2yrs', 'mths_since_last_delinq', 'verification_status', 'investor_funds', 'purpose', 'mths_since_last_record', 'tax_liens', 'grade', 'annual_income', 'sub_grade', 'inq_last_6mths', 'total_acc', 'initial_list_status', 'zip_code', 'total_rec_int', 'application_type', 'emp_length', 'hardship_flag', 'revol_util', 'policy_code', 'term', 'installment', 'open_acc', 'out_prncp_inv', 'chargeoff_within_12_mths', 'delinq_amnt', 'funded_amount', 'collections_12_mths_ex_med', 'disbursement_method', 'year']}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"generalizations = minimizer.generalizations\n",
|
||||
"print(generalizations)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n",
|
||||
"\n",
|
||||
"Let's change to a slightly lower target accuracy."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250\n",
|
||||
"Improving accuracy\n",
|
||||
"feature to remove: zip_code\n",
|
||||
"Removed feature: zip_code, new relative accuracy: 0.861250\n",
|
||||
"feature to remove: total_rec_int\n",
|
||||
"Removed feature: total_rec_int, new relative accuracy: 0.912500\n",
|
||||
"feature to remove: last_pymnt_amnt\n",
|
||||
"Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500\n",
|
||||
"Accuracy on minimized data: 0.933\n",
|
||||
"{'ranges': {'annual_income': [10500.0, 18000.0, 28000.0, 35250.0, 36500.0, 37500.0, 40500.0, 43750.0, 48600.0, 49650.0, 50800.0, 51000.0, 53000.0, 54500.0, 55280.0, 56500.0, 56712.5, 61000.0, 66994.5, 69494.5, 70500.0, 75000.0, 82500.0, 84500.0, 90000.0, 91000.0, 95000.0, 127500.0, 135000.0, 141500.0, 179500.0, 297679.5], 'dti': [4.054999828338623, 8.869999885559082, 12.130000114440918, 14.735000133514404, 15.625, 15.84000015258789, 15.984999656677246, 17.34500026702881, 17.664999961853027, 18.954999923706055, 19.020000457763672, 19.3700008392334, 19.545000076293945, 20.65999984741211, 20.78499984741211, 21.260000228881836, 22.40000057220459, 22.984999656677246, 23.179999351501465, 23.1850004196167, 26.139999389648438, 26.460000038146973, 28.050000190734863, 28.375, 28.894999504089355, 30.414999961853027, 33.85000038146973, 34.46500015258789, 36.720001220703125]}, 'categories': {}, 'untouched': ['region', 'interest_rate', 'total_rec_late_fee', 'acc_now_delinq', 'home_ownership', 'earliest_cr_year', 'last_pymnt_amnt', 'revol_bal', 'pymnt_plan', 'pub_rec', 'out_prncp', 'loan_amount', 'delinq_2yrs', 'mths_since_last_delinq', 'verification_status', 'investor_funds', 'purpose', 'mths_since_last_record', 'tax_liens', 'grade', 'sub_grade', 'inq_last_6mths', 'total_acc', 'initial_list_status', 'zip_code', 'total_rec_int', 'application_type', 'emp_length', 'hardship_flag', 'revol_util', 'policy_code', 'term', 'installment', 'open_acc', 'out_prncp_inv', 'chargeoff_within_12_mths', 'delinq_amnt', 'funded_amount', 'collections_12_mths_ex_med', 'disbursement_method', 'year']}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# We allow a 2% deviation in accuracy from the original model accuracy\n",
|
||||
"minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.98, \n",
|
||||
" categorical_features=categorical_features, \n",
|
||||
" features_to_minimize=QI,\n",
|
||||
" encoder=preprocessor)\n",
|
||||
"\n",
|
||||
"minimizer2.fit(X_generalizer_train, x_train_predictions, features_names=features_names)\n",
|
||||
"transformed2 = minimizer2.transform(x_test_small, features_names=features_names)\n",
|
||||
"\n",
|
||||
"encoded_transformed2 = preprocessor.transform(transformed2)\n",
|
||||
"print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test_small))\n",
|
||||
"generalizations2 = minimizer2.generalizations\n",
|
||||
"print(generalizations2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Two features are generalized: annual income (`annual_income`) and debt-to-income ratio (`dti`)."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue