diff --git a/notebooks/minimization_loan.ipynb b/notebooks/minimization_loan.ipynb new file mode 100644 index 0000000..4f11475 --- /dev/null +++ b/notebooks/minimization_loan.ipynb @@ -0,0 +1,829 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Applying data minimization to loans data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n", + "\n", + "This will be demonstrated using the Loans dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load and preprocess data" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "sys.path.insert(0, os.path.abspath('..'))\n", + "\n", + "def modify_specific_features(data):\n", + " data = data.rename(\n", + " columns={\"loan_amnt\": \"loan_amount\", \"funded_amnt\": \"funded_amount\", \"funded_amnt_inv\": \"investor_funds\",\n", + " \"int_rate\": \"interest_rate\", \"annual_inc\": \"annual_income\"})\n", + "\n", + " date_format = '%b-%Y'\n", + " dt_series = pd.to_datetime(data['issue_d'], format=date_format)\n", + " data['year'] = dt_series.dt.year\n", + "\n", + " dt_series = pd.to_datetime(data['earliest_cr_line'], format=date_format)\n", + " data['earliest_cr_year'] = dt_series.dt.year\n", + "\n", + " dt_series = pd.to_datetime(data['last_credit_pull_d'], format=date_format)\n", + " data['last_credit_pull_year'] = dt_series.dt.year\n", + "\n", + " # TODO: Maybe year is not enough, we may want time since last payment? 
Or some other diff (time between last payment and X)\n", + " dt_series = pd.to_datetime(data['last_pymnt_d'], format=date_format)\n", + " data['last_pymnt_year'] = dt_series.dt.year\n", + "\n", + " data['interest_rate'] = data['interest_rate'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n", + "\n", + " data['revol_util'] = data['revol_util'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n", + "\n", + " data['term'] = data['term'].apply(lambda x: x if type(x) == int else int(x[:-7]))\n", + "\n", + " data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x + 's' if x.endswith('year') else x)\n", + " data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x[:-6])\n", + " data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 0.5 if '<' in x else x)\n", + " data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 10 if '+' in x else x)\n", + " data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: int(x) if x.isnumeric() else 0)\n", + "\n", + " data['zip_code'] = data['zip_code'].apply(lambda x: x if type(x) == int else int(x[:-2]))\n", + "\n", + " west = ['CA', 'OR', 'UT', 'WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']\n", + " south_west = ['AZ', 'TX', 'NM', 'OK']\n", + " south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN']\n", + " mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']\n", + " north_east = ['CT', 'NY', 'PA', 'NJ', 'RI', 'MA', 'MD', 'VT', 'NH', 'ME']\n", + "\n", + " data['region'] = np.nan\n", + "\n", + " def finding_regions(state):\n", + " if state in west:\n", + " return 'West'\n", + " elif state in south_west:\n", + " return 'SouthWest'\n", + " elif state in south_east:\n", + " return 'SouthEast'\n", + " elif state in mid_west:\n", + " return 'MidWest'\n", + " elif state in north_east:\n", + " return 'NorthEast'\n", + "\n", + " data['region'] = data['addr_state'].apply(finding_regions)\n", 
+ " return data\n", + "\n", + "def fill_missing(data):\n", + " for col in ('dti_joint', 'annual_inc_joint', 'il_util', 'mths_since_rcnt_il', 'open_acc_6m', 'open_il_12m',\n", + " 'open_il_24m', 'inq_last_12m', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'inq_fi',\n", + " 'total_cu_tl', 'loan_amount', 'funded_amount', 'investor_funds', 'term', 'interest_rate',\n", + " 'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq', 'total_bal_il',\n", + " 'tot_coll_amt', 'installment', 'emp_length', 'annual_income', 'zip_code', 'delinq_2yrs',\n", + " 'tot_cur_bal', 'total_rev_hi_lim', 'revol_util', 'collections_12_mths_ex_med', 'open_acc',\n", + " 'inq_last_6mths', 'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_rec_int',\n", + " 'verification_status_joint', 'acc_now_delinq', 'settlement_amount', 'settlement_percentage',\n", + " 'settlement_term', 'dti', 'total_rec_late_fee', 'policy_code', 'chargeoff_within_12_mths', 'total_rec_int',\n", + " 'last_credit_pull_year', 'last_pymnt_year', 'delinq_amnt', 'tax_liens', 'year', 'earliest_cr_year'):\n", + " try:\n", + " data[col] = data[col].fillna(0)\n", + " except KeyError:\n", + " print('missing column ' + col)\n", + "\n", + " for col in ('settlement_status', 'emp_title', 'region'):\n", + " try:\n", + " data[col] = data[col].fillna('NA')\n", + " except KeyError:\n", + " print('missing column ' + col)\n", + " \n", + " return data\n", + "\n", + "def modify_label(data):\n", + " bad_loan = [\"Charged Off\", \"Default\", \"Does not meet the credit policy. 
Status:Charged Off\", \"In Grace Period\",\n", + " \"Late (16-30 days)\", \"Late (31-120 days)\"]\n", + "\n", + " data['label'] = np.nan\n", + "\n", + " def loan_condition(status):\n", + " if status in bad_loan:\n", + " return 0\n", + " else:\n", + " return 1\n", + "\n", + " data['label'] = data['loan_status'].apply(loan_condition)\n", + " return data\n", + "\n", + "def remove_unwanted_features(data):\n", + " features_to_remove = ['loan_status', 'id', 'member_id', 'url', 'next_pymnt_d', 'mths_since_last_major_derog', 'annual_inc_joint',\n", + " 'dti_joint', 'verification_status_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il',\n", + " 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',\n", + " 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',\n", + " 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct',\n", + " 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',\n", + " 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',\n", + " 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',\n", + " 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',\n", + " 'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tot_hi_cred_lim',\n", + " 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'revol_bal_joint', 'sec_app_earliest_cr_line',\n", + " 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il',\n", + " 'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med',\n", + " 'sec_app_mths_since_last_major_derog', 'hardship_type', 'hardship_reason', 'hardship_status',\n", + " 'deferral_term', 'hardship_amount', 
'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date',\n", + " 'hardship_length', 'hardship_dpd', 'hardship_loan_status', 'orig_projected_additional_accrued_interest',\n", + " 'hardship_payoff_balance_amount', 'hardship_last_payment_amount', 'mths_since_recent_bc','desc', 'emp_title', 'title',\n", + " 'issue_d', 'earliest_cr_line', 'last_credit_pull_d',\n", + " 'debt_settlement_flag_date', 'settlement_date', 'last_pymnt_d', 'recoveries', 'collection_recovery_fee', 'total_rec_prncp',\n", + " 'last_pymnt_year', 'last_credit_pull_year', 'total_pymnt', 'total_pymnt_inv', 'debt_settlement_flag', 'settlement_status',\n", + " 'settlement_amount', 'settlement_percentage', 'settlement_term', 'addr_state']\n", + " return data.drop(features_to_remove, axis=1)\n", + "\n", + "def split_data(data, create_validation):\n", + " # divide into 3 datasets for training, validation and test\n", + " stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n", + " for train_set, test_set in stratified.split(data, data['label']):\n", + " train = data.iloc[train_set]\n", + " test = data.iloc[test_set] # 20% of data\n", + "\n", + " stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.9, random_state=42)\n", + " for train_set, test_set in stratified.split(train, train['label']):\n", + " model_train = data.iloc[train_set] # 40% of data\n", + " generalizer_train = data.iloc[test_set] # 40% of data\n", + "\n", + " validation = None\n", + " if create_validation:\n", + " stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)\n", + " for train_set, test_set in stratified.split(generalizer_train, generalizer_train['label']):\n", + " generalizer_train = data.iloc[train_set] # 30% of data\n", + " validation = data.iloc[test_set] # 10% of data\n", + " return train, test, generalizer_train, validation" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": 
[ + { + "data": { + "text/html": [ + "
| \n", + " | loan_amount | \n", + "funded_amount | \n", + "investor_funds | \n", + "term | \n", + "interest_rate | \n", + "installment | \n", + "grade | \n", + "sub_grade | \n", + "emp_length | \n", + "home_ownership | \n", + "... | \n", + "application_type | \n", + "acc_now_delinq | \n", + "chargeoff_within_12_mths | \n", + "delinq_amnt | \n", + "tax_liens | \n", + "hardship_flag | \n", + "disbursement_method | \n", + "year | \n", + "earliest_cr_year | \n", + "region | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 371269 | \n", + "28000 | \n", + "28000 | \n", + "28000.0 | \n", + "60 | \n", + "16.55 | \n", + "689.12 | \n", + "D | \n", + "D2 | \n", + "0 | \n", + "OWN | \n", + "... | \n", + "Individual | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "N | \n", + "Cash | \n", + "2015 | \n", + "1997 | \n", + "West | \n", + "
| 413904 | \n", + "15000 | \n", + "15000 | \n", + "15000.0 | \n", + "36 | \n", + "6.03 | \n", + "456.54 | \n", + "A | \n", + "A1 | \n", + "1 | \n", + "MORTGAGE | \n", + "... | \n", + "Individual | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "N | \n", + "Cash | \n", + "2015 | \n", + "2003 | \n", + "NorthEast | \n", + "
| 390183 | \n", + "8000 | \n", + "8000 | \n", + "8000.0 | \n", + "36 | \n", + "17.86 | \n", + "288.66 | \n", + "D | \n", + "D5 | \n", + "0 | \n", + "RENT | \n", + "... | \n", + "Individual | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "N | \n", + "Cash | \n", + "2015 | \n", + "2000 | \n", + "West | \n", + "
| 171520 | \n", + "4750 | \n", + "4750 | \n", + "4750.0 | \n", + "36 | \n", + "16.99 | \n", + "169.33 | \n", + "D | \n", + "D3 | \n", + "2 | \n", + "OWN | \n", + "... | \n", + "Individual | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "N | \n", + "Cash | \n", + "2015 | \n", + "1992 | \n", + "SouthEast | \n", + "
| 79658 | \n", + "3950 | \n", + "3950 | \n", + "3950.0 | \n", + "36 | \n", + "10.99 | \n", + "129.30 | \n", + "B | \n", + "B4 | \n", + "4 | \n", + "RENT | \n", + "... | \n", + "Individual | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "N | \n", + "Cash | \n", + "2015 | \n", + "2010 | \n", + "MidWest | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 397229 | \n", + "26500 | \n", + "26500 | \n", + "26500.0 | \n", + "60 | \n", + "8.67 | \n", + "545.87 | \n", + "B | \n", + "B1 | \n", + "10 | \n", + "MORTGAGE | \n", + "... | \n", + "Individual | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "N | \n", + "Cash | \n", + "2015 | \n", + "1994 | \n", + "SouthEast | \n", + "
| 110958 | \n", + "18900 | \n", + "18900 | \n", + "18900.0 | \n", + "36 | \n", + "5.32 | \n", + "569.17 | \n", + "A | \n", + "A1 | \n", + "0 | \n", + "RENT | \n", + "... | \n", + "Individual | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "N | \n", + "Cash | \n", + "2015 | \n", + "1999 | \n", + "SouthEast | \n", + "
| 91253 | \n", + "18000 | \n", + "18000 | \n", + "18000.0 | \n", + "60 | \n", + "11.53 | \n", + "396.14 | \n", + "B | \n", + "B5 | \n", + "0 | \n", + "RENT | \n", + "... | \n", + "Individual | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "N | \n", + "Cash | \n", + "2015 | \n", + "1987 | \n", + "NorthEast | \n", + "
| 61757 | \n", + "14450 | \n", + "14450 | \n", + "14375.0 | \n", + "60 | \n", + "23.99 | \n", + "415.62 | \n", + "F | \n", + "F3 | \n", + "1 | \n", + "MORTGAGE | \n", + "... | \n", + "Individual | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "N | \n", + "Cash | \n", + "2015 | \n", + "2006 | \n", + "MidWest | \n", + "
| 113397 | \n", + "12600 | \n", + "12600 | \n", + "12600.0 | \n", + "36 | \n", + "9.99 | \n", + "406.51 | \n", + "B | \n", + "B3 | \n", + "10 | \n", + "MORTGAGE | \n", + "... | \n", + "Individual | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "N | \n", + "Cash | \n", + "2015 | \n", + "1993 | \n", + "NorthEast | \n", + "
336876 rows × 43 columns
\n", + "