diff --git a/notebooks/minimization_loan.ipynb b/notebooks/minimization_loan.ipynb new file mode 100644 index 0000000..4f11475 --- /dev/null +++ b/notebooks/minimization_loan.ipynb @@ -0,0 +1,829 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Applying data minimization to loans data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial we will show how to perform data minimization for ML models using the minimization module.\n", + "\n", + "This will be demonstrated using the Loans dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load and preprocess data" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "sys.path.insert(0, os.path.abspath('..'))\n", + "\n", + "def modify_specific_features(data):\n", + " data = data.rename(\n", + " columns={\"loan_amnt\": \"loan_amount\", \"funded_amnt\": \"funded_amount\", \"funded_amnt_inv\": \"investor_funds\",\n", + " \"int_rate\": \"interest_rate\", \"annual_inc\": \"annual_income\"})\n", + "\n", + " date_format = '%b-%Y'\n", + " dt_series = pd.to_datetime(data['issue_d'], format=date_format)\n", + " data['year'] = dt_series.dt.year\n", + "\n", + " dt_series = pd.to_datetime(data['earliest_cr_line'], format=date_format)\n", + " data['earliest_cr_year'] = dt_series.dt.year\n", + "\n", + " dt_series = pd.to_datetime(data['last_credit_pull_d'], format=date_format)\n", + " data['last_credit_pull_year'] = dt_series.dt.year\n", + "\n", + " # TODO: Maybe year is not enough, we may want time since last payment? 
Or some other diff (time between last payment and X)\n", + " dt_series = pd.to_datetime(data['last_pymnt_d'], format=date_format)\n", + " data['last_pymnt_year'] = dt_series.dt.year\n", + "\n", + " data['interest_rate'] = data['interest_rate'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n", + "\n", + " data['revol_util'] = data['revol_util'].apply(lambda x: x if type(x) == float else float(x[:-1]))\n", + "\n", + " data['term'] = data['term'].apply(lambda x: x if type(x) == int else int(x[:-7]))\n", + "\n", + " data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x + 's' if x.endswith('year') else x)\n", + " data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: x[:-6])\n", + " data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 0.5 if '<' in x else x)\n", + " data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: 10 if '+' in x else x)\n", + " data['emp_length'] = data['emp_length'].astype(str).apply(lambda x: int(x) if x.isnumeric() else 0)\n", + "\n", + " data['zip_code'] = data['zip_code'].apply(lambda x: x if type(x) == int else int(x[:-2]))\n", + "\n", + " west = ['CA', 'OR', 'UT', 'WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']\n", + " south_west = ['AZ', 'TX', 'NM', 'OK']\n", + " south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN']\n", + " mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']\n", + " north_east = ['CT', 'NY', 'PA', 'NJ', 'RI', 'MA', 'MD', 'VT', 'NH', 'ME']\n", + "\n", + " data['region'] = np.nan\n", + "\n", + " def finding_regions(state):\n", + " if state in west:\n", + " return 'West'\n", + " elif state in south_west:\n", + " return 'SouthWest'\n", + " elif state in south_east:\n", + " return 'SouthEast'\n", + " elif state in mid_west:\n", + " return 'MidWest'\n", + " elif state in north_east:\n", + " return 'NorthEast'\n", + "\n", + " data['region'] = data['addr_state'].apply(finding_regions)\n", 
+ " return data\n", + "\n", + "def fill_missing(data):\n", + " for col in ('dti_joint', 'annual_inc_joint', 'il_util', 'mths_since_rcnt_il', 'open_acc_6m', 'open_il_12m',\n", + " 'open_il_24m', 'inq_last_12m', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'inq_fi',\n", + " 'total_cu_tl', 'loan_amount', 'funded_amount', 'investor_funds', 'term', 'interest_rate',\n", + " 'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq', 'total_bal_il',\n", + " 'tot_coll_amt', 'installment', 'emp_length', 'annual_income', 'zip_code', 'delinq_2yrs',\n", + " 'tot_cur_bal', 'total_rev_hi_lim', 'revol_util', 'collections_12_mths_ex_med', 'open_acc',\n", + " 'inq_last_6mths', 'pub_rec', 'revol_bal', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_rec_int',\n", + " 'verification_status_joint', 'acc_now_delinq', 'settlement_amount', 'settlement_percentage',\n", + " 'settlement_term', 'dti', 'total_rec_late_fee', 'policy_code', 'chargeoff_within_12_mths', 'total_rec_int',\n", + " 'last_credit_pull_year', 'last_pymnt_year', 'delinq_amnt', 'tax_liens', 'year', 'earliest_cr_year'):\n", + " try:\n", + " data[col] = data[col].fillna(0)\n", + " except KeyError:\n", + " print('missing column ' + col)\n", + "\n", + " for col in ('settlement_status', 'emp_title', 'region'):\n", + " try:\n", + " data[col] = data[col].fillna('NA')\n", + " except KeyError:\n", + " print('missing column ' + col)\n", + " \n", + " return data\n", + "\n", + "def modify_label(data):\n", + " bad_loan = [\"Charged Off\", \"Default\", \"Does not meet the credit policy. 
Status:Charged Off\", \"In Grace Period\",\n", + " \"Late (16-30 days)\", \"Late (31-120 days)\"]\n", + "\n", + " data['label'] = np.nan\n", + "\n", + " def loan_condition(status):\n", + " if status in bad_loan:\n", + " return 0\n", + " else:\n", + " return 1\n", + "\n", + " data['label'] = data['loan_status'].apply(loan_condition)\n", + " return data\n", + "\n", + "def remove_unwanted_features(data):\n", + " features_to_remove = ['loan_status', 'id', 'member_id', 'url', 'next_pymnt_d', 'mths_since_last_major_derog', 'annual_inc_joint',\n", + " 'dti_joint', 'verification_status_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il',\n", + " 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',\n", + " 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',\n", + " 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'mo_sin_old_il_acct',\n", + " 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',\n", + " 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',\n", + " 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl',\n", + " 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',\n", + " 'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tot_hi_cred_lim',\n", + " 'total_bal_ex_mort', 'total_bc_limit', 'total_il_high_credit_limit', 'revol_bal_joint', 'sec_app_earliest_cr_line',\n", + " 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il',\n", + " 'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med',\n", + " 'sec_app_mths_since_last_major_derog', 'hardship_type', 'hardship_reason', 'hardship_status',\n", + " 'deferral_term', 'hardship_amount', 
'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date',\n", + " 'hardship_length', 'hardship_dpd', 'hardship_loan_status', 'orig_projected_additional_accrued_interest',\n", + " 'hardship_payoff_balance_amount', 'hardship_last_payment_amount', 'mths_since_recent_bc','desc', 'emp_title', 'title',\n", + " 'issue_d', 'earliest_cr_line', 'last_credit_pull_d',\n", + " 'debt_settlement_flag_date', 'settlement_date', 'last_pymnt_d', 'recoveries', 'collection_recovery_fee', 'total_rec_prncp',\n", + " 'last_pymnt_year', 'last_credit_pull_year', 'total_pymnt', 'total_pymnt_inv', 'debt_settlement_flag', 'settlement_status',\n", + " 'settlement_amount', 'settlement_percentage', 'settlement_term', 'addr_state']\n", + " return data.drop(features_to_remove, axis=1)\n", + "\n", + "def split_data(data, create_validation):\n", + " # divide into 3 datasets for training, validation and test\n", + " stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n", + " for train_set, test_set in stratified.split(data, data['label']):\n", + " train = data.iloc[train_set]\n", + " test = data.iloc[test_set] # 20% of data\n", + "\n", + " stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.9, random_state=42)\n", + " for train_set, test_set in stratified.split(train, train['label']):\n", + " model_train = data.iloc[train_set] # 40% of data\n", + " generalizer_train = data.iloc[test_set] # 40% of data\n", + "\n", + " validation = None\n", + " if create_validation:\n", + " stratified = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)\n", + " for train_set, test_set in stratified.split(generalizer_train, generalizer_train['label']):\n", + " generalizer_train = data.iloc[train_set] # 30% of data\n", + " validation = data.iloc[test_set] # 10% of data\n", + " return train, test, generalizer_train, validation" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": 
[ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
loan_amountfunded_amountinvestor_fundsterminterest_rateinstallmentgradesub_gradeemp_lengthhome_ownership...application_typeacc_now_delinqchargeoff_within_12_mthsdelinq_amnttax_lienshardship_flagdisbursement_methodyearearliest_cr_yearregion
371269280002800028000.06016.55689.12DD20OWN...Individual0000NCash20151997West
413904150001500015000.0366.03456.54AA11MORTGAGE...Individual0000NCash20152003NorthEast
390183800080008000.03617.86288.66DD50RENT...Individual0000NCash20152000West
171520475047504750.03616.99169.33DD32OWN...Individual0000NCash20151992SouthEast
79658395039503950.03610.99129.30BB44RENT...Individual0000NCash20152010MidWest
..................................................................
397229265002650026500.0608.67545.87BB110MORTGAGE...Individual0000NCash20151994SouthEast
110958189001890018900.0365.32569.17AA10RENT...Individual0000NCash20151999SouthEast
91253180001800018000.06011.53396.14BB50RENT...Individual0000NCash20151987NorthEast
61757144501445014375.06023.99415.62FF31MORTGAGE...Individual0000NCash20152006MidWest
113397126001260012600.0369.99406.51BB310MORTGAGE...Individual0000NCash20151993NorthEast
\n", + "

336876 rows × 43 columns

\n", + "
" + ], + "text/plain": [ + " loan_amount funded_amount investor_funds term interest_rate \\\n", + "371269 28000 28000 28000.0 60 16.55 \n", + "413904 15000 15000 15000.0 36 6.03 \n", + "390183 8000 8000 8000.0 36 17.86 \n", + "171520 4750 4750 4750.0 36 16.99 \n", + "79658 3950 3950 3950.0 36 10.99 \n", + "... ... ... ... ... ... \n", + "397229 26500 26500 26500.0 60 8.67 \n", + "110958 18900 18900 18900.0 36 5.32 \n", + "91253 18000 18000 18000.0 60 11.53 \n", + "61757 14450 14450 14375.0 60 23.99 \n", + "113397 12600 12600 12600.0 36 9.99 \n", + "\n", + " installment grade sub_grade emp_length home_ownership ... \\\n", + "371269 689.12 D D2 0 OWN ... \n", + "413904 456.54 A A1 1 MORTGAGE ... \n", + "390183 288.66 D D5 0 RENT ... \n", + "171520 169.33 D D3 2 OWN ... \n", + "79658 129.30 B B4 4 RENT ... \n", + "... ... ... ... ... ... ... \n", + "397229 545.87 B B1 10 MORTGAGE ... \n", + "110958 569.17 A A1 0 RENT ... \n", + "91253 396.14 B B5 0 RENT ... \n", + "61757 415.62 F F3 1 MORTGAGE ... \n", + "113397 406.51 B B3 10 MORTGAGE ... \n", + "\n", + " application_type acc_now_delinq chargeoff_within_12_mths delinq_amnt \\\n", + "371269 Individual 0 0 0 \n", + "413904 Individual 0 0 0 \n", + "390183 Individual 0 0 0 \n", + "171520 Individual 0 0 0 \n", + "79658 Individual 0 0 0 \n", + "... ... ... ... ... \n", + "397229 Individual 0 0 0 \n", + "110958 Individual 0 0 0 \n", + "91253 Individual 0 0 0 \n", + "61757 Individual 0 0 0 \n", + "113397 Individual 0 0 0 \n", + "\n", + " tax_liens hardship_flag disbursement_method year earliest_cr_year \\\n", + "371269 0 N Cash 2015 1997 \n", + "413904 0 N Cash 2015 2003 \n", + "390183 0 N Cash 2015 2000 \n", + "171520 0 N Cash 2015 1992 \n", + "79658 0 N Cash 2015 2010 \n", + "... ... ... ... ... ... 
\n", + "397229 0 N Cash 2015 1994 \n", + "110958 0 N Cash 2015 1999 \n", + "91253 0 N Cash 2015 1987 \n", + "61757 0 N Cash 2015 2006 \n", + "113397 0 N Cash 2015 1993 \n", + "\n", + " region \n", + "371269 West \n", + "413904 NorthEast \n", + "390183 West \n", + "171520 SouthEast \n", + "79658 MidWest \n", + "... ... \n", + "397229 SouthEast \n", + "110958 SouthEast \n", + "91253 NorthEast \n", + "61757 MidWest \n", + "113397 NorthEast \n", + "\n", + "[336876 rows x 43 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.utils import shuffle\n", + "from sklearn.model_selection import StratifiedShuffleSplit\n", + "\n", + "input_file_path = \"/Users/abigailt/Desktop/Projects/mlPrivacy Projects/Minimization-Patent-DT/income/data/loan.csv\"\n", + "dataset = pd.read_csv(input_file_path, low_memory=False)\n", + "dataset = shuffle(dataset, random_state=14)\n", + "\n", + "dataset = modify_specific_features(dataset)\n", + "dataset = fill_missing(dataset)\n", + "dataset = modify_label(dataset)\n", + "dataset = remove_unwanted_features(dataset)\n", + " \n", + "train, test, generalizer_train, _ = split_data(dataset, False)\n", + "\n", + "x_train = train.drop('label', axis=1)\n", + "y_train = train['label']\n", + "x_test = test.drop('label', axis=1)\n", + "y_test = test['label']\n", + "\n", + "x_train" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train decision tree model" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Base model accuracy: 0.9442762322041345\n" + ] + } + ], + "source": [ + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.impute import SimpleImputer\n", + "from 
sklearn.pipeline import Pipeline\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "features = x_train.columns\n", + "categorical_features = ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose',\n", + " 'initial_list_status', 'application_type', 'hardship_flag', 'disbursement_method', 'region']\n", + "# QI parameter determines which features will be minimized.\n", + "QI = [\"annual_income\", \"zip_code\", \"dti\", \"last_pymnt_amnt\", \"total_rec_int\"]\n", + "\n", + "numeric_features = [f for f in features if f not in categorical_features]\n", + "numeric_transformer = Pipeline(\n", + " steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]\n", + ")\n", + "categorical_transformer = OneHotEncoder(handle_unknown=\"ignore\", sparse=False)\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " (\"num\", numeric_transformer, numeric_features),\n", + " (\"cat\", categorical_transformer, categorical_features),\n", + " ]\n", + ")\n", + "encoded_train = preprocessor.fit_transform(x_train)\n", + "model = DecisionTreeClassifier()\n", + "model.fit(encoded_train, y_train)\n", + "\n", + "encoded_test = preprocessor.transform(x_test)\n", + "print('Base model accuracy: ', model.score(encoded_test, y_test))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run minimization\n", + "We will try to run minimization with categorical features and only a subset of the features with different possible values of target accuracy (how close to the original model's accuracy we want to get, 1 being same accuracy as for original data)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250\n", + "Improving accuracy\n", + "feature to remove: zip_code\n", + "Removed feature: zip_code, new relative accuracy: 0.861250\n", + "feature to remove: total_rec_int\n", + "Removed feature: total_rec_int, new relative accuracy: 0.912500\n", + "feature to remove: last_pymnt_amnt\n", + "Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500\n", + "feature to remove: dti\n", + "Removed feature: dti, new relative accuracy: 0.995000\n", + "feature to remove: annual_income\n", + "Removed feature: annual_income, new relative accuracy: 1.000000\n", + "Accuracy on minimized data: 0.9425\n" + ] + } + ], + "source": [ + "from apt.minimization import GeneralizeToRepresentative\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# default target_accuracy is 0.998\n", + "minimizer = GeneralizeToRepresentative(model, \n", + " categorical_features=categorical_features, \n", + " features_to_minimize=QI,\n", + " encoder=preprocessor)\n", + "\n", + "# Fitting the minimizer can be done either on training or test data. 
Doing it with test data is better as the\n", + "# resulting accuracy on test data will be closer to the desired target accuracy (when working with training\n", + "# data it could result in a larger gap)\n", + "# Don't forget to leave a hold-out set for final validation!\n", + "generalizer_train_small = generalizer_train[:2000]\n", + "x_test_small = x_test[:2000]\n", + "y_test_small = y_test[:2000]\n", + "X_generalizer_train = generalizer_train_small.drop('label', axis=1)\n", + "features_names = features.tolist()\n", + "\n", + "encoded_generalizer_train = preprocessor.transform(X_generalizer_train)\n", + "x_train_predictions = model.predict(encoded_generalizer_train)\n", + "minimizer.fit(X_generalizer_train, x_train_predictions, features_names=features_names)\n", + "transformed = minimizer.transform(x_test_small, features_names=features_names)\n", + "\n", + "encoded_transformed = preprocessor.transform(transformed)\n", + "print('Accuracy on minimized data: ', model.score(encoded_transformed, y_test_small))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Let's see what features were generalized" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'ranges': {}, 'categories': {}, 'untouched': ['region', 'interest_rate', 'total_rec_late_fee', 'acc_now_delinq', 'home_ownership', 'dti', 'earliest_cr_year', 'last_pymnt_amnt', 'revol_bal', 'pymnt_plan', 'pub_rec', 'out_prncp', 'loan_amount', 'delinq_2yrs', 'mths_since_last_delinq', 'verification_status', 'investor_funds', 'purpose', 'mths_since_last_record', 'tax_liens', 'grade', 'annual_income', 'sub_grade', 'inq_last_6mths', 'total_acc', 'initial_list_status', 'zip_code', 'total_rec_int', 'application_type', 'emp_length', 'hardship_flag', 'revol_util', 'policy_code', 'term', 'installment', 'open_acc', 'out_prncp_inv', 
'chargeoff_within_12_mths', 'delinq_amnt', 'funded_amount', 'collections_12_mths_ex_med', 'disbursement_method', 'year']}\n" + ] + } + ], + "source": [ + "generalizations = minimizer.generalizations\n", + "print(generalizations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that for the default target accuracy of 0.998 of the original accuracy, no generalizations are possible (all features are left untouched, i.e., not generalized).\n", + "\n", + "Let's change to a slightly lower target accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial accuracy of model on generalized data, relative to original model predictions (base generalization derived from tree, before improvements): 0.861250\n", + "Improving accuracy\n", + "feature to remove: zip_code\n", + "Removed feature: zip_code, new relative accuracy: 0.861250\n", + "feature to remove: total_rec_int\n", + "Removed feature: total_rec_int, new relative accuracy: 0.912500\n", + "feature to remove: last_pymnt_amnt\n", + "Removed feature: last_pymnt_amnt, new relative accuracy: 0.987500\n", + "Accuracy on minimized data: 0.933\n", + "{'ranges': {'annual_income': [10500.0, 18000.0, 28000.0, 35250.0, 36500.0, 37500.0, 40500.0, 43750.0, 48600.0, 49650.0, 50800.0, 51000.0, 53000.0, 54500.0, 55280.0, 56500.0, 56712.5, 61000.0, 66994.5, 69494.5, 70500.0, 75000.0, 82500.0, 84500.0, 90000.0, 91000.0, 95000.0, 127500.0, 135000.0, 141500.0, 179500.0, 297679.5], 'dti': [4.054999828338623, 8.869999885559082, 12.130000114440918, 14.735000133514404, 15.625, 15.84000015258789, 15.984999656677246, 17.34500026702881, 17.664999961853027, 18.954999923706055, 19.020000457763672, 19.3700008392334, 19.545000076293945, 20.65999984741211, 20.78499984741211, 21.260000228881836, 22.40000057220459, 22.984999656677246, 23.179999351501465, 
23.1850004196167, 26.139999389648438, 26.460000038146973, 28.050000190734863, 28.375, 28.894999504089355, 30.414999961853027, 33.85000038146973, 34.46500015258789, 36.720001220703125]}, 'categories': {}, 'untouched': ['region', 'interest_rate', 'total_rec_late_fee', 'acc_now_delinq', 'home_ownership', 'earliest_cr_year', 'last_pymnt_amnt', 'revol_bal', 'pymnt_plan', 'pub_rec', 'out_prncp', 'loan_amount', 'delinq_2yrs', 'mths_since_last_delinq', 'verification_status', 'investor_funds', 'purpose', 'mths_since_last_record', 'tax_liens', 'grade', 'sub_grade', 'inq_last_6mths', 'total_acc', 'initial_list_status', 'zip_code', 'total_rec_int', 'application_type', 'emp_length', 'hardship_flag', 'revol_util', 'policy_code', 'term', 'installment', 'open_acc', 'out_prncp_inv', 'chargeoff_within_12_mths', 'delinq_amnt', 'funded_amount', 'collections_12_mths_ex_med', 'disbursement_method', 'year']}\n" + ] + } + ], + "source": [ + "# We allow a 2% deviation in accuracy from the original model accuracy\n", + "minimizer2 = GeneralizeToRepresentative(model, target_accuracy=0.98, \n", + " categorical_features=categorical_features, \n", + " features_to_minimize=QI,\n", + " encoder=preprocessor)\n", + "\n", + "minimizer2.fit(X_generalizer_train, x_train_predictions, features_names=features_names)\n", + "transformed2 = minimizer2.transform(x_test_small, features_names=features_names)\n", + "\n", + "encoded_transformed2 = preprocessor.transform(transformed2)\n", + "print('Accuracy on minimized data: ', model.score(encoded_transformed2, y_test_small))\n", + "generalizations2 = minimizer2.generalizations\n", + "print(generalizations2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Two features are generalized: annual income and debt to income ratio" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}