Loan Risk EDA¶

Sachin Shekhar
SachinShekhar@Outlook.com

In [ ]:
# Importing essential libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
# Settings for the workspace
pd.set_option('max_colwidth', 500)

Loading Data¶

In [ ]:
# Loading 'application_data.csv'
clients = pd.read_csv('application_data.csv')
clients.head()
Out[ ]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY ... FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
0 100002 1 Cash loans M N Y 0 202500.0 406597.5 24700.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0
1 100003 0 Cash loans F N N 0 270000.0 1293502.5 35698.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
2 100004 0 Revolving loans M Y Y 0 67500.0 135000.0 6750.0 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
3 100006 0 Cash loans F N Y 0 135000.0 312682.5 29686.5 ... 0 0 0 0 NaN NaN NaN NaN NaN NaN
4 100007 0 Cash loans M N Y 0 121500.0 513000.0 21865.5 ... 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 122 columns

In [ ]:
clients.shape
Out[ ]:
(307511, 122)
In [ ]:
# Loading 'previous_application.csv'
loans = pd.read_csv('previous_application.csv')
loans.head()
Out[ ]:
SK_ID_PREV SK_ID_CURR NAME_CONTRACT_TYPE AMT_ANNUITY AMT_APPLICATION AMT_CREDIT AMT_DOWN_PAYMENT AMT_GOODS_PRICE WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START ... NAME_SELLER_INDUSTRY CNT_PAYMENT NAME_YIELD_GROUP PRODUCT_COMBINATION DAYS_FIRST_DRAWING DAYS_FIRST_DUE DAYS_LAST_DUE_1ST_VERSION DAYS_LAST_DUE DAYS_TERMINATION NFLAG_INSURED_ON_APPROVAL
0 2030495 271877 Consumer loans 1730.430 17145.0 17145.0 0.0 17145.0 SATURDAY 15 ... Connectivity 12.0 middle POS mobile with interest 365243.0 -42.0 300.0 -42.0 -37.0 0.0
1 2802425 108129 Cash loans 25188.615 607500.0 679671.0 NaN 607500.0 THURSDAY 11 ... XNA 36.0 low_action Cash X-Sell: low 365243.0 -134.0 916.0 365243.0 365243.0 1.0
2 2523466 122040 Cash loans 15060.735 112500.0 136444.5 NaN 112500.0 TUESDAY 11 ... XNA 12.0 high Cash X-Sell: high 365243.0 -271.0 59.0 365243.0 365243.0 1.0
3 2819243 176158 Cash loans 47041.335 450000.0 470790.0 NaN 450000.0 MONDAY 7 ... XNA 12.0 middle Cash X-Sell: middle 365243.0 -482.0 -152.0 -182.0 -177.0 1.0
4 1784265 202054 Cash loans 31924.395 337500.0 404055.0 NaN 337500.0 THURSDAY 9 ... XNA 24.0 high Cash Street: high NaN NaN NaN NaN NaN NaN

5 rows × 37 columns

In [ ]:
loans.shape
Out[ ]:
(1670214, 37)
In [ ]:
# Loading 'columns_description.csv'
# Loading data dictionary for easy exploration of features
data_dict = pd.read_csv('columns_description.csv', encoding='cp1252')
data_dict.head()
Out[ ]:
Unnamed: 0 Table Row Description Special
0 1 application_data SK_ID_CURR ID of loan in our sample NaN
1 2 application_data TARGET Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases) NaN
2 5 application_data NAME_CONTRACT_TYPE Identification if loan is cash or revolving NaN
3 6 application_data CODE_GENDER Gender of the client NaN
4 7 application_data FLAG_OWN_CAR Flag if the client owns a car NaN
In [ ]:
# Minor aethetic changes to data dictionary
data_dict['Table'] = data_dict['Table'].replace('application_data', 'clients')
data_dict['Table'] = data_dict['Table'].replace('previous_application.csv', 'loans')
data_dict = data_dict.iloc[:, 1:]
data_dict.rename(columns={'Row': 'Column'}, inplace=True)
data_dict.head()
Out[ ]:
Table Column Description Special
0 clients SK_ID_CURR ID of loan in our sample NaN
1 clients TARGET Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases) NaN
2 clients NAME_CONTRACT_TYPE Identification if loan is cash or revolving NaN
3 clients CODE_GENDER Gender of the client NaN
4 clients FLAG_OWN_CAR Flag if the client owns a car NaN
In [ ]:
# Add easy way to access column descriptions
def describe(table, col):
  return data_dict[(data_dict['Table'] == table) & (data_dict['Column'] == col)]

class Table:
  def __init__(self, table_name):
    self.table_name = table_name
    self.columns = []
    self.col_index = -1
    if table_name == 'clients':
      self.columns = clients.columns
    elif table_name == 'loans':
      self.columns = loans.columns
  def describe_next(self):
    self.col_index += 1
    return describe(self.table_name, self.columns[self.col_index])
  def describe(self, col_name):
    return describe(self.table_name, col_name)

Cleaning Data¶

Duplicate Records¶

In [ ]:
# Checking for duplicate rows
clients.duplicated().sum(), loans.duplicated().sum()
Out[ ]:
(0, 0)

Missing Values¶

In [ ]:
# Checking missing values from clients data frame
clients_missing = clients.isnull().mean() * 100
clients_missing[clients_missing > 0]
Out[ ]:
AMT_ANNUITY                    0.003902
AMT_GOODS_PRICE                0.090403
NAME_TYPE_SUITE                0.420148
OWN_CAR_AGE                   65.990810
OCCUPATION_TYPE               31.345545
                                ...    
AMT_REQ_CREDIT_BUREAU_DAY     13.501631
AMT_REQ_CREDIT_BUREAU_WEEK    13.501631
AMT_REQ_CREDIT_BUREAU_MON     13.501631
AMT_REQ_CREDIT_BUREAU_QRT     13.501631
AMT_REQ_CREDIT_BUREAU_YEAR    13.501631
Length: 67, dtype: float64
In [ ]:
# Checking columns which have more than 35% missing values
clients_missing[clients_missing > 35].index
Out[ ]:
Index(['OWN_CAR_AGE', 'EXT_SOURCE_1', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG',
       'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG',
       'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG',
       'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG',
       'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE',
       'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE',
       'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE',
       'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',
       'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
       'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
       'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI',
       'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI',
       'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI',
       'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
       'TOTALAREA_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
      dtype='object')

After consulting data dictionary, the columns having more than 35% missing values don't seem important for analysis. So, I am dropping them.

In [ ]:
clients.drop(clients_missing[clients_missing > 35].index, axis=1, inplace=True)
In [ ]:
clients.shape
Out[ ]:
(307511, 73)
In [ ]:
# Checking missing values from loans data frame
loans_missing = loans.isnull().mean() * 100
loans_missing[loans_missing > 0]
Out[ ]:
AMT_ANNUITY                  22.286665
AMT_CREDIT                    0.000060
AMT_DOWN_PAYMENT             53.636480
AMT_GOODS_PRICE              23.081773
RATE_DOWN_PAYMENT            53.636480
RATE_INTEREST_PRIMARY        99.643698
RATE_INTEREST_PRIVILEGED     99.643698
NAME_TYPE_SUITE              49.119754
CNT_PAYMENT                  22.286366
PRODUCT_COMBINATION           0.020716
DAYS_FIRST_DRAWING           40.298129
DAYS_FIRST_DUE               40.298129
DAYS_LAST_DUE_1ST_VERSION    40.298129
DAYS_LAST_DUE                40.298129
DAYS_TERMINATION             40.298129
NFLAG_INSURED_ON_APPROVAL    40.298129
dtype: float64
In [ ]:
# Checking columns which have more than 35% missing values
loans_missing[loans_missing > 35]
Out[ ]:
AMT_DOWN_PAYMENT             53.636480
RATE_DOWN_PAYMENT            53.636480
RATE_INTEREST_PRIMARY        99.643698
RATE_INTEREST_PRIVILEGED     99.643698
NAME_TYPE_SUITE              49.119754
DAYS_FIRST_DRAWING           40.298129
DAYS_FIRST_DUE               40.298129
DAYS_LAST_DUE_1ST_VERSION    40.298129
DAYS_LAST_DUE                40.298129
DAYS_TERMINATION             40.298129
NFLAG_INSURED_ON_APPROVAL    40.298129
dtype: float64

After consulting data dictionary, the columns having more than 35% missing values don't seem important for analysis. So, I am dropping them.

In [ ]:
loans.drop(loans_missing[loans_missing > 35].index, axis=1, inplace=True)
In [ ]:
loans.shape
Out[ ]:
(1670214, 26)

Non-essential Features¶

FLAGDOCUMENT* features are completely unnecssary for analysis. So, I am dropping them.

In [ ]:
clients.drop(list(filter(lambda x: True if x.startswith('FLAG_DOCUMENT_') else False, clients.columns)), axis=1, inplace=True)
In [ ]:
clients.shape
Out[ ]:
(307511, 53)
In [ ]:
clients.columns
Out[ ]:
Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE',
       'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
       'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL',
       'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS',
       'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
       'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
       'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
       'ORGANIZATION_TYPE', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
       'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
       'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
       'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object')

Based on Data Dictionary, I am dropping irrelevant or less important columns from clients DataFrame.

In [ ]:
clients_cols_to_drop = ['NAME_TYPE_SUITE', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_LAST_PHONE_CHANGE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'FLAG_CONT_MOBILE', 'FLAG_EMAIL', 'FLAG_PHONE', 'FLAG_WORK_PHONE', 'FLAG_EMP_PHONE', 'FLAG_MOBIL']

clients.drop(clients_cols_to_drop, axis=1, inplace=True)
In [ ]:
clients.shape
Out[ ]:
(307511, 28)
In [ ]:
loans.shape
Out[ ]:
(1670214, 26)
In [ ]:
loans.columns
Out[ ]:
Index(['SK_ID_PREV', 'SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'AMT_ANNUITY',
       'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_GOODS_PRICE',
       'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
       'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY',
       'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'DAYS_DECISION',
       'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_CLIENT_TYPE',
       'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE',
       'CHANNEL_TYPE', 'SELLERPLACE_AREA', 'NAME_SELLER_INDUSTRY',
       'CNT_PAYMENT', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION'],
      dtype='object')

Based on Data Dictionary, I am dropping irrelevant or less important columns from loans DataFrame.

In [ ]:
loans_cols_to_drop = ['SK_ID_PREV', 'AMT_GOODS_PRICE', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY', 'DAYS_DECISION','NAME_CASH_LOAN_PURPOSE', 'CODE_REJECT_REASON', 'NAME_PORTFOLIO', 'SELLERPLACE_AREA', 'NAME_PRODUCT_TYPE', 'NAME_SELLER_INDUSTRY', 'PRODUCT_COMBINATION']

loans.drop(loans_cols_to_drop, axis=1, inplace=True)
In [ ]:
loans.shape
Out[ ]:
(1670214, 12)

Exploring Data & Univariate Analysis¶

Features of Clients¶

In [ ]:
# Initialising Table class to pull column description from data dictionary
clients_cols = Table('clients')
loans_cols = Table('loans')
In [ ]:
# Helper function for Categorical Univariate Analysis
def clients_cat_uni_analysis(col, friendly_name):
  tmp_df = clients[col].value_counts(normalize=True).mul(100).rename('Percent').reset_index().rename(columns={'index': col})
  ax = sns.barplot(data=tmp_df, x=col, y='Percent')
  for p in ax.patches:
    txt = str(p.get_height().round(2)) + '%'
    txt_x = p.get_x() + (p.get_width()/2)
    txt_y = p.get_height()
    ax.annotate(txt, (txt_x, txt_y), size=11, ha='center', va='bottom')
  ax.set_title(f'{friendly_name} Univariate Analysis (Normalised)', fontsize=20)
  ax.set_xlabel(friendly_name)
  ax.figure.set_size_inches(16,8)
  return ax
In [ ]:
# Helper function for Categorical Segmented Univariate Analysis
def clients_cat_seg_analysis(col, friendly_name):
  tmp_df = clients.groupby(col)['TARGET'].value_counts(normalize=True).mul(100).rename('Percent').reset_index().replace(0,'Other Cases').replace(1,'Payment Difficulties')
  ax = sns.barplot(data=tmp_df, x=col, y='Percent', hue='TARGET')
  for p in ax.patches:
    txt = str(p.get_height().round(2)) + '%'
    txt_x = p.get_x() + (p.get_width()/2)
    txt_y = p.get_height()
    ax.annotate(txt, (txt_x, txt_y), size=11, ha='center', va='bottom')
  ax.set_title(f'{friendly_name} Segmented Univariate Analysis (Normalised)', fontsize=20)
  ax.set_xlabel(friendly_name)
  ax.figure.set_size_inches(16,8)
  return ax

Feature: 'SK_ID_CURR'¶

In [ ]:
# Describing first feature
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
0 clients SK_ID_CURR ID of loan in our sample NaN
In [ ]:
clients.SK_ID_CURR.dtype, clients.SK_ID_CURR.isnull().sum()
Out[ ]:
(dtype('int64'), 0)
In [ ]:
loans_cols.describe('SK_ID_CURR')
Out[ ]:
Table Column Description Special
123 loans SK_ID_CURR ID of loan in our sample hashed

Assessment:
From what I can gather at this point, SK_ID_CURR can be used to join clients and loans tables. Otherwise, it has no significance in data analysis.

Feature: 'TARGET'¶

In [ ]:
# Describing next feature
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
1 clients TARGET Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases) NaN
In [ ]:
clients.TARGET.dtype, clients.TARGET.isnull().sum()
Out[ ]:
(dtype('int64'), 0)
In [ ]:
clients.TARGET.value_counts(normalize=True) * 100
Out[ ]:
0    91.927118
1     8.072882
Name: TARGET, dtype: float64
In [ ]:
ax = sns.countplot(y=clients.TARGET)
plt.yticks([0,1], ['Other Cases', 'Payment Difficulties'], rotation=45)
ax.set_title('Target Variable Data Imbalance', fontsize=20)
ax.figure.set_size_inches(10,4)
plt.show()

Assessment:

  • TARGET is a categorical variable which is also the target variable for analysis.
  • 8% clients had payment difficulties in the past.
  • Data imbalance is in the favour of the bank.

Feature: 'NAME_CONTRACT_TYPE'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
2 clients NAME_CONTRACT_TYPE Identification if loan is cash or revolving NaN
In [ ]:
clients_cat_uni_analysis('NAME_CONTRACT_TYPE', 'Loan Type')
plt.show()
In [ ]:
clients_cat_seg_analysis('NAME_CONTRACT_TYPE', 'Loan Type')
plt.show()

Assessment:

  • Cash loans are more popular with over 90% clients using it.
  • Clients who take cash loans have more payment difficulties.

Feature: CODE_GENDER¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
3 clients CODE_GENDER Gender of the client NaN
In [ ]:
clients.CODE_GENDER.value_counts(normalize=True) * 100
Out[ ]:
F      65.834393
M      34.164306
XNA     0.001301
Name: CODE_GENDER, dtype: float64
In [ ]:
# Checking if the rows containing XNA gender have issues
clients[clients.CODE_GENDER == 'XNA']
Out[ ]:
SK_ID_CURR TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY ... OCCUPATION_TYPE CNT_FAM_MEMBERS REGION_RATING_CLIENT ORGANIZATION_TYPE AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
35657 141289 0 Revolving loans XNA Y Y 0 207000.0 382500.0 19125.0 ... NaN 2.0 3 Business Entity Type 1 0.0 0.0 0.0 0.0 0.0 1.0
38566 144669 0 Revolving loans XNA N Y 2 157500.0 270000.0 13500.0 ... Low-skill Laborers 4.0 2 Industry: type 3 0.0 0.0 0.0 3.0 0.0 4.0
83382 196708 0 Revolving loans XNA N Y 1 135000.0 405000.0 20250.0 ... Core staff 3.0 2 Kindergarten 0.0 0.0 0.0 0.0 0.0 3.0
189640 319880 0 Revolving loans XNA Y Y 0 247500.0 540000.0 27000.0 ... NaN 2.0 2 Medicine 0.0 0.0 0.0 0.0 1.0 6.0

4 rows × 28 columns

In [ ]:
# Replacing XNA with NaN
clients.CODE_GENDER = clients.CODE_GENDER.replace('XNA', np.NaN)
In [ ]:
ax = clients_cat_uni_analysis('CODE_GENDER', 'Gender')
ax.set_xticklabels(['Female', 'Male'])
plt.show()
In [ ]:
ax = clients_cat_seg_analysis('CODE_GENDER', 'Gender')
ax.set_xticklabels(['Female', 'Male'])
plt.show()

Assessment:

  • Over 65% clients are females.
  • Males have more payment difficulties than females. This is an important observation which isn't visible without normalised visualisation.

Feature: 'FLAG_OWN_CAR'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
4 clients FLAG_OWN_CAR Flag if the client owns a car NaN
In [ ]:
ax = clients_cat_uni_analysis('FLAG_OWN_CAR', 'Car Ownership')
ax.set_xticklabels(['No', 'Yes'])
plt.show()
In [ ]:
ax = clients_cat_seg_analysis('FLAG_OWN_CAR', 'Car Ownership')
ax.set_xticklabels(['No', 'Yes'])
plt.show()

Assessment:

  • 66% clients don't own a car.
  • Car ownership doesn't tell anything strong about payment difficulties. Although, those clients who don't own a car have little bit more payment difficulties on average.

Feature: 'FLAG_OWN_REALTY'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
5 clients FLAG_OWN_REALTY Flag if client owns a house or flat NaN
In [ ]:
ax = clients_cat_uni_analysis('FLAG_OWN_REALTY', 'House/Flat Ownership')
ax.set_xticklabels(['Yes', 'No'])
plt.show()
In [ ]:
ax = clients_cat_seg_analysis('FLAG_OWN_REALTY', 'House/Flat Ownership')
ax.set_xticklabels(['Yes', 'No'])
plt.show()

Assessment:

  • 69% clients own a house/flat.
  • House/flat ownership doesn't tell anything strong about payment difficulties. Although, those who own house/flat have little bit more payment difficulties.

Feature: 'CNT_CHILDREN'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
6 clients CNT_CHILDREN Number of children the client has NaN
In [ ]:
clients.CNT_CHILDREN.unique()
Out[ ]:
array([ 0,  1,  2,  3,  4,  7,  5,  6,  8,  9, 11, 12, 10, 19, 14])

Binning this into buckets for easy analysis

In [ ]:
clients.CNT_CHILDREN = pd.cut(clients.CNT_CHILDREN, bins=[0,1,3,6,11,20], labels=['No', 'Low', 'Medium', 'High', 'Very High'], right=False)
clients.CNT_CHILDREN.value_counts()
Out[ ]:
No           215371
Low           87868
Medium         4230
High             34
Very High         8
Name: CNT_CHILDREN, dtype: int64
In [ ]:
clients_cat_uni_analysis('CNT_CHILDREN', 'Children Count')
plt.show()
In [ ]:
clients_cat_seg_analysis('CNT_CHILDREN', 'Children Count')
plt.show()

Assessment:

  • 70% clients have no children.
  • Clients with high number of children have more payment difficulties.

Feature: 'AMT_INCOME_TOTAL'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
7 clients AMT_INCOME_TOTAL Income of the client NaN
In [ ]:
ax = sns.boxplot(x = clients.AMT_INCOME_TOTAL)
ax.set_xlabel('Income')
ax.set_title('Income Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
# Making a sense of income distribution of majority of customers
income_minus_outliers = clients.loc[clients.AMT_INCOME_TOTAL<300000,:]
ax = sns.boxplot(data= income_minus_outliers, x='AMT_INCOME_TOTAL')
ax.set_xticks(list(income_minus_outliers.AMT_INCOME_TOTAL.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Income')
ax.set_title('Income Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
income_minus_outliers = clients[~(clients.AMT_INCOME_TOTAL > 300000)]
ax = sns.boxplot(data=income_minus_outliers, x='TARGET', y = 'AMT_INCOME_TOTAL')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Income')
ax.set_title('Income Distribution Segmented Univariate Analysis')
ax.figure.set_size_inches(12,12)
plt.show()
In [ ]:
ax = sns.histplot(data=income_minus_outliers ,x='AMT_INCOME_TOTAL', bins=20, stat='percent', hue='TARGET', multiple='stack')
ax.set_xticks(np.linspace(0,300000,21))
ax.legend(['Other Cases', 'Payment Difficulties'])
ax.set_xlabel('Income')
ax.set_title('Income Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.histplot(data=income_minus_outliers ,x='AMT_INCOME_TOTAL', bins=20, stat='percent', hue='TARGET', multiple='fill')
ax.set_xticks(np.linspace(0,300000,21))
ax.legend(['Other Cases', 'Payment Difficulties'])
ax.set_xlabel('Income')
ax.set_title('Income Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()

Assessment:

  • There are large number of outliers at the higher end, but as high income individuals can be valued customers, they shouldn't be removed.
  • For majority of clients, income median is 135000.
  • Income doesn't strongly affect payment difficulty.

Feature: 'AMT_CREDIT'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
8 clients AMT_CREDIT Credit amount of the loan NaN
In [ ]:
ax = sns.boxplot(x = clients.AMT_CREDIT)
ax.set_xticks(list(clients.AMT_CREDIT.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Loan Credit (in 100,000)')
ax.set_title('Loan Credit Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients, x='TARGET', y='AMT_CREDIT')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Loan Credit (in 100,000)')
ax.set_title('Loan Credit Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
In [ ]:
ax = sns.histplot(data=clients, x='AMT_CREDIT', hue='TARGET', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Loan Credit (in 100,000)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Loan Credit Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.histplot(data=clients, x='AMT_CREDIT', hue='TARGET', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Loan Credit (in 100,000)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Loan Credit Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()

Assessment:

  • Median of credit amount is approx 50000 and its maximum frequency exists around 25000.
  • Lower loan credit implies higher payment difficulty.

Feature: 'AMT_ANNUITY'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
9 clients AMT_ANNUITY Loan annuity NaN
In [ ]:
ax = sns.boxplot(x = clients.AMT_ANNUITY)
ax.set_xticks(list(clients.AMT_ANNUITY.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_ANNUITY<80000,:], x='TARGET', y='AMT_ANNUITY')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
In [ ]:
ax = sns.histplot(data=clients, x='AMT_ANNUITY', hue='TARGET', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Loan Annuity')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Loan Annuity Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.histplot(data=clients, x='AMT_ANNUITY', hue='TARGET', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Loan Annuity')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Loan Annuity Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()

Assessment:

  • Loan Annuity has a median of approx 25000 and its maximum frequency also exists around 25000.
  • Lower loan annuity implies higher payment difficulty.

Feature: 'AMT_GOODS_PRICE'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
10 clients AMT_GOODS_PRICE For consumer loans it is the price of the goods for which the loan is given NaN
In [ ]:
ax = sns.boxplot(x = clients.AMT_GOODS_PRICE)
ax.set_xticks(list(clients.AMT_GOODS_PRICE.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Price of Goods (in 100,000)')
ax.set_title('Price of Goods (for Consumer Loans) Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients, x='TARGET', y='AMT_GOODS_PRICE')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Price of Goods (in 100,000)')
ax.set_title('Price of Goods (for Consumer Loans) Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
In [ ]:
ax = sns.histplot(data=clients, x='AMT_GOODS_PRICE', hue='TARGET', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Region Population (Normalised)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Price of Goods (for Consumer Loans) Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.histplot(data=clients, x='AMT_GOODS_PRICE', hue='TARGET', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Region Population (Normalised)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Price of Goods (for Consumer Loans) Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()

Assessment:

  • Price of goods for which consumer loan was applied has a median of approx 450,000.
  • Lower the price of goods implies higher payment difficulties.

Feature: 'NAME_INCOME_TYPE'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
12 clients NAME_INCOME_TYPE Clients income type (businessman, working, maternity leave,…) NaN
In [ ]:
clients_cat_uni_analysis('NAME_INCOME_TYPE', 'Income Type')
plt.show()
In [ ]:
clients_cat_seg_analysis('NAME_INCOME_TYPE', 'Income Type')
plt.show()

Assessment:

  • Most of the clients are working professionals.
  • Clients who are on Maternity Leave or are unemployed have highest payment difficulties on average.
  • Although Businessman and Student never show payment difficulties, we don't have large sample of them to deduce anything.
  • Pensioners and State Servants have least payment difficulties on average.

Feature: 'NAME_EDUCATION_TYPE'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
13 clients NAME_EDUCATION_TYPE Level of highest education the client achieved NaN
In [ ]:
clients_cat_uni_analysis('NAME_EDUCATION_TYPE', 'Highest Education Level')
plt.show()
In [ ]:
clients_cat_seg_analysis('NAME_EDUCATION_TYPE', 'Highest Education Level')
plt.show()

Assessment:

  • Most of the clients have secondary education followed by higher education.
  • Clients with low level of education have more difficulties with payment.

Feature: 'NAME_FAMILY_STATUS'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
14 clients NAME_FAMILY_STATUS Family status of the client NaN
In [ ]:
clients.NAME_FAMILY_STATUS.value_counts()
Out[ ]:
Married                 196432
Single / not married     45444
Civil marriage           29775
Separated                19770
Widow                    16088
Unknown                      2
Name: NAME_FAMILY_STATUS, dtype: int64

For this analysis, the difference between 'Married' and 'Civil marriage' holds no value. Family expenses should be same. So, I am merging these two categories.

In [ ]:
clients.NAME_FAMILY_STATUS = clients.NAME_FAMILY_STATUS.replace('Civil marriage', 'Married')
In [ ]:
# Converting unknown to NaN
clients.NAME_FAMILY_STATUS = clients.NAME_FAMILY_STATUS.replace('Unknown', np.NaN)
In [ ]:
clients_cat_uni_analysis('NAME_FAMILY_STATUS', 'Relationship Status')
plt.show()
In [ ]:
clients_cat_seg_analysis('NAME_FAMILY_STATUS', 'Relationship Status')
plt.show()

Assessment:

  • Most of the clients are married followed by unmarried singles.
  • Unmarried singles have most difficulties with payment.
  • Married clients do well. Widowed clients do best.

Feature: 'NAME_HOUSING_TYPE'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
15 clients NAME_HOUSING_TYPE What is the housing situation of the client (renting, living with parents, ...) NaN
In [ ]:
clients_cat_uni_analysis('NAME_HOUSING_TYPE', 'Housing Type')
plt.show()
In [ ]:
clients_cat_seg_analysis('NAME_HOUSING_TYPE', 'Housing Type')
plt.show()

Assessment:

  • Most of the clients live in house/ apartment.
  • Clients living with parents or in rented apartments have highest difficulties with payment.
  • Clients living in office apartment do well.

Feature: 'REGION_POPULATION_RELATIVE'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
16 clients REGION_POPULATION_RELATIVE Normalized population of region where client lives (higher number means the client lives in more populated region) normalized
In [ ]:
ax = sns.boxplot(x = clients.REGION_POPULATION_RELATIVE)
ax.set_xticks(list(clients.REGION_POPULATION_RELATIVE.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Region Population (Normalised)')
ax.set_title('Region Population (Normalised) Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.REGION_POPULATION_RELATIVE<0.05,:], x='TARGET', y='REGION_POPULATION_RELATIVE')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Region Population (Normalised)')
ax.set_title('Region Population (Normalised) Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
In [ ]:
ax = sns.histplot(data=clients, x='REGION_POPULATION_RELATIVE', hue='TARGET', bins=15, stat='percent', multiple='stack')
ax.set_xlabel('Region Population (Normalised)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Region Population (Normalised) Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.histplot(data=clients, x='REGION_POPULATION_RELATIVE', hue='TARGET', bins=15, stat='percent', multiple='fill')
ax.set_xlabel('Region Population (Normalised)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Region Population (Normalised) Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()

Assessment:

  • Most of the clients live in less populated regions.
  • Population of clients' regions doesn't tell anything strong about payment difficulties.

Feature: 'DAYS_BIRTH'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
17 clients DAYS_BIRTH Client's age in days at the time of application time only relative to the application
In [ ]:
# Converting it to absolute age in years
clients['AGE'] = clients.DAYS_BIRTH.abs() / 365
In [ ]:
ax = sns.boxplot(x = clients.AGE)
ax.set_xticks(list(clients.AGE.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('AGE')
ax.set_title('Age Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients, x='TARGET', y='AGE')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Age')
ax.set_title('Age Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
In [ ]:
# Creating buckets for better analysis
clients['AGE_BUCKET'] = pd.cut(clients.AGE, bins=range(20,71,10), labels=['20-30', '30-40', '40-50','50-60', '60-70'], right=False)
In [ ]:
clients_cat_uni_analysis('AGE_BUCKET', 'Age Bucket')
plt.show()
In [ ]:
clients_cat_seg_analysis('AGE_BUCKET', 'Age Bucket')
plt.show()

Assesment:

  • Clients age vary from 20 to 69. Most clients belong to 30 - 40 years bucket. Median age is 43 years.
  • Clients with less age have more difficulties with payment.

Feature: 'DAYS_EMPLOYED'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
18 clients DAYS_EMPLOYED How many days before the application the person started current employment time only relative to the application
In [ ]:
# Converting it to years employed
clients['YEARS_EMPLOYED'] = clients.DAYS_EMPLOYED.abs() / 365
In [ ]:
ax = sns.boxplot(x = clients.YEARS_EMPLOYED)
ax.figure.set_size_inches(24,8)
plt.show()

There's an anomaly. Nobody can be employed for 1000 years. This is definitely missing value.

In [ ]:
clients.loc[clients.YEARS_EMPLOYED > 100, 'YEARS_EMPLOYED'] = np.NaN
In [ ]:
ax = sns.boxplot(x = clients.YEARS_EMPLOYED)
ax.set_xticks(list(clients.YEARS_EMPLOYED.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Years Employed')
ax.set_title('Years Employed Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.YEARS_EMPLOYED < 20,:], x='TARGET', y='YEARS_EMPLOYED')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Years Employed')
ax.set_title('Years Employed Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
In [ ]:
# Creating buckets for better analysis
clients['EMPLOYED_EXPERIENCE'] = pd.cut(clients.YEARS_EMPLOYED, bins=[0,1,5,10,20,100], labels=['Fresher', 'Associate', 'Senior','Master', 'Grand Master'], right=False)
In [ ]:
clients_cat_uni_analysis('EMPLOYED_EXPERIENCE', 'Job Experience')
plt.show()
In [ ]:
clients_cat_seg_analysis('EMPLOYED_EXPERIENCE', 'Job Experience')
plt.show()

Assessment:

  • More clients have less job experience. Median job experience is 4.5 years.
  • Clients with less years of job experience have more difficulties with payment.

Feature: 'OCCUPATION_TYPE'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
28 clients OCCUPATION_TYPE What kind of occupation does the client have NaN
In [ ]:
ax = clients_cat_uni_analysis('OCCUPATION_TYPE', 'Occupation Type')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()
In [ ]:
ax = clients_cat_seg_analysis('OCCUPATION_TYPE', 'Occupation Type')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()

Assessment:

  • Majority of clients are laborers, followed by sales staff.
  • HR staff and IT staff constitute least of clients.
  • Low-skill laborers have highest difficulties in payment, followed by drivers, waiters/barmen, security staff, laborers and cooking staff.

Feature: 'CNT_FAM_MEMBERS'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
29 clients CNT_FAM_MEMBERS How many family members does client have NaN
In [ ]:
clients.CNT_FAM_MEMBERS.value_counts()
Out[ ]:
2.0     158357
1.0      67847
3.0      52601
4.0      24697
5.0       3478
6.0        408
7.0         81
8.0         20
9.0          6
10.0         3
14.0         2
12.0         2
20.0         2
16.0         2
13.0         1
15.0         1
11.0         1
Name: CNT_FAM_MEMBERS, dtype: int64
In [ ]:
# Binning number of family members into family size for easy analysis
clients.CNT_FAM_MEMBERS = clients.CNT_FAM_MEMBERS.replace(1.0, 'Single').replace(2.0, 'Couple').replace([3.0,4.0,5.0], 'Small Family').replace([6.0,7.0,8.0,9.0,10.0], 'Mid Family').replace([11.0,12.0,13.0,14.0,15.0,16.0,20.0], 'Big Family')
clients.CNT_FAM_MEMBERS.value_counts()
Out[ ]:
Couple          158357
Small Family     80776
Single           67847
Mid Family         518
Big Family          11
Name: CNT_FAM_MEMBERS, dtype: int64
In [ ]:
clients_cat_uni_analysis('CNT_FAM_MEMBERS', 'Family Size')
plt.show()
In [ ]:
clients_cat_seg_analysis('CNT_FAM_MEMBERS', 'Family Size')
plt.show()

Assessment:

  • Most of the clients have 2 family members.
  • Clients having more than 10 family members have highest payment difficulties followed by clients having 5-10 family members.
  • Clients have 2 family members do well.

Feature: 'REGION_RATING_CLIENT'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
30 clients REGION_RATING_CLIENT Our rating of the region where client lives (1,2,3) NaN
In [ ]:
clients_cat_uni_analysis('REGION_RATING_CLIENT', 'Region Rating')
plt.show()
In [ ]:
clients_cat_seg_analysis('REGION_RATING_CLIENT', 'Region Rating')
plt.show()

Assessment:

  • Clients living in a region with rating 3 have highest difficulties in payment.

Feature: 'ORGANIZATION_TYPE'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
40 clients ORGANIZATION_TYPE Type of organization where client works NaN
In [ ]:
len(clients.ORGANIZATION_TYPE.value_counts())
Out[ ]:
58
In [ ]:
# Reducing the number of unique values
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.apply(lambda x: 'Business Entity' if x.startswith('Business Entity') else x)
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.apply(lambda x: 'Trade' if x.startswith('Trade') else x)
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.apply(lambda x: 'Transport' if x.startswith('Transport') else x)
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.apply(lambda x: 'Industry' if x.startswith('Industry') else x)
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.apply(lambda x: 'Trade' if x.startswith('Trade') else x)
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.replace(['XNA', 'Other'], np.NaN)
In [ ]:
ax = clients_cat_uni_analysis('ORGANIZATION_TYPE', 'Organisation Type')
ax.figure.set_size_inches(24, 12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()
In [ ]:
ax = clients_cat_seg_analysis('ORGANIZATION_TYPE', 'Organisation Type')
ax.figure.set_size_inches(24, 12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()

Assessment:

  • Most of the clients work at business entity or are self-employed.
  • Clients who work in restaurant, construction, cleaning, agriculture or is self employed have difficulties in payment.

Feature: 'AMT_REQ_CREDIT_BUREAU_HOUR'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
116 clients AMT_REQ_CREDIT_BUREAU_HOUR Number of enquiries to Credit Bureau about the client one hour before application NaN
In [ ]:
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_HOUR', 'Last Hour Credit Bureau Enquiry')
ax.set_xticklabels([0,1,2,3,4])
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Hour')
plt.show()
In [ ]:
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_HOUR', 'Last Hour Credit Bureau Enquiry')
ax.set_xticklabels([0,1,2,3,4])
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Hour')
plt.show()

Assessment:

  • Zero credit bureau enquiries were made by most of the clients in the last hour.
  • Clients who made 2 credit bureau enquiries in the last hour had most difficulties with the payment.

Feature: 'AMT_REQ_CREDIT_BUREAU_DAY'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
117 clients AMT_REQ_CREDIT_BUREAU_DAY Number of enquiries to Credit Bureau about the client one day before application (excluding one hour before application) NaN
In [ ]:
clients.AMT_REQ_CREDIT_BUREAU_DAY = pd.cut(clients.AMT_REQ_CREDIT_BUREAU_DAY, bins=[0.0,1.0,4.0,10.0], labels=['No','Low','High'], right=False)
In [ ]:
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_DAY', 'Last Day Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Day')
plt.show()
In [ ]:
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_DAY', 'Last Day Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Day')
plt.show()

Assessment:

  • Zero credit bureau enquiries were made by most of the clients in the last day.
  • Clients who made 1-3 credit bureau enquiries in the last day had most difficulties with the payment.

Feature: 'AMT_REQ_CREDIT_BUREAU_WEEK'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
118 clients AMT_REQ_CREDIT_BUREAU_WEEK Number of enquiries to Credit Bureau about the client one week before application (excluding one day before application) NaN
In [ ]:
clients.AMT_REQ_CREDIT_BUREAU_WEEK = pd.cut(clients.AMT_REQ_CREDIT_BUREAU_WEEK, bins=[0.0,1.0,4.0,10.0], labels=['No','Low','High'], right=False)
In [ ]:
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_WEEK', 'Last Week Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Week')
plt.show()
In [ ]:
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_WEEK', 'Last Week Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Week')
plt.show()

Assessment:

  • Zero credit bureau enquiries were made by most of the clients in the last week.
  • No. of enquiries made in the last week doesn't tell anything strong about payment difficulty.

Feature: 'AMT_REQ_CREDIT_BUREAU_MON'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
119 clients AMT_REQ_CREDIT_BUREAU_MON Number of enquiries to Credit Bureau about the client one month before application (excluding one week before application) NaN
In [ ]:
clients.AMT_REQ_CREDIT_BUREAU_MON = pd.cut(clients.AMT_REQ_CREDIT_BUREAU_MON, bins=[0.0,1.0,6.0,11.0,16.0,30.0], labels=['No','Low', 'Mid', 'High', 'Very High'], right=False)
In [ ]:
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_MON', 'Last Month Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Month')
plt.show()
In [ ]:
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_MON', 'Last Month Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Month')
plt.show()

Assessment:

  • Zero credit bureau enquiries were made by most of the clients in the last month.
  • Clients who made 16-30 credit bureau enquiries in the last month had least difficulties with the payment.

Feature: 'AMT_REQ_CREDIT_BUREAU_QRT'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
120 clients AMT_REQ_CREDIT_BUREAU_QRT Number of enquiries to Credit Bureau about the client 3 month before application (excluding one month before application) NaN
In [ ]:
clients.AMT_REQ_CREDIT_BUREAU_QRT = pd.cut(clients.AMT_REQ_CREDIT_BUREAU_QRT, bins=[0.0,1.0,6.0,11.0,20.0,500.0], labels=['No','Low', 'Mid', 'High', 'Very High'], right=False)
In [ ]:
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_QRT', 'Last Quarter Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Quarter')
plt.show()
In [ ]:
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_QRT', 'Last Quarter Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Quarter')
plt.show()

Assessment:

  • Zero credit bureau enquiries were made by most of the clients in the last quarter.
  • Clients who made 6-10 credit bureau enquiries in the last quarter had most difficulties with the payment.

Feature: 'AMT_REQ_CREDIT_BUREAU_YEAR'¶

In [ ]:
clients_cols.describe_next()
Out[ ]:
Table Column Description Special
121 clients AMT_REQ_CREDIT_BUREAU_YEAR Number of enquiries to Credit Bureau about the client one day year (excluding last 3 months before application) NaN
In [ ]:
clients.AMT_REQ_CREDIT_BUREAU_YEAR = pd.cut(clients.AMT_REQ_CREDIT_BUREAU_YEAR, bins=[0.0,1.0,6.0,11.0,16.0,30.0], labels=['No','Low', 'Mid', 'High', 'Very High'], right=False)
In [ ]:
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_YEAR', 'Last Year Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Year')
plt.show()
In [ ]:
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_YEAR', 'Last Year Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Year')
plt.show()

Assessment:

  • 1 - 5 credit bureau enquiries have been caused by most clients in the last year.
  • Yearly credit bureau enquiries don't tell anything strongly about payment difficulties, but those who haven't made credit enquiries have little bit less difficulty with payment.

Features of Loans¶

In [ ]:
# Helper function for Categorical Univariate Analysis
def loans_cat_uni_analysis(col, friendly_name):
  tmp_df = loans[col].value_counts(normalize=True).mul(100).rename('Percent').reset_index().rename(columns={'index': col})
  ax = sns.barplot(data=tmp_df, x=col, y='Percent')
  for p in ax.patches:
    txt = str(p.get_height().round(2)) + '%'
    txt_x = p.get_x() + (p.get_width()/2)
    txt_y = p.get_height()
    ax.annotate(txt, (txt_x, txt_y), size=11, ha='center', va='bottom')
  ax.set_title(f'{friendly_name} Univariate Analysis (Normalised)', fontsize=20)
  ax.set_xlabel(friendly_name)
  ax.figure.set_size_inches(16,8)
  return ax
In [ ]:
# Helper function for Categorical Segmented Univariate Analysis
def loans_cat_seg_analysis(col, friendly_name):
  tmp_df = loans.groupby(col)['NAME_CONTRACT_STATUS'].value_counts(normalize=True).mul(100).rename('Percent').reset_index()
  ax = sns.barplot(data=tmp_df, x=col, y='Percent', hue='NAME_CONTRACT_STATUS')
  for p in ax.patches:
    txt = str(p.get_height().round(2)) + '%'
    txt_x = p.get_x() + (p.get_width()/2)
    txt_y = p.get_height()
    ax.annotate(txt, (txt_x, txt_y), size=11, ha='center', va='bottom')
  ax.set_title(f'{friendly_name} Segmented Univariate Analysis (Normalised)', fontsize=20)
  ax.set_xlabel(friendly_name)
  ax.figure.set_size_inches(16,8)
  return ax

Feature: 'SK_ID_CURR'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
123 loans SK_ID_CURR ID of loan in our sample hashed
In [ ]:
clients_cols.describe('SK_ID_CURR')
Out[ ]:
Table Column Description Special
0 clients SK_ID_CURR ID of loan in our sample NaN

Assessment:

  • ID of records.
  • Can be used to merge clients and loans data frames.

Feature: 'NAME_CONTRACT_TYPE'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
124 loans NAME_CONTRACT_TYPE Contract product type (Cash loan, consumer loan [POS] ,...) of the previous application NaN
In [ ]:
loans.NAME_CONTRACT_TYPE.value_counts()
Out[ ]:
Cash loans         747553
Consumer loans     729151
Revolving loans    193164
XNA                   346
Name: NAME_CONTRACT_TYPE, dtype: int64
In [ ]:
# Converting 'XNA' to NaN
loans.NAME_CONTRACT_TYPE = loans.NAME_CONTRACT_TYPE.replace('XNA', np.NaN)
In [ ]:
loans_cat_uni_analysis('NAME_CONTRACT_TYPE', 'Loan Type')
plt.show()
In [ ]:
loans_cat_seg_analysis('NAME_CONTRACT_TYPE', 'Loan Type')
plt.show()

Assessment:

  • Cash loan and consumer loan applications are way more than revolving loan applications.
  • Consumer loans get approved most and cancelled least.
  • Revolving loans get refused most.

Feature: 'AMT_ANNUITY'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
125 loans AMT_ANNUITY Annuity of previous application NaN
In [ ]:
ax = sns.boxplot(x = loans.AMT_ANNUITY)
ax.set_xlabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
loans_annuity_minus_outliers = loans.loc[loans.AMT_ANNUITY<220000,:].AMT_ANNUITY
ax = sns.boxplot(x = loans_annuity_minus_outliers)
ax.set_xticks(list(loans_annuity_minus_outliers.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xticklabels(ax.get_xticks(), rotation=45)
ax.set_xlabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.boxplot(data=loans.loc[loans.AMT_ANNUITY<220000,:], x='NAME_CONTRACT_STATUS', y='AMT_ANNUITY')
ax.set_ylabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
In [ ]:
ax = sns.histplot(data=loans.loc[loans.AMT_ANNUITY<220000,:], x='AMT_ANNUITY', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.histplot(data=loans.loc[loans.AMT_ANNUITY<220000,:], x='AMT_ANNUITY', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()

Assessment:

  • Loan annuity median is 11250 which very low from the max which goes beyond 300,000.
  • Higher loan annuity get higher refusal.

Feature: 'AMT_APPLICATION'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
126 loans AMT_APPLICATION For how much credit did client ask on the previous application NaN
In [ ]:
ax = sns.boxplot(x = loans.AMT_APPLICATION)
ax.set_xlabel('Application Amount')
ax.set_title('Application Amount Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
loans_application_minus_outliers = loans.loc[loans.AMT_APPLICATION<3000000,:]
ax = sns.boxplot(x = loans_application_minus_outliers.AMT_APPLICATION)
ax.set_xlabel('Application Amount')
ax.set_title('Application Amount Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.boxplot(data=loans_application_minus_outliers, x='NAME_CONTRACT_STATUS', y='AMT_APPLICATION')
ax.set_ylabel('Application Amount')
ax.set_title('Application Amount Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
In [ ]:
ax = sns.histplot(data=loans_application_minus_outliers, x='AMT_APPLICATION', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Application Amount')
ax.set_title('Application Amount Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.histplot(data=loans_application_minus_outliers, x='AMT_APPLICATION', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Application Amount')
ax.set_title('Application Amount Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()

Assessment:

  • Most of the asking amount is at lower end.
  • Higher asking amount often gets rejected.

Feature: 'AMT_CREDIT'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
127 loans AMT_CREDIT Final credit amount on the previous application. This differs from AMT_APPLICATION in a way that the AMT_APPLICATION is the amount for which the client initially applied for, but during our approval process he could have received different amount - AMT_CREDIT NaN
In [ ]:
ax = sns.boxplot(x = loans.AMT_CREDIT)
ax.set_xlabel('Loan CREDIT')
ax.set_title('Loan CREDIT Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
loans_credit_minus_outliers = loans.loc[loans.AMT_CREDIT<350000,:]
ax = sns.boxplot(x = loans_credit_minus_outliers.AMT_CREDIT)
ax.set_xticks(list(loans_credit_minus_outliers.AMT_CREDIT.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Loan CREDIT')
ax.set_title('Loan CREDIT Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.boxplot(data=loans_credit_minus_outliers, x='NAME_CONTRACT_STATUS', y='AMT_CREDIT')
ax.set_ylabel('Loan Credit')
ax.set_title('Loan Credit Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
In [ ]:
ax = sns.histplot(data=loans_application_minus_outliers, x='AMT_CREDIT', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Loan Credit')
ax.set_title('Loan Credit Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.histplot(data=loans_application_minus_outliers, x='AMT_CREDIT', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Loan Credit')
ax.set_title('Loan Credit Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()

Assessment:

  • Lower credit loan applications are more in the system.
  • Higher credit loan applications have more chance of refusal and less chance of approval.

Feature: 'NAME_CONTRACT_STATUS'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
139 loans NAME_CONTRACT_STATUS Contract status (approved, cancelled, ...) of previous application NaN
In [ ]:
ax = sns.countplot(x=loans.NAME_CONTRACT_STATUS)
ax.set_title('Target Variable Data Imbalance', fontsize=20)
ax.figure.set_size_inches(12,8)
plt.show()

Assessment:

  • NAME_CONTRACT_STATUS is a target variable.
  • Most of the loan applications get approved.
  • Here, data imbalance is in the favour of the bank.

Feature: 'NAME_PAYMENT_TYPE'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
141 loans NAME_PAYMENT_TYPE Payment method that client chose to pay for the previous application NaN
In [ ]:
# Renaming segments
loans.NAME_PAYMENT_TYPE = loans.NAME_PAYMENT_TYPE.replace('XNA', np.NaN).replace('Cash through the bank', 'Cash').replace('Non-cash from your account', 'Account Debit').replace('Cashless from the account of the employer', 'Employer')
In [ ]:
loans_cat_uni_analysis('NAME_PAYMENT_TYPE', 'Loan Payment')
plt.show()
In [ ]:
loans_cat_seg_analysis('NAME_PAYMENT_TYPE', 'Loan Payment')
plt.show()

Assessment:

  • Most of the loan payments are made through cash.
  • Direct account debit has slightly higher approval rate.

Feature: 'NAME_CLIENT_TYPE'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
144 loans NAME_CLIENT_TYPE Was the client old or new client when applying for the previous application NaN
In [ ]:
# Fixing values
loans.NAME_CLIENT_TYPE = loans.NAME_CLIENT_TYPE.replace('XNA', np.NaN).replace(['Repeater', 'Refreshed'], 'Old')
In [ ]:
loans_cat_uni_analysis('NAME_CLIENT_TYPE', 'Client Type')
plt.show()
In [ ]:
loans_cat_seg_analysis('NAME_CLIENT_TYPE', 'Client Type')
plt.show()

Assessment:

  • Most of the loan applications come from old customers.
  • Loan applications from new customers get approved most and refused less.

Feature: 'NAME_GOODS_CATEGORY'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
145 loans NAME_GOODS_CATEGORY What kind of goods did the client apply for in the previous application NaN
In [ ]:
# Replacing XNA with NaN
loans.NAME_GOODS_CATEGORY = loans.NAME_GOODS_CATEGORY.replace('XNA', np.NaN)
In [ ]:
ax = loans_cat_uni_analysis('NAME_GOODS_CATEGORY', 'Goods')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()
In [ ]:
ax = loans_cat_seg_analysis('NAME_GOODS_CATEGORY', 'Goods')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()

Assessment:

  • Loans applications are submitted most for mobiles, consumer elctronics and computers.
  • Loan applications for mobile gets cancelled most.
  • Loan applications for fitness gets approved most.

Feature: 'CHANNEL_TYPE'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
148 loans CHANNEL_TYPE Through which channel we acquired the client on the previous application NaN
In [ ]:
loans_cat_uni_analysis('CHANNEL_TYPE', 'Acquision Channel')
plt.show()
In [ ]:
ax = loans_cat_seg_analysis('CHANNEL_TYPE', 'Acquision Channel')
ax.figure.set_size_inches(24,12)
plt.show()

Assessment:

  • Credit and cash offices bring most loan applications.
  • Loan applications brought by channel of corporate sales get refused most.
  • Loan applications brought by contact centers and credit/cash offices get cancelled most.
  • Loan applications brought by Regional/local & stone channels get approved most.

Feature: 'CNT_PAYMENT'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
151 loans CNT_PAYMENT Term of previous credit at application of the previous application NaN
In [ ]:
ax = sns.boxplot(x = loans.CNT_PAYMENT)
ax.set_xticks(list(loans.CNT_PAYMENT.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Loan Term')
ax.set_title('Loan Term Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.boxplot(data=loans, x='NAME_CONTRACT_STATUS', y='CNT_PAYMENT')
ax.set_ylabel('Loan Terms')
ax.set_title('Loan Terms Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
In [ ]:
ax = sns.histplot(data=loans, x='CNT_PAYMENT', hue='NAME_CONTRACT_STATUS', bins=15, stat='percent', multiple='stack')
ax.set_xlabel('Loan Terms')
ax.set_title('Loan Terms Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.histplot(data=loans, x='CNT_PAYMENT', hue='NAME_CONTRACT_STATUS', bins=15, stat='percent', multiple='fill')
ax.set_xlabel('Loan Terms')
ax.set_title('Loan Terms Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()

Assessment:

  • Most applications are there for smaller terms. Median is 12.
  • Higher term applications get refused as well as cancelled most.

Feature: 'NAME_YIELD_GROUP'¶

In [ ]:
loans_cols.describe_next()
Out[ ]:
Table Column Description Special
152 loans NAME_YIELD_GROUP Grouped interest rate into small medium and high of the previous application grouped
In [ ]:
loans.NAME_YIELD_GROUP = loans.NAME_YIELD_GROUP.replace('XNA', np.NaN)
In [ ]:
# Merging low_normal & low_action segments
loans.NAME_YIELD_GROUP = loans.NAME_YIELD_GROUP.replace(['low_normal', 'low_action'], 'low')
In [ ]:
loans_cat_uni_analysis('NAME_YIELD_GROUP', 'Interest Rate')
plt.show()
In [ ]:
loans_cat_seg_analysis('NAME_YIELD_GROUP', 'Interest Rate')
plt.show()

Assessment:

  • Lower interest rate gets most applications.
  • Higher interest rate applications get approved most.

Bivariate Analysis¶

Feature Sets from Clients¶

Set 1:¶

  • AMT_CREDIT
  • AMT_INCOME
  • CODE_GENDER
  • AGE_BUCKET
  • NAME_EDUCATION_TYPE
In [ ]:
ax = sns.scatterplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET')
ax.set_xlabel('Income')
ax.set_ylabel('Loan Credit')
ax.set_title('Income vs Loan Credit', fontsize=20)
ax.legend(['Other Cases', 'Payment Difficulties'])
ax.figure.set_size_inches(12,12)
plt.show()
In [ ]:
ax = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='CODE_GENDER')
ax.figure.set_size_inches(24,12)
plt.show()
In [ ]:
ax = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='AGE_BUCKET', col_wrap=3)
ax.figure.set_size_inches(24,16)
plt.show()
In [ ]:
ax = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='NAME_EDUCATION_TYPE', col_wrap=3)
ax.figure.set_size_inches(24,16)
plt.show()

Assessment:

  • Clients with academic degree have less payment difficulties.
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<1000000,:], x='CODE_GENDER', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Gender', fontsize=20)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<1000000,:], x='AGE_BUCKET', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Age Group', fontsize=20)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<1000000,:], x='NAME_EDUCATION_TYPE', y='AMT_CREDIT', hue='TARGET', order=['Lower secondary', 'Secondary / secondary special', 'Incomplete higher', 'Higher education', 'Academic degree'])
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Education Level', fontsize=20)
plt.show()

Assessment:

  • Clients with academic degress have payment difficulty on higher loan credit.
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='CODE_GENDER', y='AMT_INCOME_TOTAL', hue='TARGET')
ax.set_title('Income vs Gender', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()

Assessment:

  • Males have more income than females.
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AGE_BUCKET', y='AMT_INCOME_TOTAL', hue='TARGET')
ax.set_title('Income vs Age Group', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()

Assessment:

  • Clients having 60-70 age have lowest income and get payment difficulty with it.
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='NAME_EDUCATION_TYPE', y='AMT_INCOME_TOTAL', hue='TARGET', order=['Lower secondary', 'Secondary / secondary special', 'Incomplete higher', 'Higher education', 'Academic degree'])
ax.set_title('Income vs Education Level', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()

Assessment:

  • Clients with higher education level have higher income.

Set 2:¶

  • AMT_CREDIT
  • AMT_INCOME
  • FLAG_OWN_CAR
  • CNT_CHILDREN
  • EMPLOYED_EXPERIENCE
In [ ]:
g = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET')
g.figure.set_size_inches(8,8)
plt.show()
In [ ]:
g = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='FLAG_OWN_CAR')
g.figure.set_size_inches(12,8)
plt.show()
In [ ]:
g = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='CNT_CHILDREN', col_wrap=3)
g.figure.set_size_inches(24,16)
plt.show()
In [ ]:
g = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='EMPLOYED_EXPERIENCE', col_wrap=3)
g.figure.set_size_inches(24,16)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='FLAG_OWN_CAR', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Car Ownership', fontsize=20)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='CNT_CHILDREN', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Children Count', fontsize=20)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='EMPLOYED_EXPERIENCE', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Work Experience', fontsize=20)
plt.show()

Assessment:

  • Those who have 20+ years of work experience get payment difficulty mostly when loan credit amount is high.
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='FLAG_OWN_CAR', y='AMT_INCOME_TOTAL', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Income vs Car Ownership', fontsize=20)
plt.show()

Assessment:

  • Those having car have higher income.
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='CNT_CHILDREN', y='AMT_INCOME_TOTAL', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Income vs Children Count', fontsize=20)
plt.show()

Assessment:

  • Those who have 6-10 children and have payment difficulty, also have higher income.
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='EMPLOYED_EXPERIENCE', y='AMT_INCOME_TOTAL', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Income vs Work Experience', fontsize=20)
plt.show()

Assessment:

  • Those who have 20+ years of experience have less payment difficulties with less income.

Set 3:¶

  • AMT_CREDIT
  • REGION_POPULATION_RELATIVE
  • FLAG_OWN_CAR
  • NAME_FAMILY_STATUS
  • NAME_CONTRACT_TYPE
In [ ]:
ax = sns.scatterplot(data=clients, x='REGION_POPULATION_RELATIVE', y='AMT_CREDIT', hue='TARGET')
ax.set_xlabel('Region Population')
ax.set_ylabel('Loan Credit')
ax.set_title('Loan Credit vs Region Population', fontsize=20)
ax.legend(['Other Cases', 'Payment Difficulties'])
ax.figure.set_size_inches(12,12)
plt.show()
/Users/Sachin/opt/miniconda3/envs/UpGrad/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Creating legend with loc="best" can be slow with large amounts of data.
  fig.canvas.print_figure(bytes_io, **kw)
In [ ]:
ax = sns.relplot(data=clients, x='REGION_POPULATION_RELATIVE', y='AMT_CREDIT', hue='TARGET', col='FLAG_OWN_CAR', col_wrap=3)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.relplot(data=clients, x='REGION_POPULATION_RELATIVE', y='AMT_CREDIT', hue='TARGET', col='NAME_FAMILY_STATUS', col_wrap=3)
ax.figure.set_size_inches(24,16)
plt.show()
In [ ]:
ax = sns.relplot(data=clients, x='REGION_POPULATION_RELATIVE', y='AMT_CREDIT', hue='TARGET', col='NAME_CONTRACT_TYPE', col_wrap=3)
ax.figure.set_size_inches(24,8)
plt.show()

Assessment:

  • For revolving loans, loan credit amount is less.
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='NAME_CONTRACT_TYPE', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Loan Types', fontsize=20)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='FLAG_OWN_CAR', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Car Ownership', fontsize=20)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='NAME_FAMILY_STATUS', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Relationship Status', fontsize=20)
plt.show()

Set 4:¶

  • AMT_INCOME_TOTAL
  • AMT_CREDIT
  • REGION_POPULATION_RELATIVE
  • AGE
  • YEARS_EMPLOYED
In [ ]:
g = sns.pairplot(clients.loc[clients.AMT_INCOME_TOTAL<300000,:], hue='TARGET', vars=['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'REGION_POPULATION_RELATIVE', 'AGE', 'YEARS_EMPLOYED'])
g.figure.set_size_inches(24,24)
plt.show()

Set 5:¶

  • AMT_REQ_CREDIT_BUREAU_YEAR
  • NAME_FAMILY_STATUS
  • CODE_GENDER
  • AMT_ANNUITY
  • AMT_GOODS_PRICE
In [ ]:
ax = sns.scatterplot(data=clients, x='AMT_ANNUITY', y='AMT_GOODS_PRICE', hue='TARGET')
ax.set_xlabel('Loan Annuity')
ax.set_ylabel('Goods Price')
ax.set_title('Loan Annuity vs Goods Price', fontsize=20)
ax.legend(['Other Cases', 'Payment Difficulties'])
ax.figure.set_size_inches(12,12)
plt.show()

Assessment:

  • For higher loan annuity and goods price, payment difficulty is less.
In [ ]:
ax = sns.relplot(data=clients, x='AMT_ANNUITY', y='AMT_GOODS_PRICE', hue='TARGET', col='NAME_FAMILY_STATUS', col_wrap=3)
ax.figure.set_size_inches(24,16)
plt.show()
In [ ]:
ax = sns.relplot(data=clients, x='AMT_ANNUITY', y='AMT_GOODS_PRICE', hue='TARGET', col='CODE_GENDER', col_wrap=3)
ax.figure.set_size_inches(24,8)
plt.show()
In [ ]:
ax = sns.relplot(data=clients, x='AMT_ANNUITY', y='AMT_GOODS_PRICE', hue='TARGET', col='AMT_REQ_CREDIT_BUREAU_YEAR', col_wrap=3)
ax.figure.set_size_inches(24,16)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_ANNUITY<150000,:], x='CODE_GENDER', y='AMT_ANNUITY', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Annuity vs Gender', fontsize=20)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_ANNUITY<150000,:], x='NAME_FAMILY_STATUS', y='AMT_ANNUITY', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Annuity vs Relationship Status', fontsize=20)
plt.show()
In [ ]:
ax = sns.boxplot(data=clients.loc[clients.AMT_ANNUITY<150000,:], x='AMT_REQ_CREDIT_BUREAU_YEAR', y='AMT_ANNUITY', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Annuity vs Credit Bureau Enquiry', fontsize=20)
plt.show()

Feature Sets from Loans¶

Set 1:¶

  • AMT_CREDIT
  • CNT_PAYMENT
  • NAME_CONTRACT_TYPE
  • NAME_CLIENT_TYPE
  • NAME_YIELD_GROUP
In [ ]:
ax = sns.scatterplot(data=loans, x='CNT_PAYMENT', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS')
ax.set_xlabel('Loan Terms')
ax.set_ylabel('Loan Credit')
ax.set_title('Loan Terms vs Loan Credit', fontsize=20)
ax.figure.set_size_inches(12,12)
plt.show()

Assessment 1:

  • When loan terms and loan credits are low, approval rates are high.
  • When loan term and/or loan credits are high, refusal rates are high.
In [ ]:
ax = sns.relplot(data=loans, x='CNT_PAYMENT', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS', col='NAME_CLIENT_TYPE')
ax.figure.set_size_inches(24,12)
plt.show()

Assessment 2:

  • For big loan credit and big loan term, new clients get approved more than old clients.
In [ ]:
ax = sns.relplot(data=loans, x='CNT_PAYMENT', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS', col='NAME_YIELD_GROUP')
ax.figure.set_size_inches(24,12)
plt.show()

Assessment 3:

  • Higher interest rate doesn't attract big loan credit or big loan term.
In [ ]:
ax = sns.boxplot(data=loans_credit_minus_outliers, x='NAME_CONTRACT_STATUS', y='AMT_CREDIT')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Loan Status', fontsize=20)
plt.show()

Assessment 4:

  • The unused offers have mostly lower loan credit.
In [ ]:
ax = sns.boxplot(data=loans_credit_minus_outliers, x='NAME_CLIENT_TYPE', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Client Type', fontsize=20)
plt.show()

Assessment 5:

  • New clients mostly get approved on lower loan credit than that of old clients.
In [ ]:
ax = sns.boxplot(data=loans_credit_minus_outliers, x='NAME_YIELD_GROUP', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Interest Rate', fontsize=20)
plt.show()

Assessment 6:

  • High interest rate sees approval mostly in lower loan credit.
In [ ]:
ax = sns.boxplot(data=loans_credit_minus_outliers, x='NAME_CONTRACT_TYPE', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Contract Type', fontsize=20)
plt.show()

Assessment 7:

  • Cash loans mostly get approved for higher loan credit.
In [ ]:
ax = sns.boxplot(data=loans, x='NAME_CONTRACT_STATUS', y='CNT_PAYMENT')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Terms vs Loan Status', fontsize=20)
plt.show()

Assessment 8:

  • Most of the approved loan application have lower loan terms.
  • Most of the cancelled loan application have higher loan terms.
In [ ]:
ax = sns.boxplot(data=loans, x='NAME_CLIENT_TYPE', y='CNT_PAYMENT', hue='NAME_CONTRACT_STATUS')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Terms vs Client Type', fontsize=20)
plt.show()

Assessment 9:

  • Old clients who cancel their applications mostly have higher terms.

Analysis on Merged Data¶

In [ ]:
# Merging clients and loans data frames
merged = pd.merge(left=clients[['SK_ID_CURR','TARGET']], right=loans[['SK_ID_CURR', 'AMT_CREDIT', 'CNT_PAYMENT', 'NAME_CONTRACT_TYPE', 'NAME_CLIENT_TYPE', 'NAME_YIELD_GROUP']], how='inner', on='SK_ID_CURR')
merged.head()
Out[ ]:
SK_ID_CURR TARGET AMT_CREDIT CNT_PAYMENT NAME_CONTRACT_TYPE NAME_CLIENT_TYPE NAME_YIELD_GROUP
0 100002 1 179055.0 24.0 Consumer loans New low
1 100003 0 1035882.0 12.0 Cash loans Old low
2 100003 0 348637.5 6.0 Consumer loans Old middle
3 100003 0 68053.5 12.0 Consumer loans Old middle
4 100004 0 20106.0 4.0 Consumer loans New middle
In [ ]:
ax = sns.scatterplot(data=merged, x='CNT_PAYMENT', y='AMT_CREDIT', hue='TARGET')
ax.set_xlabel('Loan Terms')
ax.set_ylabel('Loan Credit')
ax.set_title('Loan Terms vs Loan Credit', fontsize=20)
ax.figure.set_size_inches(12,12)
plt.show()

Assessment 1:

  • Clients with very high loan terms have payment difficulties.
In [ ]:
ax = sns.relplot(data=merged, x='CNT_PAYMENT', y='AMT_CREDIT', hue='TARGET', col='NAME_CLIENT_TYPE')
ax.figure.set_size_inches(24,12)
plt.show()

Assessment 2:

  • New clients don't have payment difficulties with high loan terms.
In [ ]:
ax = sns.relplot(data=merged, x='CNT_PAYMENT', y='AMT_CREDIT', hue='TARGET', col='NAME_YIELD_GROUP')
ax.figure.set_size_inches(24,12)
plt.show()

Assessment 3:
Interest rate grouping doesn't tell anything strong about payment difficulties.

In [ ]:
merged_credit_minus_outliers = merged.loc[merged.AMT_CREDIT<350000,:]
ax = sns.boxplot(data=merged_credit_minus_outliers, x='TARGET', y='AMT_CREDIT')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Payment Difficulty', fontsize=20)
plt.show()

Assessment 4:

  • Credit amount doesn't tell anything strong about payment difficulty.
In [ ]:
ax = sns.boxplot(data=merged_credit_minus_outliers, x='NAME_CLIENT_TYPE', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Client Type', fontsize=20)
plt.show()

Assessment 5:

  • New clients who have payment difficulties operates on low credit amount spread than that of old clients.
In [ ]:
ax = sns.boxplot(data=merged_credit_minus_outliers, x='NAME_YIELD_GROUP', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Interest Rate', fontsize=20)
plt.show()

Assessment 6:

  • In the high interest group, clients with payment difficulties have taken higher loan credit.
In [ ]:
ax = sns.boxplot(data=merged_credit_minus_outliers, x='NAME_CONTRACT_TYPE', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Contract Type', fontsize=20)
plt.show()

Assessment 7:

  • In the case of consumer loans, clients having payment difficulties have taken lower loan credit than the other cases.
  • In the case of revolving loans, more clients are in the higher loan credit side who had payment difficulties.
In [ ]:
ax = sns.boxplot(data=merged, x='TARGET', y='CNT_PAYMENT')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Terms vs Payment Difficulty', fontsize=20)
plt.show()

Assessment 8:

  • This analysis doesn't tell anything special about certain loan terms causing more payment difficulties.
In [ ]:
ax = sns.boxplot(data=merged, x='NAME_CLIENT_TYPE', y='CNT_PAYMENT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Terms vs Client Type', fontsize=20)
plt.show()

Assessment 9:

  • New clients who have payment difficulties mostly have lesser loan terms than that of old clients.

Correlations¶

Clients¶

In [ ]:
ax = sns.heatmap(clients[['TARGET', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'AGE', 'YEARS_EMPLOYED']].corr(), cmap='RdYlGn', annot=True)
ax.figure.set_size_inches(24,12)
plt.show()

Loans¶

In [ ]:
ax = sns.heatmap(loans[['AMT_APPLICATION', 'AMT_CREDIT', 'AMT_ANNUITY', 'CNT_PAYMENT']].corr(), cmap='RdYlGn', annot=True)
ax.figure.set_size_inches(24,12)
plt.show()

Assessment:

  • AMT_CREDIT has strong correlation with AMT_GOODS_PRICE, AMT_APPLICATION and AMT_ANNUITY.
  • AGE & YEARS_EMPLOYED have moderate correlation.