Sachin Shekhar
SachinShekhar@Outlook.com
# Importing essential libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Settings for the workspace
pd.set_option('max_colwidth', 500)
# Loading 'application_data.csv'
clients = pd.read_csv('application_data.csv')
clients.head()
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
clients.shape
(307511, 122)
# Loading 'previous_application.csv'
loans = pd.read_csv('previous_application.csv')
loans.head()
| SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | ... | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2030495 | 271877 | Consumer loans | 1730.430 | 17145.0 | 17145.0 | 0.0 | 17145.0 | SATURDAY | 15 | ... | Connectivity | 12.0 | middle | POS mobile with interest | 365243.0 | -42.0 | 300.0 | -42.0 | -37.0 | 0.0 |
| 1 | 2802425 | 108129 | Cash loans | 25188.615 | 607500.0 | 679671.0 | NaN | 607500.0 | THURSDAY | 11 | ... | XNA | 36.0 | low_action | Cash X-Sell: low | 365243.0 | -134.0 | 916.0 | 365243.0 | 365243.0 | 1.0 |
| 2 | 2523466 | 122040 | Cash loans | 15060.735 | 112500.0 | 136444.5 | NaN | 112500.0 | TUESDAY | 11 | ... | XNA | 12.0 | high | Cash X-Sell: high | 365243.0 | -271.0 | 59.0 | 365243.0 | 365243.0 | 1.0 |
| 3 | 2819243 | 176158 | Cash loans | 47041.335 | 450000.0 | 470790.0 | NaN | 450000.0 | MONDAY | 7 | ... | XNA | 12.0 | middle | Cash X-Sell: middle | 365243.0 | -482.0 | -152.0 | -182.0 | -177.0 | 1.0 |
| 4 | 1784265 | 202054 | Cash loans | 31924.395 | 337500.0 | 404055.0 | NaN | 337500.0 | THURSDAY | 9 | ... | XNA | 24.0 | high | Cash Street: high | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 37 columns
loans.shape
(1670214, 37)
# Loading 'columns_description.csv'
# Loading data dictionary for easy exploration of features
data_dict = pd.read_csv('columns_description.csv', encoding='cp1252')
data_dict.head()
| Unnamed: 0 | Table | Row | Description | Special | |
|---|---|---|---|---|---|
| 0 | 1 | application_data | SK_ID_CURR | ID of loan in our sample | NaN |
| 1 | 2 | application_data | TARGET | Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases) | NaN |
| 2 | 5 | application_data | NAME_CONTRACT_TYPE | Identification if loan is cash or revolving | NaN |
| 3 | 6 | application_data | CODE_GENDER | Gender of the client | NaN |
| 4 | 7 | application_data | FLAG_OWN_CAR | Flag if the client owns a car | NaN |
# Minor aethetic changes to data dictionary
data_dict['Table'] = data_dict['Table'].replace('application_data', 'clients')
data_dict['Table'] = data_dict['Table'].replace('previous_application.csv', 'loans')
data_dict = data_dict.iloc[:, 1:]
data_dict.rename(columns={'Row': 'Column'}, inplace=True)
data_dict.head()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 0 | clients | SK_ID_CURR | ID of loan in our sample | NaN |
| 1 | clients | TARGET | Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases) | NaN |
| 2 | clients | NAME_CONTRACT_TYPE | Identification if loan is cash or revolving | NaN |
| 3 | clients | CODE_GENDER | Gender of the client | NaN |
| 4 | clients | FLAG_OWN_CAR | Flag if the client owns a car | NaN |
# Add easy way to access column descriptions
def describe(table, col):
return data_dict[(data_dict['Table'] == table) & (data_dict['Column'] == col)]
class Table:
def __init__(self, table_name):
self.table_name = table_name
self.columns = []
self.col_index = -1
if table_name == 'clients':
self.columns = clients.columns
elif table_name == 'loans':
self.columns = loans.columns
def describe_next(self):
self.col_index += 1
return describe(self.table_name, self.columns[self.col_index])
def describe(self, col_name):
return describe(self.table_name, col_name)
# Checking for duplicate rows
clients.duplicated().sum(), loans.duplicated().sum()
(0, 0)
# Checking missing values from clients data frame
clients_missing = clients.isnull().mean() * 100
clients_missing[clients_missing > 0]
AMT_ANNUITY 0.003902
AMT_GOODS_PRICE 0.090403
NAME_TYPE_SUITE 0.420148
OWN_CAR_AGE 65.990810
OCCUPATION_TYPE 31.345545
...
AMT_REQ_CREDIT_BUREAU_DAY 13.501631
AMT_REQ_CREDIT_BUREAU_WEEK 13.501631
AMT_REQ_CREDIT_BUREAU_MON 13.501631
AMT_REQ_CREDIT_BUREAU_QRT 13.501631
AMT_REQ_CREDIT_BUREAU_YEAR 13.501631
Length: 67, dtype: float64
# Checking columns which have more than 35% missing values
clients_missing[clients_missing > 35].index
Index(['OWN_CAR_AGE', 'EXT_SOURCE_1', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG',
'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG',
'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG',
'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG',
'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE',
'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE',
'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE',
'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',
'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI',
'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI',
'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI',
'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
'TOTALAREA_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
dtype='object')
After consulting data dictionary, the columns having more than 35% missing values don't seem important for analysis. So, I am dropping them.
clients.drop(clients_missing[clients_missing > 35].index, axis=1, inplace=True)
clients.shape
(307511, 73)
# Checking missing values from loans data frame
loans_missing = loans.isnull().mean() * 100
loans_missing[loans_missing > 0]
AMT_ANNUITY 22.286665 AMT_CREDIT 0.000060 AMT_DOWN_PAYMENT 53.636480 AMT_GOODS_PRICE 23.081773 RATE_DOWN_PAYMENT 53.636480 RATE_INTEREST_PRIMARY 99.643698 RATE_INTEREST_PRIVILEGED 99.643698 NAME_TYPE_SUITE 49.119754 CNT_PAYMENT 22.286366 PRODUCT_COMBINATION 0.020716 DAYS_FIRST_DRAWING 40.298129 DAYS_FIRST_DUE 40.298129 DAYS_LAST_DUE_1ST_VERSION 40.298129 DAYS_LAST_DUE 40.298129 DAYS_TERMINATION 40.298129 NFLAG_INSURED_ON_APPROVAL 40.298129 dtype: float64
# Checking columns which have more than 35% missing values
loans_missing[loans_missing > 35]
AMT_DOWN_PAYMENT 53.636480 RATE_DOWN_PAYMENT 53.636480 RATE_INTEREST_PRIMARY 99.643698 RATE_INTEREST_PRIVILEGED 99.643698 NAME_TYPE_SUITE 49.119754 DAYS_FIRST_DRAWING 40.298129 DAYS_FIRST_DUE 40.298129 DAYS_LAST_DUE_1ST_VERSION 40.298129 DAYS_LAST_DUE 40.298129 DAYS_TERMINATION 40.298129 NFLAG_INSURED_ON_APPROVAL 40.298129 dtype: float64
After consulting data dictionary, the columns having more than 35% missing values don't seem important for analysis. So, I am dropping them.
loans.drop(loans_missing[loans_missing > 35].index, axis=1, inplace=True)
loans.shape
(1670214, 26)
FLAGDOCUMENT* features are completely unnecssary for analysis. So, I am dropping them.
clients.drop(list(filter(lambda x: True if x.startswith('FLAG_DOCUMENT_') else False, clients.columns)), axis=1, inplace=True)
clients.shape
(307511, 53)
clients.columns
Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE',
'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL',
'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE',
'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS',
'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
'ORGANIZATION_TYPE', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR',
'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
'AMT_REQ_CREDIT_BUREAU_YEAR'],
dtype='object')
Based on Data Dictionary, I am dropping irrelevant or less important columns from clients DataFrame.
clients_cols_to_drop = ['NAME_TYPE_SUITE', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_LAST_PHONE_CHANGE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'FLAG_CONT_MOBILE', 'FLAG_EMAIL', 'FLAG_PHONE', 'FLAG_WORK_PHONE', 'FLAG_EMP_PHONE', 'FLAG_MOBIL']
clients.drop(clients_cols_to_drop, axis=1, inplace=True)
clients.shape
(307511, 28)
loans.shape
(1670214, 26)
loans.columns
Index(['SK_ID_PREV', 'SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'AMT_ANNUITY',
'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_GOODS_PRICE',
'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY',
'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'DAYS_DECISION',
'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_CLIENT_TYPE',
'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE',
'CHANNEL_TYPE', 'SELLERPLACE_AREA', 'NAME_SELLER_INDUSTRY',
'CNT_PAYMENT', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION'],
dtype='object')
Based on Data Dictionary, I am dropping irrelevant or less important columns from loans DataFrame.
loans_cols_to_drop = ['SK_ID_PREV', 'AMT_GOODS_PRICE', 'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY', 'DAYS_DECISION','NAME_CASH_LOAN_PURPOSE', 'CODE_REJECT_REASON', 'NAME_PORTFOLIO', 'SELLERPLACE_AREA', 'NAME_PRODUCT_TYPE', 'NAME_SELLER_INDUSTRY', 'PRODUCT_COMBINATION']
loans.drop(loans_cols_to_drop, axis=1, inplace=True)
loans.shape
(1670214, 12)
# Initialising Table class to pull column description from data dictionary
clients_cols = Table('clients')
loans_cols = Table('loans')
# Helper function for Categorical Univariate Analysis
def clients_cat_uni_analysis(col, friendly_name):
tmp_df = clients[col].value_counts(normalize=True).mul(100).rename('Percent').reset_index().rename(columns={'index': col})
ax = sns.barplot(data=tmp_df, x=col, y='Percent')
for p in ax.patches:
txt = str(p.get_height().round(2)) + '%'
txt_x = p.get_x() + (p.get_width()/2)
txt_y = p.get_height()
ax.annotate(txt, (txt_x, txt_y), size=11, ha='center', va='bottom')
ax.set_title(f'{friendly_name} Univariate Analysis (Normalised)', fontsize=20)
ax.set_xlabel(friendly_name)
ax.figure.set_size_inches(16,8)
return ax
# Helper function for Categorical Segmented Univariate Analysis
def clients_cat_seg_analysis(col, friendly_name):
tmp_df = clients.groupby(col)['TARGET'].value_counts(normalize=True).mul(100).rename('Percent').reset_index().replace(0,'Other Cases').replace(1,'Payment Difficulties')
ax = sns.barplot(data=tmp_df, x=col, y='Percent', hue='TARGET')
for p in ax.patches:
txt = str(p.get_height().round(2)) + '%'
txt_x = p.get_x() + (p.get_width()/2)
txt_y = p.get_height()
ax.annotate(txt, (txt_x, txt_y), size=11, ha='center', va='bottom')
ax.set_title(f'{friendly_name} Segmented Univariate Analysis (Normalised)', fontsize=20)
ax.set_xlabel(friendly_name)
ax.figure.set_size_inches(16,8)
return ax
# Describing first feature
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 0 | clients | SK_ID_CURR | ID of loan in our sample | NaN |
clients.SK_ID_CURR.dtype, clients.SK_ID_CURR.isnull().sum()
(dtype('int64'), 0)
loans_cols.describe('SK_ID_CURR')
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 123 | loans | SK_ID_CURR | ID of loan in our sample | hashed |
Assessment:
From what I can gather at this point, SK_ID_CURR can be used to join clients and loans tables. Otherwise, it has no significance in data analysis.
# Describing next feature
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 1 | clients | TARGET | Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases) | NaN |
clients.TARGET.dtype, clients.TARGET.isnull().sum()
(dtype('int64'), 0)
clients.TARGET.value_counts(normalize=True) * 100
0 91.927118 1 8.072882 Name: TARGET, dtype: float64
ax = sns.countplot(y=clients.TARGET)
plt.yticks([0,1], ['Other Cases', 'Payment Difficulties'], rotation=45)
ax.set_title('Target Variable Data Imbalance', fontsize=20)
ax.figure.set_size_inches(10,4)
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 2 | clients | NAME_CONTRACT_TYPE | Identification if loan is cash or revolving | NaN |
clients_cat_uni_analysis('NAME_CONTRACT_TYPE', 'Loan Type')
plt.show()
clients_cat_seg_analysis('NAME_CONTRACT_TYPE', 'Loan Type')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 3 | clients | CODE_GENDER | Gender of the client | NaN |
clients.CODE_GENDER.value_counts(normalize=True) * 100
F 65.834393 M 34.164306 XNA 0.001301 Name: CODE_GENDER, dtype: float64
# Checking if the rows containing XNA gender have issues
clients[clients.CODE_GENDER == 'XNA']
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | ORGANIZATION_TYPE | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 35657 | 141289 | 0 | Revolving loans | XNA | Y | Y | 0 | 207000.0 | 382500.0 | 19125.0 | ... | NaN | 2.0 | 3 | Business Entity Type 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 38566 | 144669 | 0 | Revolving loans | XNA | N | Y | 2 | 157500.0 | 270000.0 | 13500.0 | ... | Low-skill Laborers | 4.0 | 2 | Industry: type 3 | 0.0 | 0.0 | 0.0 | 3.0 | 0.0 | 4.0 |
| 83382 | 196708 | 0 | Revolving loans | XNA | N | Y | 1 | 135000.0 | 405000.0 | 20250.0 | ... | Core staff | 3.0 | 2 | Kindergarten | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
| 189640 | 319880 | 0 | Revolving loans | XNA | Y | Y | 0 | 247500.0 | 540000.0 | 27000.0 | ... | NaN | 2.0 | 2 | Medicine | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 6.0 |
4 rows × 28 columns
# Replacing XNA with NaN
clients.CODE_GENDER = clients.CODE_GENDER.replace('XNA', np.NaN)
ax = clients_cat_uni_analysis('CODE_GENDER', 'Gender')
ax.set_xticklabels(['Female', 'Male'])
plt.show()
ax = clients_cat_seg_analysis('CODE_GENDER', 'Gender')
ax.set_xticklabels(['Female', 'Male'])
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 4 | clients | FLAG_OWN_CAR | Flag if the client owns a car | NaN |
ax = clients_cat_uni_analysis('FLAG_OWN_CAR', 'Car Ownership')
ax.set_xticklabels(['No', 'Yes'])
plt.show()
ax = clients_cat_seg_analysis('FLAG_OWN_CAR', 'Car Ownership')
ax.set_xticklabels(['No', 'Yes'])
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 5 | clients | FLAG_OWN_REALTY | Flag if client owns a house or flat | NaN |
ax = clients_cat_uni_analysis('FLAG_OWN_REALTY', 'House/Flat Ownership')
ax.set_xticklabels(['Yes', 'No'])
plt.show()
ax = clients_cat_seg_analysis('FLAG_OWN_REALTY', 'House/Flat Ownership')
ax.set_xticklabels(['Yes', 'No'])
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 6 | clients | CNT_CHILDREN | Number of children the client has | NaN |
clients.CNT_CHILDREN.unique()
array([ 0, 1, 2, 3, 4, 7, 5, 6, 8, 9, 11, 12, 10, 19, 14])
Binning this into buckets for easy analysis
clients.CNT_CHILDREN = pd.cut(clients.CNT_CHILDREN, bins=[0,1,3,6,11,20], labels=['No', 'Low', 'Medium', 'High', 'Very High'], right=False)
clients.CNT_CHILDREN.value_counts()
No 215371 Low 87868 Medium 4230 High 34 Very High 8 Name: CNT_CHILDREN, dtype: int64
clients_cat_uni_analysis('CNT_CHILDREN', 'Children Count')
plt.show()
clients_cat_seg_analysis('CNT_CHILDREN', 'Children Count')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 7 | clients | AMT_INCOME_TOTAL | Income of the client | NaN |
ax = sns.boxplot(x = clients.AMT_INCOME_TOTAL)
ax.set_xlabel('Income')
ax.set_title('Income Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
# Making a sense of income distribution of majority of customers
income_minus_outliers = clients.loc[clients.AMT_INCOME_TOTAL<300000,:]
ax = sns.boxplot(data= income_minus_outliers, x='AMT_INCOME_TOTAL')
ax.set_xticks(list(income_minus_outliers.AMT_INCOME_TOTAL.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Income')
ax.set_title('Income Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
income_minus_outliers = clients[~(clients.AMT_INCOME_TOTAL > 300000)]
ax = sns.boxplot(data=income_minus_outliers, x='TARGET', y = 'AMT_INCOME_TOTAL')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Income')
ax.set_title('Income Distribution Segmented Univariate Analysis')
ax.figure.set_size_inches(12,12)
plt.show()
ax = sns.histplot(data=income_minus_outliers ,x='AMT_INCOME_TOTAL', bins=20, stat='percent', hue='TARGET', multiple='stack')
ax.set_xticks(np.linspace(0,300000,21))
ax.legend(['Other Cases', 'Payment Difficulties'])
ax.set_xlabel('Income')
ax.set_title('Income Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.histplot(data=income_minus_outliers ,x='AMT_INCOME_TOTAL', bins=20, stat='percent', hue='TARGET', multiple='fill')
ax.set_xticks(np.linspace(0,300000,21))
ax.legend(['Other Cases', 'Payment Difficulties'])
ax.set_xlabel('Income')
ax.set_title('Income Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 8 | clients | AMT_CREDIT | Credit amount of the loan | NaN |
ax = sns.boxplot(x = clients.AMT_CREDIT)
ax.set_xticks(list(clients.AMT_CREDIT.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Loan Credit (in 100,000)')
ax.set_title('Loan Credit Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.boxplot(data=clients, x='TARGET', y='AMT_CREDIT')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Loan Credit (in 100,000)')
ax.set_title('Loan Credit Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
ax = sns.histplot(data=clients, x='AMT_CREDIT', hue='TARGET', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Loan Credit (in 100,000)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Loan Credit Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.histplot(data=clients, x='AMT_CREDIT', hue='TARGET', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Loan Credit (in 100,000)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Loan Credit Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 9 | clients | AMT_ANNUITY | Loan annuity | NaN |
ax = sns.boxplot(x = clients.AMT_ANNUITY)
ax.set_xticks(list(clients.AMT_ANNUITY.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.AMT_ANNUITY<80000,:], x='TARGET', y='AMT_ANNUITY')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
ax = sns.histplot(data=clients, x='AMT_ANNUITY', hue='TARGET', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Loan Annuity')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Loan Annuity Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.histplot(data=clients, x='AMT_ANNUITY', hue='TARGET', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Loan Annuity')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Loan Annuity Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 10 | clients | AMT_GOODS_PRICE | For consumer loans it is the price of the goods for which the loan is given | NaN |
ax = sns.boxplot(x = clients.AMT_GOODS_PRICE)
ax.set_xticks(list(clients.AMT_GOODS_PRICE.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Price of Goods (in 100,000)')
ax.set_title('Price of Goods (for Consumer Loans) Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.boxplot(data=clients, x='TARGET', y='AMT_GOODS_PRICE')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Price of Goods (in 100,000)')
ax.set_title('Price of Goods (for Consumer Loans) Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
ax = sns.histplot(data=clients, x='AMT_GOODS_PRICE', hue='TARGET', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Region Population (Normalised)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Price of Goods (for Consumer Loans) Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.histplot(data=clients, x='AMT_GOODS_PRICE', hue='TARGET', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Region Population (Normalised)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Price of Goods (for Consumer Loans) Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 12 | clients | NAME_INCOME_TYPE | Clients income type (businessman, working, maternity leave,…) | NaN |
clients_cat_uni_analysis('NAME_INCOME_TYPE', 'Income Type')
plt.show()
clients_cat_seg_analysis('NAME_INCOME_TYPE', 'Income Type')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 13 | clients | NAME_EDUCATION_TYPE | Level of highest education the client achieved | NaN |
clients_cat_uni_analysis('NAME_EDUCATION_TYPE', 'Highest Education Level')
plt.show()
clients_cat_seg_analysis('NAME_EDUCATION_TYPE', 'Highest Education Level')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 14 | clients | NAME_FAMILY_STATUS | Family status of the client | NaN |
clients.NAME_FAMILY_STATUS.value_counts()
Married 196432 Single / not married 45444 Civil marriage 29775 Separated 19770 Widow 16088 Unknown 2 Name: NAME_FAMILY_STATUS, dtype: int64
For this analysis, the difference between 'Married' and 'Civil marriage' holds no value. Family expenses should be same. So, I am merging these two categories.
clients.NAME_FAMILY_STATUS = clients.NAME_FAMILY_STATUS.replace('Civil marriage', 'Married')
# Converting unknown to NaN
clients.NAME_FAMILY_STATUS = clients.NAME_FAMILY_STATUS.replace('Unknown', np.NaN)
clients_cat_uni_analysis('NAME_FAMILY_STATUS', 'Relationship Status')
plt.show()
clients_cat_seg_analysis('NAME_FAMILY_STATUS', 'Relationship Status')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 15 | clients | NAME_HOUSING_TYPE | What is the housing situation of the client (renting, living with parents, ...) | NaN |
clients_cat_uni_analysis('NAME_HOUSING_TYPE', 'Housing Type')
plt.show()
clients_cat_seg_analysis('NAME_HOUSING_TYPE', 'Housing Type')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 16 | clients | REGION_POPULATION_RELATIVE | Normalized population of region where client lives (higher number means the client lives in more populated region) | normalized |
ax = sns.boxplot(x = clients.REGION_POPULATION_RELATIVE)
ax.set_xticks(list(clients.REGION_POPULATION_RELATIVE.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Region Population (Normalised)')
ax.set_title('Region Population (Normalised) Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.REGION_POPULATION_RELATIVE<0.05,:], x='TARGET', y='REGION_POPULATION_RELATIVE')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Region Population (Normalised)')
ax.set_title('Region Population (Normalised) Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
ax = sns.histplot(data=clients, x='REGION_POPULATION_RELATIVE', hue='TARGET', bins=15, stat='percent', multiple='stack')
ax.set_xlabel('Region Population (Normalised)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Region Population (Normalised) Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.histplot(data=clients, x='REGION_POPULATION_RELATIVE', hue='TARGET', bins=15, stat='percent', multiple='fill')
ax.set_xlabel('Region Population (Normalised)')
ax.legend(labels=['Payment Difficulties', 'Other Cases'])
ax.set_title('Region Population (Normalised) Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 17 | clients | DAYS_BIRTH | Client's age in days at the time of application | time only relative to the application |
# Converting it to absolute age in years
clients['AGE'] = clients.DAYS_BIRTH.abs() / 365
ax = sns.boxplot(x = clients.AGE)
ax.set_xticks(list(clients.AGE.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('AGE')
ax.set_title('Age Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.boxplot(data=clients, x='TARGET', y='AGE')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Age')
ax.set_title('Age Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
# Creating buckets for better analysis
clients['AGE_BUCKET'] = pd.cut(clients.AGE, bins=range(20,71,10), labels=['20-30', '30-40', '40-50','50-60', '60-70'], right=False)
clients_cat_uni_analysis('AGE_BUCKET', 'Age Bucket')
plt.show()
clients_cat_seg_analysis('AGE_BUCKET', 'Age Bucket')
plt.show()
Assesment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 18 | clients | DAYS_EMPLOYED | How many days before the application the person started current employment | time only relative to the application |
# Converting it to years employed
clients['YEARS_EMPLOYED'] = clients.DAYS_EMPLOYED.abs() / 365
ax = sns.boxplot(x = clients.YEARS_EMPLOYED)
ax.figure.set_size_inches(24,8)
plt.show()
There's an anomaly. Nobody can be employed for 1000 years. This is definitely missing value.
clients.loc[clients.YEARS_EMPLOYED > 100, 'YEARS_EMPLOYED'] = np.NaN
ax = sns.boxplot(x = clients.YEARS_EMPLOYED)
ax.set_xticks(list(clients.YEARS_EMPLOYED.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Years Employed')
ax.set_title('Years Employed Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.YEARS_EMPLOYED < 20,:], x='TARGET', y='YEARS_EMPLOYED')
ax.set_xticks([0,1], ['Other Cases', 'Payment Difficulties'])
ax.set_ylabel('Years Employed')
ax.set_title('Years Employed Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
# Creating buckets for better analysis
clients['EMPLOYED_EXPERIENCE'] = pd.cut(clients.YEARS_EMPLOYED, bins=[0,1,5,10,20,100], labels=['Fresher', 'Associate', 'Senior','Master', 'Grand Master'], right=False)
clients_cat_uni_analysis('EMPLOYED_EXPERIENCE', 'Job Experience')
plt.show()
clients_cat_seg_analysis('EMPLOYED_EXPERIENCE', 'Job Experience')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 28 | clients | OCCUPATION_TYPE | What kind of occupation does the client have | NaN |
ax = clients_cat_uni_analysis('OCCUPATION_TYPE', 'Occupation Type')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()
ax = clients_cat_seg_analysis('OCCUPATION_TYPE', 'Occupation Type')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 29 | clients | CNT_FAM_MEMBERS | How many family members does client have | NaN |
clients.CNT_FAM_MEMBERS.value_counts()
2.0 158357 1.0 67847 3.0 52601 4.0 24697 5.0 3478 6.0 408 7.0 81 8.0 20 9.0 6 10.0 3 14.0 2 12.0 2 20.0 2 16.0 2 13.0 1 15.0 1 11.0 1 Name: CNT_FAM_MEMBERS, dtype: int64
# Binning number of family members into family size for easy analysis
clients.CNT_FAM_MEMBERS = clients.CNT_FAM_MEMBERS.replace(1.0, 'Single').replace(2.0, 'Couple').replace([3.0,4.0,5.0], 'Small Family').replace([6.0,7.0,8.0,9.0,10.0], 'Mid Family').replace([11.0,12.0,13.0,14.0,15.0,16.0,20.0], 'Big Family')
clients.CNT_FAM_MEMBERS.value_counts()
Couple 158357 Small Family 80776 Single 67847 Mid Family 518 Big Family 11 Name: CNT_FAM_MEMBERS, dtype: int64
clients_cat_uni_analysis('CNT_FAM_MEMBERS', 'Family Size')
plt.show()
clients_cat_seg_analysis('CNT_FAM_MEMBERS', 'Family Size')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 30 | clients | REGION_RATING_CLIENT | Our rating of the region where client lives (1,2,3) | NaN |
clients_cat_uni_analysis('REGION_RATING_CLIENT', 'Region Rating')
plt.show()
clients_cat_seg_analysis('REGION_RATING_CLIENT', 'Region Rating')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 40 | clients | ORGANIZATION_TYPE | Type of organization where client works | NaN |
len(clients.ORGANIZATION_TYPE.value_counts())
58
# Reducing the number of unique values
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.apply(lambda x: 'Business Entity' if x.startswith('Business Entity') else x)
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.apply(lambda x: 'Trade' if x.startswith('Trade') else x)
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.apply(lambda x: 'Transport' if x.startswith('Transport') else x)
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.apply(lambda x: 'Industry' if x.startswith('Industry') else x)
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.apply(lambda x: 'Trade' if x.startswith('Trade') else x)
clients.ORGANIZATION_TYPE = clients.ORGANIZATION_TYPE.replace(['XNA', 'Other'], np.NaN)
ax = clients_cat_uni_analysis('ORGANIZATION_TYPE', 'Organisation Type')
ax.figure.set_size_inches(24, 12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()
ax = clients_cat_seg_analysis('ORGANIZATION_TYPE', 'Organisation Type')
ax.figure.set_size_inches(24, 12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 116 | clients | AMT_REQ_CREDIT_BUREAU_HOUR | Number of enquiries to Credit Bureau about the client one hour before application | NaN |
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_HOUR', 'Last Hour Credit Bureau Enquiry')
ax.set_xticklabels([0,1,2,3,4])
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Hour')
plt.show()
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_HOUR', 'Last Hour Credit Bureau Enquiry')
ax.set_xticklabels([0,1,2,3,4])
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Hour')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 117 | clients | AMT_REQ_CREDIT_BUREAU_DAY | Number of enquiries to Credit Bureau about the client one day before application (excluding one hour before application) | NaN |
clients.AMT_REQ_CREDIT_BUREAU_DAY = pd.cut(clients.AMT_REQ_CREDIT_BUREAU_DAY, bins=[0.0,1.0,4.0,10.0], labels=['No','Low','High'], right=False)
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_DAY', 'Last Day Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Day')
plt.show()
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_DAY', 'Last Day Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Day')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 118 | clients | AMT_REQ_CREDIT_BUREAU_WEEK | Number of enquiries to Credit Bureau about the client one week before application (excluding one day before application) | NaN |
clients.AMT_REQ_CREDIT_BUREAU_WEEK = pd.cut(clients.AMT_REQ_CREDIT_BUREAU_WEEK, bins=[0.0,1.0,4.0,10.0], labels=['No','Low','High'], right=False)
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_WEEK', 'Last Week Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Week')
plt.show()
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_WEEK', 'Last Week Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Week')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 119 | clients | AMT_REQ_CREDIT_BUREAU_MON | Number of enquiries to Credit Bureau about the client one month before application (excluding one week before application) | NaN |
clients.AMT_REQ_CREDIT_BUREAU_MON = pd.cut(clients.AMT_REQ_CREDIT_BUREAU_MON, bins=[0.0,1.0,6.0,11.0,16.0,30.0], labels=['No','Low', 'Mid', 'High', 'Very High'], right=False)
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_MON', 'Last Month Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Month')
plt.show()
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_MON', 'Last Month Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Month')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 120 | clients | AMT_REQ_CREDIT_BUREAU_QRT | Number of enquiries to Credit Bureau about the client 3 month before application (excluding one month before application) | NaN |
clients.AMT_REQ_CREDIT_BUREAU_QRT = pd.cut(clients.AMT_REQ_CREDIT_BUREAU_QRT, bins=[0.0,1.0,6.0,11.0,20.0,500.0], labels=['No','Low', 'Mid', 'High', 'Very High'], right=False)
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_QRT', 'Last Quarter Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Quarter')
plt.show()
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_QRT', 'Last Quarter Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Quarter')
plt.show()
Assessment:
clients_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 121 | clients | AMT_REQ_CREDIT_BUREAU_YEAR | Number of enquiries to Credit Bureau about the client one day year (excluding last 3 months before application) | NaN |
clients.AMT_REQ_CREDIT_BUREAU_YEAR = pd.cut(clients.AMT_REQ_CREDIT_BUREAU_YEAR, bins=[0.0,1.0,6.0,11.0,16.0,30.0], labels=['No','Low', 'Mid', 'High', 'Very High'], right=False)
ax = clients_cat_uni_analysis('AMT_REQ_CREDIT_BUREAU_YEAR', 'Last Year Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Year')
plt.show()
ax = clients_cat_seg_analysis('AMT_REQ_CREDIT_BUREAU_YEAR', 'Last Year Credit Bureau Enquiry')
ax.set_xlabel('No. of Credit Bureau Enquiry in Last Year')
plt.show()
Assessment:
# Helper function for Categorical Univariate Analysis
def loans_cat_uni_analysis(col, friendly_name):
tmp_df = loans[col].value_counts(normalize=True).mul(100).rename('Percent').reset_index().rename(columns={'index': col})
ax = sns.barplot(data=tmp_df, x=col, y='Percent')
for p in ax.patches:
txt = str(p.get_height().round(2)) + '%'
txt_x = p.get_x() + (p.get_width()/2)
txt_y = p.get_height()
ax.annotate(txt, (txt_x, txt_y), size=11, ha='center', va='bottom')
ax.set_title(f'{friendly_name} Univariate Analysis (Normalised)', fontsize=20)
ax.set_xlabel(friendly_name)
ax.figure.set_size_inches(16,8)
return ax
# Helper function for Categorical Segmented Univariate Analysis
def loans_cat_seg_analysis(col, friendly_name):
tmp_df = loans.groupby(col)['NAME_CONTRACT_STATUS'].value_counts(normalize=True).mul(100).rename('Percent').reset_index()
ax = sns.barplot(data=tmp_df, x=col, y='Percent', hue='NAME_CONTRACT_STATUS')
for p in ax.patches:
txt = str(p.get_height().round(2)) + '%'
txt_x = p.get_x() + (p.get_width()/2)
txt_y = p.get_height()
ax.annotate(txt, (txt_x, txt_y), size=11, ha='center', va='bottom')
ax.set_title(f'{friendly_name} Segmented Univariate Analysis (Normalised)', fontsize=20)
ax.set_xlabel(friendly_name)
ax.figure.set_size_inches(16,8)
return ax
loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 123 | loans | SK_ID_CURR | ID of loan in our sample | hashed |
clients_cols.describe('SK_ID_CURR')
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 0 | clients | SK_ID_CURR | ID of loan in our sample | NaN |
Assessment:
clients and loans data frames.loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 124 | loans | NAME_CONTRACT_TYPE | Contract product type (Cash loan, consumer loan [POS] ,...) of the previous application | NaN |
loans.NAME_CONTRACT_TYPE.value_counts()
Cash loans 747553 Consumer loans 729151 Revolving loans 193164 XNA 346 Name: NAME_CONTRACT_TYPE, dtype: int64
# Converting 'XNA' to NaN
loans.NAME_CONTRACT_TYPE = loans.NAME_CONTRACT_TYPE.replace('XNA', np.NaN)
loans_cat_uni_analysis('NAME_CONTRACT_TYPE', 'Loan Type')
plt.show()
loans_cat_seg_analysis('NAME_CONTRACT_TYPE', 'Loan Type')
plt.show()
Assessment:
loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 125 | loans | AMT_ANNUITY | Annuity of previous application | NaN |
ax = sns.boxplot(x = loans.AMT_ANNUITY)
ax.set_xlabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
loans_annuity_minus_outliers = loans.loc[loans.AMT_ANNUITY<220000,:].AMT_ANNUITY
ax = sns.boxplot(x = loans_annuity_minus_outliers)
ax.set_xticks(list(loans_annuity_minus_outliers.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xticklabels(ax.get_xticks(), rotation=45)
ax.set_xlabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.boxplot(data=loans.loc[loans.AMT_ANNUITY<220000,:], x='NAME_CONTRACT_STATUS', y='AMT_ANNUITY')
ax.set_ylabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
ax = sns.histplot(data=loans.loc[loans.AMT_ANNUITY<220000,:], x='AMT_ANNUITY', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.histplot(data=loans.loc[loans.AMT_ANNUITY<220000,:], x='AMT_ANNUITY', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Loan Annuity')
ax.set_title('Loan Annuity Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
Assessment:
loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 126 | loans | AMT_APPLICATION | For how much credit did client ask on the previous application | NaN |
ax = sns.boxplot(x = loans.AMT_APPLICATION)
ax.set_xlabel('Application Amount')
ax.set_title('Application Amount Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
loans_application_minus_outliers = loans.loc[loans.AMT_APPLICATION<3000000,:]
ax = sns.boxplot(x = loans_application_minus_outliers.AMT_APPLICATION)
ax.set_xlabel('Application Amount')
ax.set_title('Application Amount Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.boxplot(data=loans_application_minus_outliers, x='NAME_CONTRACT_STATUS', y='AMT_APPLICATION')
ax.set_ylabel('Application Amount')
ax.set_title('Application Amount Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
ax = sns.histplot(data=loans_application_minus_outliers, x='AMT_APPLICATION', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Application Amount')
ax.set_title('Application Amount Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.histplot(data=loans_application_minus_outliers, x='AMT_APPLICATION', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Application Amount')
ax.set_title('Application Amount Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
Assessment:
loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 127 | loans | AMT_CREDIT | Final credit amount on the previous application. This differs from AMT_APPLICATION in a way that the AMT_APPLICATION is the amount for which the client initially applied for, but during our approval process he could have received different amount - AMT_CREDIT | NaN |
ax = sns.boxplot(x = loans.AMT_CREDIT)
ax.set_xlabel('Loan CREDIT')
ax.set_title('Loan CREDIT Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
loans_credit_minus_outliers = loans.loc[loans.AMT_CREDIT<350000,:]
ax = sns.boxplot(x = loans_credit_minus_outliers.AMT_CREDIT)
ax.set_xticks(list(loans_credit_minus_outliers.AMT_CREDIT.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Loan CREDIT')
ax.set_title('Loan CREDIT Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.boxplot(data=loans_credit_minus_outliers, x='NAME_CONTRACT_STATUS', y='AMT_CREDIT')
ax.set_ylabel('Loan Credit')
ax.set_title('Loan Credit Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
ax = sns.histplot(data=loans_application_minus_outliers, x='AMT_CREDIT', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='stack')
ax.set_xlabel('Loan Credit')
ax.set_title('Loan Credit Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.histplot(data=loans_application_minus_outliers, x='AMT_CREDIT', hue='NAME_CONTRACT_STATUS', bins=25, stat='percent', multiple='fill')
ax.set_xlabel('Loan Credit')
ax.set_title('Loan Credit Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
Assessment:
loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 139 | loans | NAME_CONTRACT_STATUS | Contract status (approved, cancelled, ...) of previous application | NaN |
ax = sns.countplot(x=loans.NAME_CONTRACT_STATUS)
ax.set_title('Target Variable Data Imbalance', fontsize=20)
ax.figure.set_size_inches(12,8)
plt.show()
Assessment:
NAME_CONTRACT_STATUS is a target variable.loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 141 | loans | NAME_PAYMENT_TYPE | Payment method that client chose to pay for the previous application | NaN |
# Renaming segments
loans.NAME_PAYMENT_TYPE = loans.NAME_PAYMENT_TYPE.replace('XNA', np.NaN).replace('Cash through the bank', 'Cash').replace('Non-cash from your account', 'Account Debit').replace('Cashless from the account of the employer', 'Employer')
loans_cat_uni_analysis('NAME_PAYMENT_TYPE', 'Loan Payment')
plt.show()
loans_cat_seg_analysis('NAME_PAYMENT_TYPE', 'Loan Payment')
plt.show()
Assessment:
loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 144 | loans | NAME_CLIENT_TYPE | Was the client old or new client when applying for the previous application | NaN |
# Fixing values
loans.NAME_CLIENT_TYPE = loans.NAME_CLIENT_TYPE.replace('XNA', np.NaN).replace(['Repeater', 'Refreshed'], 'Old')
loans_cat_uni_analysis('NAME_CLIENT_TYPE', 'Client Type')
plt.show()
loans_cat_seg_analysis('NAME_CLIENT_TYPE', 'Client Type')
plt.show()
Assessment:
loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 145 | loans | NAME_GOODS_CATEGORY | What kind of goods did the client apply for in the previous application | NaN |
# Replacing XNA with NaN
loans.NAME_GOODS_CATEGORY = loans.NAME_GOODS_CATEGORY.replace('XNA', np.NaN)
ax = loans_cat_uni_analysis('NAME_GOODS_CATEGORY', 'Goods')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()
ax = loans_cat_seg_analysis('NAME_GOODS_CATEGORY', 'Goods')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
plt.show()
Assessment:
loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 148 | loans | CHANNEL_TYPE | Through which channel we acquired the client on the previous application | NaN |
loans_cat_uni_analysis('CHANNEL_TYPE', 'Acquision Channel')
plt.show()
ax = loans_cat_seg_analysis('CHANNEL_TYPE', 'Acquision Channel')
ax.figure.set_size_inches(24,12)
plt.show()
Assessment:
loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 151 | loans | CNT_PAYMENT | Term of previous credit at application of the previous application | NaN |
ax = sns.boxplot(x = loans.CNT_PAYMENT)
ax.set_xticks(list(loans.CNT_PAYMENT.quantile([0,0.25,0.5,0.75,0.95,1])))
ax.set_xlabel('Loan Term')
ax.set_title('Loan Term Distribution Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.boxplot(data=loans, x='NAME_CONTRACT_STATUS', y='CNT_PAYMENT')
ax.set_ylabel('Loan Terms')
ax.set_title('Loan Terms Distribution Segmented Univariate Analysis', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
ax = sns.histplot(data=loans, x='CNT_PAYMENT', hue='NAME_CONTRACT_STATUS', bins=15, stat='percent', multiple='stack')
ax.set_xlabel('Loan Terms')
ax.set_title('Loan Terms Distribution Histogram', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.histplot(data=loans, x='CNT_PAYMENT', hue='NAME_CONTRACT_STATUS', bins=15, stat='percent', multiple='fill')
ax.set_xlabel('Loan Terms')
ax.set_title('Loan Terms Distribution Histogram (Normalised)', fontsize=20)
ax.figure.set_size_inches(24,8)
plt.show()
Assessment:
loans_cols.describe_next()
| Table | Column | Description | Special | |
|---|---|---|---|---|
| 152 | loans | NAME_YIELD_GROUP | Grouped interest rate into small medium and high of the previous application | grouped |
loans.NAME_YIELD_GROUP = loans.NAME_YIELD_GROUP.replace('XNA', np.NaN)
# Merging low_normal & low_action segments
loans.NAME_YIELD_GROUP = loans.NAME_YIELD_GROUP.replace(['low_normal', 'low_action'], 'low')
loans_cat_uni_analysis('NAME_YIELD_GROUP', 'Interest Rate')
plt.show()
loans_cat_seg_analysis('NAME_YIELD_GROUP', 'Interest Rate')
plt.show()
Assessment:
AMT_CREDITAMT_INCOMECODE_GENDERAGE_BUCKETNAME_EDUCATION_TYPEax = sns.scatterplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET')
ax.set_xlabel('Income')
ax.set_ylabel('Loan Credit')
ax.set_title('Income vs Loan Credit', fontsize=20)
ax.legend(['Other Cases', 'Payment Difficulties'])
ax.figure.set_size_inches(12,12)
plt.show()
ax = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='CODE_GENDER')
ax.figure.set_size_inches(24,12)
plt.show()
ax = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='AGE_BUCKET', col_wrap=3)
ax.figure.set_size_inches(24,16)
plt.show()
ax = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='NAME_EDUCATION_TYPE', col_wrap=3)
ax.figure.set_size_inches(24,16)
plt.show()
Assessment:
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<1000000,:], x='CODE_GENDER', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Gender', fontsize=20)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<1000000,:], x='AGE_BUCKET', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Age Group', fontsize=20)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<1000000,:], x='NAME_EDUCATION_TYPE', y='AMT_CREDIT', hue='TARGET', order=['Lower secondary', 'Secondary / secondary special', 'Incomplete higher', 'Higher education', 'Academic degree'])
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Education Level', fontsize=20)
plt.show()
Assessment:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='CODE_GENDER', y='AMT_INCOME_TOTAL', hue='TARGET')
ax.set_title('Income vs Gender', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
Assessment:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AGE_BUCKET', y='AMT_INCOME_TOTAL', hue='TARGET')
ax.set_title('Income vs Age Group', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
Assessment:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='NAME_EDUCATION_TYPE', y='AMT_INCOME_TOTAL', hue='TARGET', order=['Lower secondary', 'Secondary / secondary special', 'Incomplete higher', 'Higher education', 'Academic degree'])
ax.set_title('Income vs Education Level', fontsize=20)
ax.figure.set_size_inches(24,12)
plt.show()
Assessment:
AMT_CREDITAMT_INCOMEFLAG_OWN_CARCNT_CHILDRENEMPLOYED_EXPERIENCEg = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET')
g.figure.set_size_inches(8,8)
plt.show()
g = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='FLAG_OWN_CAR')
g.figure.set_size_inches(12,8)
plt.show()
g = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='CNT_CHILDREN', col_wrap=3)
g.figure.set_size_inches(24,16)
plt.show()
g = sns.relplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='AMT_INCOME_TOTAL', y='AMT_CREDIT', hue='TARGET', col='EMPLOYED_EXPERIENCE', col_wrap=3)
g.figure.set_size_inches(24,16)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='FLAG_OWN_CAR', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Car Ownership', fontsize=20)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='CNT_CHILDREN', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Children Count', fontsize=20)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='EMPLOYED_EXPERIENCE', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Work Experience', fontsize=20)
plt.show()
Assessment:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='FLAG_OWN_CAR', y='AMT_INCOME_TOTAL', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Income vs Car Ownership', fontsize=20)
plt.show()
Assessment:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='CNT_CHILDREN', y='AMT_INCOME_TOTAL', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Income vs Children Count', fontsize=20)
plt.show()
Assessment:
ax = sns.boxplot(data=clients.loc[clients.AMT_INCOME_TOTAL<300000,:], x='EMPLOYED_EXPERIENCE', y='AMT_INCOME_TOTAL', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Income vs Work Experience', fontsize=20)
plt.show()
Assessment:
AMT_CREDITREGION_POPULATION_RELATIVEFLAG_OWN_CARNAME_FAMILY_STATUSNAME_CONTRACT_TYPEax = sns.scatterplot(data=clients, x='REGION_POPULATION_RELATIVE', y='AMT_CREDIT', hue='TARGET')
ax.set_xlabel('Region Population')
ax.set_ylabel('Loan Credit')
ax.set_title('Loan Credit vs Region Population', fontsize=20)
ax.legend(['Other Cases', 'Payment Difficulties'])
ax.figure.set_size_inches(12,12)
plt.show()
/Users/Sachin/opt/miniconda3/envs/UpGrad/lib/python3.9/site-packages/IPython/core/pylabtools.py:151: UserWarning: Creating legend with loc="best" can be slow with large amounts of data. fig.canvas.print_figure(bytes_io, **kw)
ax = sns.relplot(data=clients, x='REGION_POPULATION_RELATIVE', y='AMT_CREDIT', hue='TARGET', col='FLAG_OWN_CAR', col_wrap=3)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.relplot(data=clients, x='REGION_POPULATION_RELATIVE', y='AMT_CREDIT', hue='TARGET', col='NAME_FAMILY_STATUS', col_wrap=3)
ax.figure.set_size_inches(24,16)
plt.show()
ax = sns.relplot(data=clients, x='REGION_POPULATION_RELATIVE', y='AMT_CREDIT', hue='TARGET', col='NAME_CONTRACT_TYPE', col_wrap=3)
ax.figure.set_size_inches(24,8)
plt.show()
Assessment:
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='NAME_CONTRACT_TYPE', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Loan Types', fontsize=20)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='FLAG_OWN_CAR', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Car Ownership', fontsize=20)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.AMT_CREDIT<300000,:], x='NAME_FAMILY_STATUS', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Relationship Status', fontsize=20)
plt.show()
AMT_INCOME_TOTALAMT_CREDITREGION_POPULATION_RELATIVEAGEYEARS_EMPLOYEDg = sns.pairplot(clients.loc[clients.AMT_INCOME_TOTAL<300000,:], hue='TARGET', vars=['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'REGION_POPULATION_RELATIVE', 'AGE', 'YEARS_EMPLOYED'])
g.figure.set_size_inches(24,24)
plt.show()
AMT_REQ_CREDIT_BUREAU_YEARNAME_FAMILY_STATUSCODE_GENDERAMT_ANNUITYAMT_GOODS_PRICEax = sns.scatterplot(data=clients, x='AMT_ANNUITY', y='AMT_GOODS_PRICE', hue='TARGET')
ax.set_xlabel('Loan Annuity')
ax.set_ylabel('Goods Price')
ax.set_title('Loan Annuity vs Goods Price', fontsize=20)
ax.legend(['Other Cases', 'Payment Difficulties'])
ax.figure.set_size_inches(12,12)
plt.show()
Assessment:
ax = sns.relplot(data=clients, x='AMT_ANNUITY', y='AMT_GOODS_PRICE', hue='TARGET', col='NAME_FAMILY_STATUS', col_wrap=3)
ax.figure.set_size_inches(24,16)
plt.show()
ax = sns.relplot(data=clients, x='AMT_ANNUITY', y='AMT_GOODS_PRICE', hue='TARGET', col='CODE_GENDER', col_wrap=3)
ax.figure.set_size_inches(24,8)
plt.show()
ax = sns.relplot(data=clients, x='AMT_ANNUITY', y='AMT_GOODS_PRICE', hue='TARGET', col='AMT_REQ_CREDIT_BUREAU_YEAR', col_wrap=3)
ax.figure.set_size_inches(24,16)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.AMT_ANNUITY<150000,:], x='CODE_GENDER', y='AMT_ANNUITY', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Annuity vs Gender', fontsize=20)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.AMT_ANNUITY<150000,:], x='NAME_FAMILY_STATUS', y='AMT_ANNUITY', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Annuity vs Relationship Status', fontsize=20)
plt.show()
ax = sns.boxplot(data=clients.loc[clients.AMT_ANNUITY<150000,:], x='AMT_REQ_CREDIT_BUREAU_YEAR', y='AMT_ANNUITY', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Annuity vs Credit Bureau Enquiry', fontsize=20)
plt.show()
AMT_CREDITCNT_PAYMENTNAME_CONTRACT_TYPENAME_CLIENT_TYPENAME_YIELD_GROUPax = sns.scatterplot(data=loans, x='CNT_PAYMENT', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS')
ax.set_xlabel('Loan Terms')
ax.set_ylabel('Loan Credit')
ax.set_title('Loan Terms vs Loan Credit', fontsize=20)
ax.figure.set_size_inches(12,12)
plt.show()
Assessment 1:
ax = sns.relplot(data=loans, x='CNT_PAYMENT', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS', col='NAME_CLIENT_TYPE')
ax.figure.set_size_inches(24,12)
plt.show()
Assessment 2:
ax = sns.relplot(data=loans, x='CNT_PAYMENT', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS', col='NAME_YIELD_GROUP')
ax.figure.set_size_inches(24,12)
plt.show()
Assessment 3:
ax = sns.boxplot(data=loans_credit_minus_outliers, x='NAME_CONTRACT_STATUS', y='AMT_CREDIT')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Loan Status', fontsize=20)
plt.show()
Assessment 4:
ax = sns.boxplot(data=loans_credit_minus_outliers, x='NAME_CLIENT_TYPE', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Client Type', fontsize=20)
plt.show()
Assessment 5:
ax = sns.boxplot(data=loans_credit_minus_outliers, x='NAME_YIELD_GROUP', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Interest Rate', fontsize=20)
plt.show()
Assessment 6:
ax = sns.boxplot(data=loans_credit_minus_outliers, x='NAME_CONTRACT_TYPE', y='AMT_CREDIT', hue='NAME_CONTRACT_STATUS')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Contract Type', fontsize=20)
plt.show()
Assessment 7:
ax = sns.boxplot(data=loans, x='NAME_CONTRACT_STATUS', y='CNT_PAYMENT')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Terms vs Loan Status', fontsize=20)
plt.show()
Assessment 8:
ax = sns.boxplot(data=loans, x='NAME_CLIENT_TYPE', y='CNT_PAYMENT', hue='NAME_CONTRACT_STATUS')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Terms vs Client Type', fontsize=20)
plt.show()
Assessment 9:
# Merging clients and loans data frames
merged = pd.merge(left=clients[['SK_ID_CURR','TARGET']], right=loans[['SK_ID_CURR', 'AMT_CREDIT', 'CNT_PAYMENT', 'NAME_CONTRACT_TYPE', 'NAME_CLIENT_TYPE', 'NAME_YIELD_GROUP']], how='inner', on='SK_ID_CURR')
merged.head()
| SK_ID_CURR | TARGET | AMT_CREDIT | CNT_PAYMENT | NAME_CONTRACT_TYPE | NAME_CLIENT_TYPE | NAME_YIELD_GROUP | |
|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | 179055.0 | 24.0 | Consumer loans | New | low |
| 1 | 100003 | 0 | 1035882.0 | 12.0 | Cash loans | Old | low |
| 2 | 100003 | 0 | 348637.5 | 6.0 | Consumer loans | Old | middle |
| 3 | 100003 | 0 | 68053.5 | 12.0 | Consumer loans | Old | middle |
| 4 | 100004 | 0 | 20106.0 | 4.0 | Consumer loans | New | middle |
ax = sns.scatterplot(data=merged, x='CNT_PAYMENT', y='AMT_CREDIT', hue='TARGET')
ax.set_xlabel('Loan Terms')
ax.set_ylabel('Loan Credit')
ax.set_title('Loan Terms vs Loan Credit', fontsize=20)
ax.figure.set_size_inches(12,12)
plt.show()
Assessment 1:
ax = sns.relplot(data=merged, x='CNT_PAYMENT', y='AMT_CREDIT', hue='TARGET', col='NAME_CLIENT_TYPE')
ax.figure.set_size_inches(24,12)
plt.show()
Assessment 2:
ax = sns.relplot(data=merged, x='CNT_PAYMENT', y='AMT_CREDIT', hue='TARGET', col='NAME_YIELD_GROUP')
ax.figure.set_size_inches(24,12)
plt.show()
Assessment 3:
Interest rate grouping doesn't tell anything strong about payment difficulties.
merged_credit_minus_outliers = merged.loc[merged.AMT_CREDIT<350000,:]
ax = sns.boxplot(data=merged_credit_minus_outliers, x='TARGET', y='AMT_CREDIT')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Payment Difficulty', fontsize=20)
plt.show()
Assessment 4:
ax = sns.boxplot(data=merged_credit_minus_outliers, x='NAME_CLIENT_TYPE', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Client Type', fontsize=20)
plt.show()
Assessment 5:
ax = sns.boxplot(data=merged_credit_minus_outliers, x='NAME_YIELD_GROUP', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Interest Rate', fontsize=20)
plt.show()
Assessment 6:
ax = sns.boxplot(data=merged_credit_minus_outliers, x='NAME_CONTRACT_TYPE', y='AMT_CREDIT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Credit vs Contract Type', fontsize=20)
plt.show()
Assessment 7:
ax = sns.boxplot(data=merged, x='TARGET', y='CNT_PAYMENT')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Terms vs Payment Difficulty', fontsize=20)
plt.show()
Assessment 8:
ax = sns.boxplot(data=merged, x='NAME_CLIENT_TYPE', y='CNT_PAYMENT', hue='TARGET')
ax.figure.set_size_inches(24,12)
ax.set_title('Loan Terms vs Client Type', fontsize=20)
plt.show()
Assessment 9:
ax = sns.heatmap(clients[['TARGET', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'AGE', 'YEARS_EMPLOYED']].corr(), cmap='RdYlGn', annot=True)
ax.figure.set_size_inches(24,12)
plt.show()
ax = sns.heatmap(loans[['AMT_APPLICATION', 'AMT_CREDIT', 'AMT_ANNUITY', 'CNT_PAYMENT']].corr(), cmap='RdYlGn', annot=True)
ax.figure.set_size_inches(24,12)
plt.show()
Assessment: