Lead Scoring Logistic Regression¶

Sachin Shekhar (SachinShekhar@outlook.com)
Aashish Sahu (< email redacted >)

In [ ]:
# Importing essential libraries
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, roc_auc_score
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

Exploratory Data Analysis¶

Loading Data¶

In [ ]:
# Loading data
df = pd.read_csv('Leads.csv')
df.head()
Out[ ]:
Prospect ID Lead Number Lead Origin Lead Source Do Not Email Do Not Call Converted TotalVisits Total Time Spent on Website Page Views Per Visit ... Get updates on DM Content Lead Profile City Asymmetrique Activity Index Asymmetrique Profile Index Asymmetrique Activity Score Asymmetrique Profile Score I agree to pay the amount through cheque A free copy of Mastering The Interview Last Notable Activity
0 7927b2df-8bba-4d29-b9a2-b6e0beafe620 660737 API Olark Chat No No 0 0.0 0 0.0 ... No Select Select 02.Medium 02.Medium 15.0 15.0 No No Modified
1 2a272436-5132-4136-86fa-dcc88c88f482 660728 API Organic Search No No 0 5.0 674 2.5 ... No Select Select 02.Medium 02.Medium 15.0 15.0 No No Email Opened
2 8cc8c611-a219-4f35-ad23-fdfd2656bd8a 660727 Landing Page Submission Direct Traffic No No 1 2.0 1532 2.0 ... No Potential Lead Mumbai 02.Medium 01.High 14.0 20.0 No Yes Email Opened
3 0cc2df48-7cf4-4e39-9de9-19797f9b38cc 660719 Landing Page Submission Direct Traffic No No 0 1.0 305 1.0 ... No Select Mumbai 02.Medium 01.High 13.0 17.0 No No Modified
4 3256f628-e534-4826-9d63-4a8b88782852 660681 Landing Page Submission Google No No 1 2.0 1428 1.0 ... No Select Mumbai 02.Medium 01.High 15.0 18.0 No No Modified

5 rows × 37 columns

In [ ]:
df.shape
Out[ ]:
(9240, 37)

Cleaning Data¶

Missing Values¶

In [ ]:
df_missing = df.isnull().mean() * 100
df_missing[df_missing > 0]
Out[ ]:
Lead Source                                       0.389610
TotalVisits                                       1.482684
Page Views Per Visit                              1.482684
Last Activity                                     1.114719
Country                                          26.634199
Specialization                                   15.562771
How did you hear about X Education               23.885281
What is your current occupation                  29.112554
What matters most to you in choosing a course    29.318182
Tags                                             36.287879
Lead Quality                                     51.590909
Lead Profile                                     29.318182
City                                             15.367965
Asymmetrique Activity Index                      45.649351
Asymmetrique Profile Index                       45.649351
Asymmetrique Activity Score                      45.649351
Asymmetrique Profile Score                       45.649351
dtype: float64

A large number of missing values in optional fields have been recorded as Select (most likely the untouched default of a drop-down menu). These also need to be handled as missing values.

In [ ]:
df = df.replace({'Select': np.nan})
In [ ]:
df_missing = df.isnull().mean() * 100
df_missing[df_missing > 0]
Out[ ]:
Lead Source                                       0.389610
TotalVisits                                       1.482684
Page Views Per Visit                              1.482684
Last Activity                                     1.114719
Country                                          26.634199
Specialization                                   36.580087
How did you hear about X Education               78.463203
What is your current occupation                  29.112554
What matters most to you in choosing a course    29.318182
Tags                                             36.287879
Lead Quality                                     51.590909
Lead Profile                                     74.188312
City                                             39.707792
Asymmetrique Activity Index                      45.649351
Asymmetrique Profile Index                       45.649351
Asymmetrique Activity Score                      45.649351
Asymmetrique Profile Score                       45.649351
dtype: float64

After consulting the data dictionary, the columns with more than 40% missing values don't seem important for the analysis, so I am dropping them.

In [ ]:
df.drop(df_missing[df_missing > 40].index, axis=1, inplace=True)
In [ ]:
df.shape
Out[ ]:
(9240, 30)
In [ ]:
df_missing = df.isnull().mean() * 100
df_missing[df_missing > 0]
Out[ ]:
Lead Source                                       0.389610
TotalVisits                                       1.482684
Page Views Per Visit                              1.482684
Last Activity                                     1.114719
Country                                          26.634199
Specialization                                   36.580087
What is your current occupation                  29.112554
What matters most to you in choosing a course    29.318182
Tags                                             36.287879
City                                             39.707792
dtype: float64

Removing rows with a small percentage of missing values.

In [ ]:
df = df[df['Lead Source'].notnull() & df['TotalVisits'].notnull() & df['Page Views Per Visit'].notnull() & df['Last Activity'].notnull()]
In [ ]:
df_missing = df.isnull().mean() * 100
df_missing[df_missing > 0]
Out[ ]:
Country                                          25.303064
Specialization                                   36.169275
What is your current occupation                  29.567996
What matters most to you in choosing a course    29.777386
Tags                                             36.665197
City                                             39.398281
dtype: float64

Imputing missing values.

In [ ]:
df['Country'].value_counts(normalize=True) * 100
Out[ ]:
India                   95.765713
United States            1.017999
United Arab Emirates     0.781942
Singapore                0.354087
Saudi Arabia             0.309826
United Kingdom           0.221304
Australia                0.191797
Qatar                    0.147536
Hong Kong                0.103275
Bahrain                  0.103275
Oman                     0.088522
France                   0.088522
unknown                  0.073768
South Africa             0.059014
Nigeria                  0.059014
Germany                  0.059014
Kuwait                   0.059014
Canada                   0.059014
Sweden                   0.044261
China                    0.029507
Asia/Pacific Region      0.029507
Uganda                   0.029507
Bangladesh               0.029507
Italy                    0.029507
Belgium                  0.029507
Netherlands              0.029507
Ghana                    0.029507
Philippines              0.029507
Russia                   0.014754
Switzerland              0.014754
Vietnam                  0.014754
Denmark                  0.014754
Tanzania                 0.014754
Liberia                  0.014754
Malaysia                 0.014754
Kenya                    0.014754
Sri Lanka                0.014754
Indonesia                0.014754
Name: Country, dtype: float64

The missing values are most likely India.

In [ ]:
# Imputing missing value with India
df['Country'] = df['Country'].fillna('India')
In [ ]:
df['Specialization'].value_counts(normalize=True) * 100
Out[ ]:
Finance Management                   16.557320
Human Resource Management            14.450967
Marketing Management                 14.209254
Operations Management                 8.615331
Business Administration               6.888812
IT Projects Management                6.319061
Supply Chain Management               5.973757
Banking, Investment And Insurance     5.783840
Travel and Tourism                    3.487569
Media and Advertising                 3.487569
International Business                3.038674
Healthcare Management                 2.693370
E-COMMERCE                            1.916436
Hospitality Management                1.916436
Retail Management                     1.726519
Rural and Agribusiness                1.260359
E-Business                            0.984116
Services Excellence                   0.690608
Name: Specialization, dtype: float64

Creating a new category for the missing values, as imputing with the mode would bias the distribution against the comparable categories at ranks 2 and 3.

In [ ]:
# Imputing missing value with a new category
df['Specialization'] = df['Specialization'].fillna('Others')
In [ ]:
df['What is your current occupation'].value_counts(normalize=True) * 100
Out[ ]:
Unemployed              85.682992
Working Professional    10.593021
Student                  3.223283
Other                    0.234705
Housewife                0.140823
Businessman              0.125176
Name: What is your current occupation, dtype: float64

Missing values are most likely Unemployed.

In [ ]:
# Imputing missing value with the mode
df['What is your current occupation'] = df['What is your current occupation'].fillna('Unemployed')
In [ ]:
df['What matters most to you in choosing a course'].value_counts(normalize=True) * 100
Out[ ]:
Better Career Prospects      99.968613
Flexibility & Convenience     0.015694
Other                         0.015694
Name: What matters most to you in choosing a course, dtype: float64
In [ ]:
df['What matters most to you in choosing a course'].value_counts()
Out[ ]:
Better Career Prospects      6370
Flexibility & Convenience       1
Other                           1
Name: What matters most to you in choosing a course, dtype: int64

The missing values are most likely Better Career Prospects, but imputing with the mode would add no value: the feature is already useless because the other categories are far too infrequent to compete with the mode. So, this feature can be dropped.
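
Such near-constant columns can also be flagged programmatically. A minimal sketch (the 99% threshold is an arbitrary choice):

In [ ]:
# Sketch: flag categorical columns where one value covers >99% of non-null entries
dominance = df.select_dtypes('object').apply(lambda s: s.value_counts(normalize=True).iloc[0] * 100)
dominance[dominance > 99]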

In [ ]:
df['Tags'].value_counts(normalize=True) * 100
Out[ ]:
Will revert after reading the email                  35.079172
Ringing                                              20.654254
Interested in other courses                           8.856795
Already a student                                     8.091178
Closed by Horizzon                                    5.237515
switched off                                          4.176092
Busy                                                  3.219071
Lost to EINS                                          2.992866
Not doing further education                           2.523056
Interested  in full time MBA                          2.018444
Graduation in progress                                1.931442
invalid number                                        1.444232
Diploma holder (Not Eligible)                         1.096224
wrong number given                                    0.817818
opp hangup                                            0.574213
number not provided                                   0.435010
in touch with EINS                                    0.208805
Lost to Others                                        0.121803
Still Thinking                                        0.104402
Want to take admission but has financial problems     0.104402
In confusion whether part time or DLP                 0.087002
Interested in Next batch                              0.087002
Lateral student                                       0.052201
Shall take in the next coming month                   0.034801
University not recognized                             0.034801
Recognition issue (DEC approval)                      0.017400
Name: Tags, dtype: float64

The missing values are most likely Will revert after reading the email.

In [ ]:
# Imputing missing values with the mode
df['Tags'] = df['Tags'].fillna('Will revert after reading the email')
In [ ]:
df['City'].value_counts(normalize=True) * 100
Out[ ]:
Mumbai                         57.774141
Thane & Outskirts              13.547918
Other Cities                   12.365885
Other Cities of Maharashtra     8.110566
Other Metro Cities              6.855792
Tier II Cities                  1.345699
Name: City, dtype: float64

The missing values are most likely Mumbai when the country is India; otherwise, Other Cities.

In [ ]:
df['City'] = df['City'].fillna('Missing')
df['City'] = df.apply(
    lambda x: 'Mumbai' if (x['Country'] == 'India') and (x['City'] == 'Missing')
    else ('Other Cities' if x['City'] == 'Missing' else x['City']),
    axis=1
)
In [ ]:
df['City'].value_counts(normalize=True) * 100
Out[ ]:
Mumbai                         73.749173
Thane & Outskirts               8.210271
Other Cities                    8.155169
Other Cities of Maharashtra     4.915142
Other Metro Cities              4.154728
Tier II Cities                  0.815517
Name: City, dtype: float64

Duplicate Values¶

In [ ]:
df.duplicated().sum()
Out[ ]:
0
In [ ]:
# Check for duplicate in Prospect ID
df.duplicated(subset = ['Prospect ID'], keep = False).sum()
Out[ ]:
0
In [ ]:
# Check for duplicate in Lead Number
df.duplicated(subset = ['Lead Number'], keep = False).sum()
Out[ ]:
0

Assessment:
Prospect ID and Lead Number can safely be dropped because the DataFrame index already makes them redundant.

Unnecessary Features¶

In [ ]:
# Checking features with only one category
df_nunique = df.nunique()
df_nunique[df_nunique < 2]
Out[ ]:
Magazine                                    1
Receive More Updates About Our Courses      1
Update me on Supply Chain Content           1
Get updates on DM Content                   1
I agree to pay the amount through cheque    1
dtype: int64

Based on the data dictionary & inspection, the following features are unnecessary:

  • Prospect ID (DataFrame index already uniquely identifies a customer)
  • Lead Number (DataFrame index already uniquely identifies a lead)
  • What matters most to you in choosing a course (Massive data imbalance across categories)
  • Do Not Call (Massive data imbalance across categories)
  • Search (Massive data imbalance across categories)
  • Newspaper Article (Massive data imbalance across categories)
  • X Education Forums (Massive data imbalance across categories)
  • Newspaper (Massive data imbalance across categories)
  • Digital Advertisement (Massive data imbalance across categories)
  • Through Recommendations (Massive data imbalance across categories)
  • Magazine (Only one category)
  • Receive More Updates About Our Courses (Only one category)
  • Update me on Supply Chain Content (Only one category)
  • Get updates on DM Content (Only one category)
  • I agree to pay the amount through cheque (Only one category)
  • Last Notable Activity (Redundant; Last Activity captures the same information)
In [ ]:
# Removing unnecessary features
df.drop(['Prospect ID', 'Lead Number', 'What matters most to you in choosing a course', 'Do Not Call', 'Search', 'Magazine', 'Newspaper Article', 'X Education Forums', 'Newspaper', 'Digital Advertisement', 'Through Recommendations', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content', 'Get updates on DM Content', 'I agree to pay the amount through cheque', 'Last Notable Activity'], axis=1, inplace=True)
df.shape
Out[ ]:
(9074, 14)

Univariate Analysis¶

In [ ]:
# Helper functions for Numerical Univariate Analysis
def num_uni_box_analysis(var, friendly_name=None):
  if friendly_name is None:
    friendly_name = var
  ax = sns.boxplot(data=df, x=var)
  ax.set_xticks(list(df[var].quantile([0,0.25,0.5,0.75,0.95,1])))
  ax.set_xlabel(friendly_name)
  ax.set_title(f'{friendly_name} Distribution Univariate Analysis', fontsize=20)
  ax.figure.set_size_inches(24,8)
  return ax

def num_uni_hist_analysis(var, friendly_name=None, bins=25):
  if friendly_name is None:
    friendly_name = var
  ax = sns.histplot(data=df, x=var, stat='percent', bins=bins)
  ax.set_xlabel(friendly_name)
  ax.set_title(f'{friendly_name} Distribution Histogram', fontsize=20)
  ax.figure.set_size_inches(24,8)
  return ax
In [ ]:
# Helper functions for Numerical Segmented Univariate Analysis
def num_uni_box_seg_analysis(var, friendly_name=None):
  if friendly_name is None:
    friendly_name = var
  ax = sns.boxplot(data=df, x='Converted', y=var)
  ax.legend(['No', 'Yes'])
  ax.set_ylabel(friendly_name)
  ax.set_title(f'{friendly_name} Distribution Segmented Univariate Analysis', fontsize=20)
  ax.figure.set_size_inches(24,8)
  return ax

def num_uni_hist_seg_analysis(var, friendly_name=None, bins=25):
  if friendly_name is None:
    friendly_name = var
  ax = sns.histplot(data=df, x=var, hue='Converted', stat='percent', multiple='fill', bins=bins)
  ax.legend(['No', 'Yes'])
  ax.set_xlabel(friendly_name)
  ax.set_title(f'{friendly_name} Distribution Histogram (Normalised)', fontsize=20)
  ax.figure.set_size_inches(24,8)
  return ax
In [ ]:
# Helper function for Categorical Univariate Analysis
def cat_uni_analysis(col, friendly_name=None):
  if friendly_name is None:
    friendly_name = col
  tmp_df = df[col].value_counts(normalize=True).mul(100).rename('Percent').reset_index().rename(columns={'index': col})
  ax = sns.barplot(data=tmp_df, x=col, y='Percent')
  for p in ax.patches:
    txt = str(p.get_height().round(2)) + '%'
    txt_x = p.get_x() + (p.get_width()/2)
    txt_y = p.get_height()
    ax.annotate(txt, (txt_x, txt_y), size=11, ha='center', va='bottom')
  ax.set_title(f'{friendly_name} Univariate Analysis (Normalised)', fontsize=20)
  ax.set_xlabel(friendly_name)
  ax.figure.set_size_inches(16,8)
  return ax
In [ ]:
# Helper function for Categorical Segmented Univariate Analysis
def cat_seg_analysis(col, friendly_name=None):
  if friendly_name is None:
    friendly_name = col
  tmp_df = df.groupby(col)['Converted'].value_counts(normalize=True).mul(100).rename('Percent').reset_index().replace(0,'No').replace(1,'Yes')
  ax = sns.barplot(data=tmp_df, x=col, y='Percent', hue='Converted')
  for p in ax.patches:
    txt = str(p.get_height().round(2)) + '%'
    txt_x = p.get_x() + (p.get_width()/2)
    txt_y = p.get_height()
    ax.annotate(txt, (txt_x, txt_y), size=11, ha='center', va='bottom')
  ax.set_title(f'{friendly_name} Segmented Univariate Analysis (Normalised)', fontsize=20)
  ax.set_xlabel(friendly_name)
  ax.figure.set_size_inches(16,8)
  return ax

Feature: 'Converted'¶

In [ ]:
ax = sns.countplot(y=df['Converted'])
plt.yticks([0,1], ['No', 'Yes'], rotation=45)
ax.set_title('Target Variable Data Imbalance', fontsize=20)
ax.figure.set_size_inches(10,4)
plt.show()

Assessment:

  • Converted is a categorical variable which is also the target variable for model training.
  • The class imbalance isn't in the company's favour: converted leads are the minority.
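
To quantify the imbalance, a minimal sketch on the current df:

In [ ]:
# Sketch: share of converted vs. non-converted leads
conversion_rate = df['Converted'].mean() * 100
print(f'Converted: {conversion_rate:.1f}% | Not converted: {100 - conversion_rate:.1f}%')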

Feature: 'Lead Origin'¶

In [ ]:
cat_uni_analysis('Lead Origin')
plt.show()
In [ ]:
cat_seg_analysis('Lead Origin')
plt.show()

Assessment:

  • Landing Page Submission is the most common Lead Origin.
  • Lead Add Form has a better conversion rate.

Feature: 'Lead Source'¶

In [ ]:
ax = cat_uni_analysis('Lead Source')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()
In [ ]:
# Fixing spelling issue in data
df['Lead Source'] = df['Lead Source'].replace({'google': 'Google'})
In [ ]:
ax = cat_uni_analysis('Lead Source')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()

Binning low-frequency categories into a separate category for stability.

In [ ]:
df['Lead Source'] = df['Lead Source'].replace(['Welingak Website', 'Referral Sites', 'Facebook', 'bing', 'Click2call', 'Social Media', 'Live Chat', 'Press_Release', 'Pay per Click Ads', 'blog', 'WeLearn', 'welearnblog_Home', 'youtubechannel', 'testone', 'NC_EDM'], 'Others')
In [ ]:
cat_uni_analysis('Lead Source')
plt.show()
In [ ]:
ax = cat_seg_analysis('Lead Source')
ax.figure.set_size_inches(24,12)
plt.show()

Assessment:

  • Google is the most common Lead Source.
  • The Reference Lead Source has the highest conversion rate.

Feature: 'Do Not Email'¶

In [ ]:
cat_uni_analysis('Do Not Email')
plt.show()
In [ ]:
cat_seg_analysis('Do Not Email')
plt.show()

Assessment:

  • Most of the customers haven't opted out of email (Do Not Email is No).
  • Customers who haven't opted out of email also convert better.

Feature: 'TotalVisits'¶

In [ ]:
num_uni_box_analysis('TotalVisits', 'Total Visits')
plt.show()

Removing outliers.

In [ ]:
df = df[df['TotalVisits'] < 50]
In [ ]:
num_uni_box_analysis('TotalVisits', 'Total Visits')
plt.show()
In [ ]:
num_uni_box_seg_analysis('TotalVisits', 'Total Visits')
plt.show()
In [ ]:
num_uni_hist_analysis('TotalVisits', 'Total Visits')
plt.show()
In [ ]:
num_uni_hist_seg_analysis('TotalVisits', 'Total Visits')
plt.show()

Assessment:

  • TotalVisits doesn't say anything significant about conversion; conversion is higher only for outliers, which doesn't reflect the real picture.

Feature: 'Total Time Spent on Website'¶

In [ ]:
num_uni_box_analysis('Total Time Spent on Website', 'Time Spent')
plt.show()
In [ ]:
num_uni_box_seg_analysis('Total Time Spent on Website', 'Time Spent')
plt.show()
In [ ]:
num_uni_hist_analysis('Total Time Spent on Website', 'Time Spent')
plt.show()
In [ ]:
num_uni_hist_seg_analysis('Total Time Spent on Website', 'Time Spent')
plt.show()

Assessment:

  • Most customers spend little time on the website.
  • Customers who spend more time on the website convert more often.

Feature: 'Page Views Per Visit'¶

In [ ]:
num_uni_box_analysis('Page Views Per Visit')
plt.show()
In [ ]:
num_uni_box_seg_analysis('Page Views Per Visit')
plt.show()
In [ ]:
num_uni_hist_analysis('Page Views Per Visit', bins=10)
plt.show()
In [ ]:
num_uni_hist_seg_analysis('Page Views Per Visit', bins=10)
plt.show()

Assessment:

  • Page Views Per Visit is low for most customers.
  • Page Views Per Visit doesn't say anything significant about conversion.

Feature: 'Last Activity'¶

In [ ]:
ax = cat_uni_analysis('Last Activity')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()

Binning low-frequency categories into a separate category for stability.

In [ ]:
df['Last Activity'] = df['Last Activity'].replace(['Unreachable', 'Unsubscribed', 'Had a Phone Conversation', 'View in browser link Clicked', 'Approached upfront', 'Email Received', 'Email Marked Spam', 'Visited Booth in Tradeshow', 'Resubscribed to emails'], 'Other')
In [ ]:
ax = cat_uni_analysis('Last Activity')
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()
In [ ]:
ax = cat_seg_analysis('Last Activity')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()

Assessment:

  • If Last Activity is SMS Sent, conversion is highest.

Feature: 'Country'¶

In [ ]:
df['Country'].value_counts(normalize=True) * 100
Out[ ]:
India                   96.835024
United States            0.760918
United Arab Emirates     0.584473
Singapore                0.264667
Saudi Arabia             0.231584
United Kingdom           0.165417
Australia                0.143361
Qatar                    0.110278
Hong Kong                0.077195
Bahrain                  0.077195
Oman                     0.066167
France                   0.066167
unknown                  0.055139
South Africa             0.044111
Nigeria                  0.044111
Germany                  0.044111
Kuwait                   0.044111
Canada                   0.044111
Sweden                   0.033083
China                    0.022056
Asia/Pacific Region      0.022056
Uganda                   0.022056
Bangladesh               0.022056
Italy                    0.022056
Belgium                  0.022056
Netherlands              0.022056
Ghana                    0.022056
Philippines              0.022056
Russia                   0.011028
Switzerland              0.011028
Vietnam                  0.011028
Denmark                  0.011028
Tanzania                 0.011028
Liberia                  0.011028
Malaysia                 0.011028
Kenya                    0.011028
Sri Lanka                0.011028
Indonesia                0.011028
Name: Country, dtype: float64

Binning all non-Indian categories into a separate category for stability.

In [ ]:
df['Country'] = df['Country'].apply(lambda x: 'Others' if x != 'India' else 'India')
In [ ]:
cat_uni_analysis('Country')
plt.show()
In [ ]:
cat_seg_analysis('Country')
plt.show()

Assessment:

  • Most of the customers are from India.
  • The conversion rate for Indian customers is slightly better.

Feature: 'Specialization'¶

In [ ]:
ax = cat_uni_analysis('Specialization')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()
In [ ]:
ax = cat_seg_analysis('Specialization')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()

Assessment:

  • Customers in the Banking, Investment And Insurance specialization convert the most, closely followed by those in Healthcare Management & Marketing Management.

Feature: 'What is your current occupation'¶

Renaming the feature to Occupation.

In [ ]:
df.rename(columns={'What is your current occupation': 'Occupation'}, inplace=True)
In [ ]:
cat_uni_analysis('Occupation')
plt.show()
In [ ]:
cat_seg_analysis('Occupation')
plt.show()

Assessment:

  • Most of the customers are unemployed.
  • Working professionals convert the most. Housewives are too few in number to draw any conclusion.

Feature: 'Tags'¶

In [ ]:
ax = cat_uni_analysis('Tags')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()
In [ ]:
ax = cat_seg_analysis('Tags')
ax.figure.set_size_inches(24,12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)
plt.show()

Feature: 'City'¶

In [ ]:
df['City'].value_counts(normalize=True) * 100
Out[ ]:
Mumbai                         73.753860
Thane & Outskirts               8.204676
Other Cities                    8.149537
Other Cities of Maharashtra     4.918394
Other Metro Cities              4.157477
Tier II Cities                  0.816056
Name: City, dtype: float64
In [ ]:
df[(df['Country'] != 'India') & (df['City'] == 'Mumbai')][['Country', 'City']].head()
Out[ ]:
Country City
87 Others Mumbai
103 Others Mumbai
243 Others Mumbai
425 Others Mumbai
445 Others Mumbai

There are issues with the data: a lead whose country is Others shouldn't have Mumbai (or another Maharashtra city) as its city. Fixing.

In [ ]:
df['City'] = df.apply(
    lambda x: 'Other Cities' if (x['Country'] == 'Others') and (x['City'] in ['Mumbai', 'Thane & Outskirts', 'Other Cities of Maharashtra'])
    else x['City'],
    axis=1
)
In [ ]:
cat_uni_analysis('City')
plt.show()
In [ ]:
cat_seg_analysis('City')
plt.show()

Assessment:

  • Most of the customers are from Mumbai.
  • Customers from Thane & Outskirts and Other Cities of Maharashtra convert the most.

Feature: 'A free copy of Mastering The Interview'¶

Renaming the feature to Gift.

In [ ]:
df.rename(columns={'A free copy of Mastering The Interview': 'Gift'}, inplace=True)
In [ ]:
cat_uni_analysis('Gift')
plt.show()
In [ ]:
cat_seg_analysis('Gift')
plt.show()

Assessment:

  • Most customers don't want a free copy of 'Mastering The Interview'.
  • This choice doesn't significantly affect the conversion rate.

Bivariate Analysis¶

In [ ]:
g = sns.pairplot(df)
g.figure.set_size_inches(24,20)

Correlation¶

In [ ]:
# Heatmap
ax = sns.heatmap(df.corr(), cmap='Blues', annot=True)
ax.figure.set_size_inches(12,10)
plt.show()

Assessment:

  • Converted is moderately correlated with Total Time Spent on Website.
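
The exact figures behind the heatmap can be listed directly. A minimal sketch (on pandas >= 1.5, corr() may need numeric_only=True):

In [ ]:
# Sketch: correlation of each numeric feature with the target, strongest first
df.corr()['Converted'].drop('Converted').sort_values(ascending=False)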

Data Preparation¶

Dummy Variables Creation¶

In [ ]:
cat_vars = ['Lead Origin', 'Lead Source', 'Do Not Email','Last Activity', 'Country', 'Specialization', 'Occupation', 'Tags', 'City', 'Gift']
cat_dum = pd.get_dummies(df[cat_vars], drop_first=True)
df.drop(columns=cat_vars, inplace=True)
df = pd.concat([df, cat_dum], axis=1)
df.columns
Out[ ]:
Index(['Converted', 'TotalVisits', 'Total Time Spent on Website',
       'Page Views Per Visit', 'Lead Origin_Landing Page Submission',
       'Lead Origin_Lead Add Form', 'Lead Origin_Lead Import',
       'Lead Source_Google', 'Lead Source_Olark Chat',
       'Lead Source_Organic Search', 'Lead Source_Others',
       'Lead Source_Reference', 'Do Not Email_Yes',
       'Last Activity_Email Bounced', 'Last Activity_Email Link Clicked',
       'Last Activity_Email Opened', 'Last Activity_Form Submitted on Website',
       'Last Activity_Olark Chat Conversation', 'Last Activity_Other',
       'Last Activity_Page Visited on Website', 'Last Activity_SMS Sent',
       'Country_Others', 'Specialization_Business Administration',
       'Specialization_E-Business', 'Specialization_E-COMMERCE',
       'Specialization_Finance Management',
       'Specialization_Healthcare Management',
       'Specialization_Hospitality Management',
       'Specialization_Human Resource Management',
       'Specialization_IT Projects Management',
       'Specialization_International Business',
       'Specialization_Marketing Management',
       'Specialization_Media and Advertising',
       'Specialization_Operations Management', 'Specialization_Others',
       'Specialization_Retail Management',
       'Specialization_Rural and Agribusiness',
       'Specialization_Services Excellence',
       'Specialization_Supply Chain Management',
       'Specialization_Travel and Tourism', 'Occupation_Housewife',
       'Occupation_Other', 'Occupation_Student', 'Occupation_Unemployed',
       'Occupation_Working Professional', 'Tags_Busy',
       'Tags_Closed by Horizzon', 'Tags_Diploma holder (Not Eligible)',
       'Tags_Graduation in progress',
       'Tags_In confusion whether part time or DLP',
       'Tags_Interested  in full time MBA', 'Tags_Interested in Next batch',
       'Tags_Interested in other courses', 'Tags_Lateral student',
       'Tags_Lost to EINS', 'Tags_Lost to Others',
       'Tags_Not doing further education',
       'Tags_Recognition issue (DEC approval)', 'Tags_Ringing',
       'Tags_Shall take in the next coming month', 'Tags_Still Thinking',
       'Tags_University not recognized',
       'Tags_Want to take admission but has financial problems',
       'Tags_Will revert after reading the email', 'Tags_in touch with EINS',
       'Tags_invalid number', 'Tags_number not provided', 'Tags_opp hangup',
       'Tags_switched off', 'Tags_wrong number given', 'City_Other Cities',
       'City_Other Cities of Maharashtra', 'City_Other Metro Cities',
       'City_Thane & Outskirts', 'City_Tier II Cities', 'Gift_Yes'],
      dtype='object')
In [ ]:
df.shape
Out[ ]:
(9068, 76)

Train-Test Split¶

In [ ]:
df_train, df_test = train_test_split(df, train_size=0.7, random_state=100)
In [ ]:
df_train.shape
Out[ ]:
(6347, 76)
In [ ]:
df_test.shape
Out[ ]:
(2721, 76)

Feature Scaling on Training Data¶

In [ ]:
# Learn Min & Max values
num_vars = ['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit']
scaler = MinMaxScaler()
scaler.fit(df_train[num_vars])
Out[ ]:
MinMaxScaler()
In [ ]:
# Scale
df_train[num_vars] = scaler.transform(df_train[num_vars])
df_train.head()
Out[ ]:
Converted TotalVisits Total Time Spent on Website Page Views Per Visit Lead Origin_Landing Page Submission Lead Origin_Lead Add Form Lead Origin_Lead Import Lead Source_Google Lead Source_Olark Chat Lead Source_Organic Search ... Tags_number not provided Tags_opp hangup Tags_switched off Tags_wrong number given City_Other Cities City_Other Cities of Maharashtra City_Other Metro Cities City_Thane & Outskirts City_Tier II Cities Gift_Yes
3050 0 0.186047 0.025528 0.111250 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
6708 1 0.000000 0.000000 0.000000 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
1248 0 0.046512 0.658891 0.083333 1 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 1
1429 0 0.046512 0.566901 0.083333 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
2178 0 0.116279 0.041373 0.208333 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 76 columns
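
As a sanity check, the scaled numeric features should now lie in the [0, 1] range on the training set. A minimal sketch:

In [ ]:
# Sketch: verify MinMax scaling mapped the numeric features into [0, 1]
df_train[num_vars].describe().loc[['min', 'max']]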

Model Training¶

In [ ]:
# Creating function to easily calculate VIF
def calculate_vif(x_df):
  vif_df = pd.DataFrame({'Feature': x_df.columns, 'VIF': [ variance_inflation_factor(x_df.values, i) for i in range(x_df.shape[1])]})
  vif_df['VIF'] = vif_df['VIF'].round(2)
  vif_df = vif_df.sort_values(by='VIF', ascending=False)
  return vif_df
In [ ]:
# Creating function to easily train models
def train_model(y_df, x_df):
  # Preparing for intercept
  x_df_sm = sm.add_constant(x_df)
  # Training model
  lr = sm.GLM(y_df, x_df_sm, family=sm.families.Binomial())
  lr_model = lr.fit()
  return lr_model
In [ ]:
# Extract Target Variable
y_train = df_train.pop('Converted')
X_train = df_train
In [ ]:
# Creating function to easily select features
def auto_select_features(n, x_df=X_train):
  logreg = LogisticRegression(max_iter=1000)
  selector = RFE(logreg, n_features_to_select=n)
  selector = selector.fit(x_df, y_train)
  top_features_df = pd.DataFrame({'Feature': x_df.columns, 'Selected': selector.support_, 'Rank': selector.ranking_})
  selected_vars = top_features_df[top_features_df['Selected'] == True]['Feature'].to_list()
  return selected_vars
In [ ]:
# Creating function to easily evaluate performance of model over multiple probability cutoffs
def model_performance(model, x_df, y_df=y_train):
  prob = [float(x)/10 for x in range(10)]

  y_final = y_df.to_frame()
  x_df_sm = sm.add_constant(x_df)
  y_final['Conversion_Prob'] = model.predict(x_df_sm)

  for i in prob:
    y_final[i] = y_final['Conversion_Prob'].map(lambda x: 1 if x > i else 0)

  performance_df = pd.DataFrame(columns=['probability_cutoff', 'accuracy', 'sensitivity', 'specificity'])
  for i in prob:
    cm = confusion_matrix(y_final['Converted'], y_final[i])
    total = cm.sum()
    accuracy = (cm[0,0] + cm[1,1]) / total
    sensitivity = cm[1,1] / (cm[1,0] + cm[1,1])
    specificity = cm[0,0] / (cm[0,0] + cm[0,1])
    performance_df.loc[i] = [i, accuracy, sensitivity, specificity]
  return performance_df
In [ ]:
# Creating function to draw ROC curve
def roc(model, x_df, y_df=y_train):
    y_final = y_df.to_frame()
    x_df_sm = sm.add_constant(x_df)
    y_final['Conversion_Prob'] = model.predict(x_df_sm)
    fpr, tpr, threshold = roc_curve( y_final['Converted'], y_final['Conversion_Prob'], drop_intermediate = False )
    auc_score = roc_auc_score( y_final['Converted'], y_final['Conversion_Prob'])
    plt.figure(figsize=(8, 8))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()
    return None

Model 1¶

In [ ]:
# Auto selecting top 15 features
X_train_1 = X_train[auto_select_features(15)]
In [ ]:
# Training Logistic Regression Model
train_model(y_train, X_train_1).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6331
Model Family: Binomial Df Model: 15
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -1774.0
Date: Tue, 18 Oct 2022 Deviance: 3548.0
Time: 23:39:54 Pearson chi2: 8.77e+03
No. Iterations: 20 Pseudo R-squ. (CS): 0.5377
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -2.7369 0.280 -9.774 0.000 -3.286 -2.188
Total Time Spent on Website 4.4618 0.201 22.239 0.000 4.069 4.855
Lead Origin_Landing Page Submission -1.7699 0.157 -11.247 0.000 -2.078 -1.461
Lead Origin_Lead Add Form 2.9057 0.279 10.423 0.000 2.359 3.452
Do Not Email_Yes -1.3891 0.211 -6.594 0.000 -1.802 -0.976
Last Activity_Email Opened 1.0821 0.118 9.140 0.000 0.850 1.314
Last Activity_Other 1.8893 0.294 6.417 0.000 1.312 2.466
Last Activity_SMS Sent 2.6507 0.126 21.100 0.000 2.404 2.897
Specialization_Others -1.6015 0.157 -10.180 0.000 -1.910 -1.293
Occupation_Unemployed -2.3341 0.205 -11.360 0.000 -2.737 -1.931
Tags_Busy 3.7653 0.269 14.013 0.000 3.239 4.292
Tags_Closed by Horizzon 8.8449 0.753 11.744 0.000 7.369 10.321
Tags_Interested in Next batch 24.0491 1.4e+04 0.002 0.999 -2.75e+04 2.75e+04
Tags_Lateral student 25.0831 2.01e+04 0.001 0.999 -3.93e+04 3.93e+04
Tags_Lost to EINS 8.7471 0.748 11.702 0.000 7.282 10.212
Tags_Will revert after reading the email 4.1667 0.163 25.488 0.000 3.846 4.487
In [ ]:
calculate_vif(X_train_1)
Out[ ]:
Feature VIF
8 Occupation_Unemployed 7.12
1 Lead Origin_Landing Page Submission 4.67
7 Specialization_Others 3.27
14 Tags_Will revert after reading the email 2.70
4 Last Activity_Email Opened 2.28
6 Last Activity_SMS Sent 2.15
0 Total Time Spent on Website 2.11
2 Lead Origin_Lead Add Form 1.40
10 Tags_Closed by Horizzon 1.24
3 Do Not Email_Yes 1.20
5 Last Activity_Other 1.08
9 Tags_Busy 1.08
13 Tags_Lost to EINS 1.08
11 Tags_Interested in Next batch 1.01
12 Tags_Lateral student 1.00
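
The enormous coefficient with a ~1.4e+04 standard error on Tags_Interested in Next batch is the classic signature of quasi-complete separation: the dummy is rare and (almost) perfectly predicts one outcome, which makes its p-value meaningless. A quick check before dropping it, as a sketch:

In [ ]:
# Sketch: cross-tabulate the rare dummy against the target to confirm separation
pd.crosstab(X_train_1['Tags_Interested in Next batch'], y_train)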
In [ ]:
# Dropping 'Tags_Interested in Next batch' because it has high p-value
X_train_1 = X_train_1.drop('Tags_Interested in Next batch', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_1).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6332
Model Family: Binomial Df Model: 14
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -1779.0
Date: Tue, 18 Oct 2022 Deviance: 3558.1
Time: 23:39:55 Pearson chi2: 8.65e+03
No. Iterations: 19 Pseudo R-squ. (CS): 0.5369
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -2.7012 0.278 -9.717 0.000 -3.246 -2.156
Total Time Spent on Website 4.4642 0.200 22.292 0.000 4.072 4.857
Lead Origin_Landing Page Submission -1.7867 0.157 -11.386 0.000 -2.094 -1.479
Lead Origin_Lead Add Form 2.9129 0.278 10.494 0.000 2.369 3.457
Do Not Email_Yes -1.3497 0.210 -6.439 0.000 -1.761 -0.939
Last Activity_Email Opened 1.0851 0.118 9.169 0.000 0.853 1.317
Last Activity_Other 1.8830 0.294 6.407 0.000 1.307 2.459
Last Activity_SMS Sent 2.6588 0.126 21.179 0.000 2.413 2.905
Specialization_Others -1.6112 0.157 -10.271 0.000 -1.919 -1.304
Occupation_Unemployed -2.3245 0.205 -11.356 0.000 -2.726 -1.923
Tags_Busy 3.7274 0.267 13.947 0.000 3.204 4.251
Tags_Closed by Horizzon 8.8074 0.752 11.710 0.000 7.333 10.282
Tags_Lateral student 24.0391 1.22e+04 0.002 0.998 -2.38e+04 2.39e+04
Tags_Lost to EINS 8.7126 0.747 11.663 0.000 7.248 10.177
Tags_Will revert after reading the email 4.1284 0.161 25.634 0.000 3.813 4.444
In [ ]:
calculate_vif(X_train_1)
Out[ ]:
Feature VIF
8 Occupation_Unemployed 7.11
1 Lead Origin_Landing Page Submission 4.67
7 Specialization_Others 3.27
13 Tags_Will revert after reading the email 2.70
4 Last Activity_Email Opened 2.28
6 Last Activity_SMS Sent 2.15
0 Total Time Spent on Website 2.11
2 Lead Origin_Lead Add Form 1.40
10 Tags_Closed by Horizzon 1.24
3 Do Not Email_Yes 1.20
5 Last Activity_Other 1.08
9 Tags_Busy 1.08
12 Tags_Lost to EINS 1.08
11 Tags_Lateral student 1.00
In [ ]:
# Dropping 'Tags_Lateral student' because it has high p-value
X_train_1 = X_train_1.drop('Tags_Lateral student', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_1).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6333
Model Family: Binomial Df Model: 13
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -1785.8
Date: Tue, 18 Oct 2022 Deviance: 3571.5
Time: 23:39:55 Pearson chi2: 8.55e+03
No. Iterations: 8 Pseudo R-squ. (CS): 0.5359
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -2.6712 0.276 -9.671 0.000 -3.213 -2.130
Total Time Spent on Website 4.4539 0.200 22.296 0.000 4.062 4.845
Lead Origin_Landing Page Submission -1.7855 0.157 -11.408 0.000 -2.092 -1.479
Lead Origin_Lead Add Form 2.8958 0.276 10.481 0.000 2.354 3.437
Do Not Email_Yes -1.3508 0.209 -6.451 0.000 -1.761 -0.940
Last Activity_Email Opened 1.0856 0.118 9.188 0.000 0.854 1.317
Last Activity_Other 1.8852 0.293 6.426 0.000 1.310 2.460
Last Activity_SMS Sent 2.6661 0.125 21.263 0.000 2.420 2.912
Specialization_Others -1.5997 0.156 -10.226 0.000 -1.906 -1.293
Occupation_Unemployed -2.3165 0.204 -11.373 0.000 -2.716 -1.917
Tags_Busy 3.6843 0.266 13.855 0.000 3.163 4.205
Tags_Closed by Horizzon 8.7679 0.752 11.666 0.000 7.295 10.241
Tags_Lost to EINS 8.6723 0.747 11.617 0.000 7.209 10.135
Tags_Will revert after reading the email 4.0855 0.159 25.721 0.000 3.774 4.397
In [ ]:
calculate_vif(X_train_1)
Out[ ]:
Feature VIF
8 Occupation_Unemployed 7.11
1 Lead Origin_Landing Page Submission 4.67
7 Specialization_Others 3.27
12 Tags_Will revert after reading the email 2.70
4 Last Activity_Email Opened 2.28
6 Last Activity_SMS Sent 2.15
0 Total Time Spent on Website 2.11
2 Lead Origin_Lead Add Form 1.40
10 Tags_Closed by Horizzon 1.24
3 Do Not Email_Yes 1.20
5 Last Activity_Other 1.08
9 Tags_Busy 1.08
11 Tags_Lost to EINS 1.08
In [ ]:
# Dropping 'Occupation_Unemployed' because it has high VIF
X_train_1 = X_train_1.drop('Occupation_Unemployed', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_1).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6334
Model Family: Binomial Df Model: 12
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -1868.5
Date: Tue, 18 Oct 2022 Deviance: 3737.0
Time: 23:39:55 Pearson chi2: 8.18e+03
No. Iterations: 8 Pseudo R-squ. (CS): 0.5237
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -4.6581 0.219 -21.226 0.000 -5.088 -4.228
Total Time Spent on Website 4.4964 0.196 22.917 0.000 4.112 4.881
Lead Origin_Landing Page Submission -1.8921 0.152 -12.426 0.000 -2.190 -1.594
Lead Origin_Lead Add Form 2.9549 0.273 10.835 0.000 2.420 3.489
Do Not Email_Yes -1.3068 0.202 -6.466 0.000 -1.703 -0.911
Last Activity_Email Opened 1.0439 0.115 9.081 0.000 0.819 1.269
Last Activity_Other 1.8941 0.284 6.680 0.000 1.338 2.450
Last Activity_SMS Sent 2.6196 0.122 21.489 0.000 2.381 2.859
Specialization_Others -1.8732 0.153 -12.240 0.000 -2.173 -1.573
Tags_Busy 3.5357 0.265 13.363 0.000 3.017 4.054
Tags_Closed by Horizzon 8.8407 0.755 11.711 0.000 7.361 10.320
Tags_Lost to EINS 8.5703 0.744 11.519 0.000 7.112 10.029
Tags_Will revert after reading the email 4.0998 0.157 26.139 0.000 3.792 4.407
In [ ]:
calculate_vif(X_train_1)
Out[ ]:
Feature VIF
1 Lead Origin_Landing Page Submission 2.91
11 Tags_Will revert after reading the email 2.64
4 Last Activity_Email Opened 2.16
0 Total Time Spent on Website 2.10
6 Last Activity_SMS Sent 2.10
7 Specialization_Others 1.78
2 Lead Origin_Lead Add Form 1.39
9 Tags_Closed by Horizzon 1.23
3 Do Not Email_Yes 1.19
5 Last Activity_Other 1.08
10 Tags_Lost to EINS 1.08
8 Tags_Busy 1.07
In [ ]:
model_1 = train_model(y_train, X_train_1)
In [ ]:
roc(model_1, X_train_1)

The area under the ROC curve is 0.94, which is good.

In [ ]:
train_perf_1 = model_performance(model_1, X_train_1)
train_perf_1
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.0 0.0 0.382385 1.000000 0.000000
0.1 0.1 0.772176 0.976926 0.645408
0.2 0.2 0.824642 0.932015 0.758163
0.3 0.3 0.861982 0.899876 0.838520
0.4 0.4 0.881991 0.878863 0.883929
0.5 0.5 0.884512 0.853317 0.903827
0.6 0.6 0.868442 0.764730 0.932653
0.7 0.7 0.847802 0.673671 0.955612
0.8 0.8 0.829053 0.592913 0.975255
0.9 0.9 0.781944 0.451586 0.986480
In [ ]:
train_perf_1.plot(x='probability_cutoff', y=['accuracy','sensitivity','specificity'], figsize=(10,8))
plt.show()

The optimal cutoff is 0.4, where the sensitivity and specificity curves intersect.
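
The intersection can also be located programmatically. A minimal sketch over the coarse 0.1-step grid produced by model_performance:

In [ ]:
# Sketch: cutoff where sensitivity and specificity are closest on the evaluated grid
(train_perf_1['sensitivity'] - train_perf_1['specificity']).abs().idxmin()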

In [ ]:
train_cutoff_1 = 0.4
train_perf_1.loc[[train_cutoff_1]]
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.4 0.4 0.881991 0.878863 0.883929

Assessment:
Although Model 1 looks good statistically, the features with the highest coefficients suggest that the specific tags assigned by the sales team have introduced bias.
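
The dominance of the Tags_ dummies can be verified by ranking the fitted coefficients. A minimal sketch:

In [ ]:
# Sketch: largest coefficients in Model 1; Tags_ features top the list
model_1.params.drop('const').sort_values(ascending=False).head(6)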

Model 2¶

In [ ]:
# Removing Tags features from the training set, as they were assigned by the sales team themselves
cols_without_tags = [col for col in X_train.columns if not col.startswith('Tags_')]
In [ ]:
X_train_2 = X_train[cols_without_tags]
In [ ]:
# Auto selecting top 15 features
X_train_2 = X_train_2[auto_select_features(15, X_train_2)]
In [ ]:
# Training Logistic Regression Model
train_model(y_train, X_train_2).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6331
Model Family: Binomial Df Model: 15
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2641.7
Date: Tue, 18 Oct 2022 Deviance: 5283.3
Time: 23:39:58 Pearson chi2: 6.94e+03
No. Iterations: 21 Pseudo R-squ. (CS): 0.3923
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.8923 0.168 -11.242 0.000 -2.222 -1.562
TotalVisits 2.9651 0.534 5.551 0.000 1.918 4.012
Total Time Spent on Website 4.5983 0.167 27.479 0.000 4.270 4.926
Page Views Per Visit -2.2071 0.577 -3.824 0.000 -3.338 -1.076
Lead Origin_Landing Page Submission -1.1792 0.127 -9.270 0.000 -1.429 -0.930
Lead Origin_Lead Add Form 5.3273 0.612 8.704 0.000 4.128 6.527
Lead Source_Olark Chat 1.0519 0.132 7.971 0.000 0.793 1.311
Lead Source_Reference -2.0535 0.644 -3.190 0.001 -3.315 -0.792
Do Not Email_Yes -1.3867 0.177 -7.839 0.000 -1.733 -1.040
Last Activity_Email Opened 0.8011 0.109 7.335 0.000 0.587 1.015
Last Activity_Olark Chat Conversation -0.8012 0.187 -4.283 0.000 -1.168 -0.435
Last Activity_Other 1.4134 0.232 6.090 0.000 0.958 1.868
Last Activity_SMS Sent 1.9679 0.113 17.489 0.000 1.747 2.188
Specialization_Others -1.2345 0.124 -9.965 0.000 -1.477 -0.992
Occupation_Housewife 22.6370 1.58e+04 0.001 0.999 -3.09e+04 3.1e+04
Occupation_Working Professional 2.6399 0.195 13.526 0.000 2.257 3.022
In [ ]:
calculate_vif(X_train_2)
Out[ ]:
Feature VIF
2 Page Views Per Visit 5.03
4 Lead Origin_Lead Add Form 4.74
6 Lead Source_Reference 4.49
3 Lead Origin_Landing Page Submission 3.84
0 TotalVisits 3.46
8 Last Activity_Email Opened 2.79
12 Specialization_Others 2.68
11 Last Activity_SMS Sent 2.49
5 Lead Source_Olark Chat 2.28
1 Total Time Spent on Website 2.17
9 Last Activity_Olark Chat Conversation 1.82
7 Do Not Email_Yes 1.21
14 Occupation_Working Professional 1.20
10 Last Activity_Other 1.11
13 Occupation_Housewife 1.00
In [ ]:
# Dropping 'Occupation_Housewife' because it has high p-value
X_train_2 = X_train_2.drop('Occupation_Housewife', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_2).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6332
Model Family: Binomial Df Model: 14
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2646.5
Date: Tue, 18 Oct 2022 Deviance: 5293.0
Time: 23:39:59 Pearson chi2: 6.95e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3913
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.8817 0.168 -11.198 0.000 -2.211 -1.552
TotalVisits 2.9359 0.534 5.497 0.000 1.889 3.983
Total Time Spent on Website 4.6025 0.167 27.521 0.000 4.275 4.930
Page Views Per Visit -2.2051 0.577 -3.822 0.000 -3.336 -1.074
Lead Origin_Landing Page Submission -1.1770 0.127 -9.262 0.000 -1.426 -0.928
Lead Origin_Lead Add Form 5.3276 0.612 8.705 0.000 4.128 6.527
Lead Source_Olark Chat 1.0509 0.132 7.965 0.000 0.792 1.309
Lead Source_Reference -2.0417 0.644 -3.172 0.002 -3.303 -0.780
Do Not Email_Yes -1.3908 0.177 -7.864 0.000 -1.737 -1.044
Last Activity_Email Opened 0.7970 0.109 7.317 0.000 0.583 1.010
Last Activity_Olark Chat Conversation -0.8076 0.187 -4.320 0.000 -1.174 -0.441
Last Activity_Other 1.4055 0.232 6.058 0.000 0.951 1.860
Last Activity_SMS Sent 1.9586 0.112 17.446 0.000 1.739 2.179
Specialization_Others -1.2386 0.124 -10.004 0.000 -1.481 -0.996
Occupation_Working Professional 2.6344 0.195 13.501 0.000 2.252 3.017
In [ ]:
calculate_vif(X_train_2)
Out[ ]:
Feature VIF
2 Page Views Per Visit 5.03
4 Lead Origin_Lead Add Form 4.74
6 Lead Source_Reference 4.49
3 Lead Origin_Landing Page Submission 3.84
0 TotalVisits 3.46
8 Last Activity_Email Opened 2.79
12 Specialization_Others 2.68
11 Last Activity_SMS Sent 2.49
5 Lead Source_Olark Chat 2.28
1 Total Time Spent on Website 2.17
9 Last Activity_Olark Chat Conversation 1.82
7 Do Not Email_Yes 1.21
13 Occupation_Working Professional 1.20
10 Last Activity_Other 1.11
In [ ]:
# Dropping 'Page Views Per Visit' because it has high VIF
X_train_2 = X_train_2.drop('Page Views Per Visit', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_2).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6333
Model Family: Binomial Df Model: 13
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2653.9
Date: Tue, 18 Oct 2022 Deviance: 5307.7
Time: 23:39:59 Pearson chi2: 6.96e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3899
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -2.0010 0.165 -12.096 0.000 -2.325 -1.677
TotalVisits 1.9142 0.475 4.026 0.000 0.982 2.846
Total Time Spent on Website 4.5900 0.167 27.491 0.000 4.263 4.917
Lead Origin_Landing Page Submission -1.2094 0.127 -9.542 0.000 -1.458 -0.961
Lead Origin_Lead Add Form 5.4650 0.610 8.953 0.000 4.269 6.661
Lead Source_Olark Chat 1.1880 0.127 9.330 0.000 0.938 1.438
Lead Source_Reference -2.0300 0.643 -3.157 0.002 -3.290 -0.770
Do Not Email_Yes -1.3887 0.177 -7.862 0.000 -1.735 -1.043
Last Activity_Email Opened 0.7528 0.108 6.969 0.000 0.541 0.964
Last Activity_Olark Chat Conversation -0.8423 0.187 -4.514 0.000 -1.208 -0.477
Last Activity_Other 1.3476 0.231 5.837 0.000 0.895 1.800
Last Activity_SMS Sent 1.8992 0.111 17.149 0.000 1.682 2.116
Specialization_Others -1.2142 0.124 -9.831 0.000 -1.456 -0.972
Occupation_Working Professional 2.6275 0.195 13.463 0.000 2.245 3.010
In [ ]:
calculate_vif(X_train_2)
Out[ ]:
Feature VIF
3 Lead Origin_Lead Add Form 4.70
5 Lead Source_Reference 4.49
2 Lead Origin_Landing Page Submission 3.48
7 Last Activity_Email Opened 2.67
11 Specialization_Others 2.64
10 Last Activity_SMS Sent 2.38
0 TotalVisits 2.36
4 Lead Source_Olark Chat 2.17
1 Total Time Spent on Website 2.16
8 Last Activity_Olark Chat Conversation 1.79
6 Do Not Email_Yes 1.21
12 Occupation_Working Professional 1.20
9 Last Activity_Other 1.10
In [ ]:
model_2 = train_model(y_train, X_train_2)
In [ ]:
roc(model_2, X_train_2)

The area under the ROC curve is 0.88, which is good.

In [ ]:
train_perf_2 = model_performance(model_2, X_train_2)
train_perf_2
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.0 0.0 0.382385 1.000000 0.000000
0.1 0.1 0.598550 0.973218 0.366582
0.2 0.2 0.728061 0.923774 0.606888
0.3 0.3 0.793288 0.837248 0.766071
0.4 0.4 0.814873 0.771735 0.841582
0.5 0.5 0.806838 0.663370 0.895663
0.6 0.6 0.795336 0.592501 0.920918
0.7 0.7 0.779581 0.512979 0.944643
0.8 0.8 0.754530 0.409971 0.967857
0.9 0.9 0.708366 0.251339 0.991327
In [ ]:
train_perf_2.plot(x='probability_cutoff', y=['accuracy','sensitivity','specificity'], figsize=(10,8))
plt.show()

The optimal cutoff is 0.4, where the sensitivity and specificity curves intersect.

In [ ]:
train_cutoff_2 = 0.4
train_perf_2.loc[[train_cutoff_2]]
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.4 0.4 0.814873 0.771735 0.841582

Model 3¶

In [ ]:
# Continuing without tags
X_train_3 = X_train[cols_without_tags]
In [ ]:
# Auto selecting top 20 features
X_train_3 = X_train_3[auto_select_features(20, X_train_3)]
In [ ]:
# Training Logistic Regression Model
train_model(y_train, X_train_3).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6326
Model Family: Binomial Df Model: 20
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2637.0
Date: Tue, 18 Oct 2022 Deviance: 5274.1
Time: 23:40:02 Pearson chi2: 6.92e+03
No. Iterations: 21 Pseudo R-squ. (CS): 0.3932
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.4493 1.203 -1.205 0.228 -3.807 0.909
TotalVisits 3.0139 0.536 5.626 0.000 1.964 4.064
Total Time Spent on Website 4.6278 0.169 27.455 0.000 4.297 4.958
Page Views Per Visit -2.1617 0.579 -3.736 0.000 -3.296 -1.028
Lead Origin_Landing Page Submission -1.1521 0.129 -8.961 0.000 -1.404 -0.900
Lead Origin_Lead Add Form 5.3637 0.613 8.757 0.000 4.163 6.564
Lead Origin_Lead Import 0.7370 0.488 1.509 0.131 -0.220 1.694
Lead Source_Olark Chat 1.0792 0.134 8.070 0.000 0.817 1.341
Lead Source_Reference -2.0507 0.644 -3.183 0.001 -3.313 -0.788
Do Not Email_Yes -1.3867 0.177 -7.842 0.000 -1.733 -1.040
Last Activity_Email Opened 0.7935 0.109 7.248 0.000 0.579 1.008
Last Activity_Olark Chat Conversation -0.8031 0.187 -4.290 0.000 -1.170 -0.436
Last Activity_Other 1.4267 0.233 6.135 0.000 0.971 1.882
Last Activity_SMS Sent 1.9776 0.113 17.520 0.000 1.756 2.199
Specialization_Hospitality Management -0.4894 0.307 -1.595 0.111 -1.091 0.112
Specialization_Others -1.2208 0.125 -9.806 0.000 -1.465 -0.977
Occupation_Housewife 22.1539 1.58e+04 0.001 0.999 -3.1e+04 3.1e+04
Occupation_Other -0.1289 1.417 -0.091 0.927 -2.905 2.647
Occupation_Student -0.0124 1.217 -0.010 0.992 -2.397 2.373
Occupation_Unemployed -0.4960 1.196 -0.415 0.678 -2.840 1.848
Occupation_Working Professional 2.1680 1.211 1.791 0.073 -0.205 4.541
In [ ]:
calculate_vif(X_train_3)
Out[ ]:
Feature VIF
18 Occupation_Unemployed 20.39
3 Lead Origin_Landing Page Submission 7.38
2 Page Views Per Visit 5.26
4 Lead Origin_Lead Add Form 4.90
14 Specialization_Others 4.63
7 Lead Source_Reference 4.59
0 TotalVisits 3.56
9 Last Activity_Email Opened 3.37
12 Last Activity_SMS Sent 2.83
6 Lead Source_Olark Chat 2.81
19 Occupation_Working Professional 2.51
1 Total Time Spent on Website 2.36
10 Last Activity_Olark Chat Conversation 1.94
17 Occupation_Student 1.53
8 Do Not Email_Yes 1.25
11 Last Activity_Other 1.13
5 Lead Origin_Lead Import 1.04
16 Occupation_Other 1.03
13 Specialization_Hospitality Management 1.02
15 Occupation_Housewife 1.02
In [ ]:
# Dropping 'Occupation_Unemployed' because it has both high p-value & high VIF
X_train_3 = X_train_3.drop('Occupation_Unemployed', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_3).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6327
Model Family: Binomial Df Model: 19
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2637.1
Date: Tue, 18 Oct 2022 Deviance: 5274.2
Time: 23:40:02 Pearson chi2: 6.92e+03
No. Iterations: 21 Pseudo R-squ. (CS): 0.3931
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.9435 0.171 -11.344 0.000 -2.279 -1.608
TotalVisits 3.0180 0.536 5.634 0.000 1.968 4.068
Total Time Spent on Website 4.6290 0.169 27.468 0.000 4.299 4.959
Page Views Per Visit -2.1609 0.578 -3.736 0.000 -3.295 -1.027
Lead Origin_Landing Page Submission -1.1539 0.129 -8.978 0.000 -1.406 -0.902
Lead Origin_Lead Add Form 5.3638 0.613 8.757 0.000 4.163 6.564
Lead Origin_Lead Import 0.7362 0.488 1.508 0.132 -0.221 1.693
Lead Source_Olark Chat 1.0790 0.134 8.069 0.000 0.817 1.341
Lead Source_Reference -2.0495 0.644 -3.181 0.001 -3.312 -0.787
Do Not Email_Yes -1.3877 0.177 -7.849 0.000 -1.734 -1.041
Last Activity_Email Opened 0.7928 0.109 7.243 0.000 0.578 1.007
Last Activity_Olark Chat Conversation -0.8042 0.187 -4.296 0.000 -1.171 -0.437
Last Activity_Other 1.4287 0.232 6.151 0.000 0.973 1.884
Last Activity_SMS Sent 1.9766 0.113 17.515 0.000 1.755 2.198
Specialization_Hospitality Management -0.4901 0.307 -1.598 0.110 -1.091 0.111
Specialization_Others -1.2219 0.125 -9.812 0.000 -1.466 -0.978
Occupation_Housewife 22.6490 1.58e+04 0.001 0.999 -3.09e+04 3.1e+04
Occupation_Other 0.3659 0.764 0.479 0.632 -1.131 1.863
Occupation_Student 0.4832 0.230 2.099 0.036 0.032 0.934
Occupation_Working Professional 2.6635 0.196 13.615 0.000 2.280 3.047
In [ ]:
calculate_vif(X_train_3)
Out[ ]:
Feature VIF
2 Page Views Per Visit 5.04
4 Lead Origin_Lead Add Form 4.75
7 Lead Source_Reference 4.50
3 Lead Origin_Landing Page Submission 3.86
0 TotalVisits 3.46
9 Last Activity_Email Opened 2.84
14 Specialization_Others 2.69
12 Last Activity_SMS Sent 2.51
6 Lead Source_Olark Chat 2.30
1 Total Time Spent on Website 2.18
10 Last Activity_Olark Chat Conversation 1.82
8 Do Not Email_Yes 1.21
18 Occupation_Working Professional 1.21
11 Last Activity_Other 1.11
17 Occupation_Student 1.03
5 Lead Origin_Lead Import 1.02
13 Specialization_Hospitality Management 1.02
15 Occupation_Housewife 1.00
16 Occupation_Other 1.00
In [ ]:
# Dropping 'Occupation_Housewife' because it has high p-value
X_train_3 = X_train_3.drop('Occupation_Housewife', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_3).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6328
Model Family: Binomial Df Model: 18
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2642.0
Date: Tue, 18 Oct 2022 Deviance: 5283.9
Time: 23:40:03 Pearson chi2: 6.94e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3922
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.9323 0.171 -11.298 0.000 -2.268 -1.597
TotalVisits 2.9884 0.536 5.580 0.000 1.939 4.038
Total Time Spent on Website 4.6333 0.168 27.511 0.000 4.303 4.963
Page Views Per Visit -2.1592 0.578 -3.734 0.000 -3.293 -1.026
Lead Origin_Landing Page Submission -1.1519 0.128 -8.970 0.000 -1.404 -0.900
Lead Origin_Lead Add Form 5.3638 0.612 8.758 0.000 4.163 6.564
Lead Origin_Lead Import 0.7324 0.488 1.500 0.134 -0.225 1.689
Lead Source_Olark Chat 1.0778 0.134 8.063 0.000 0.816 1.340
Lead Source_Reference -2.0372 0.644 -3.163 0.002 -3.300 -0.775
Do Not Email_Yes -1.3918 0.177 -7.874 0.000 -1.738 -1.045
Last Activity_Email Opened 0.7888 0.109 7.225 0.000 0.575 1.003
Last Activity_Olark Chat Conversation -0.8106 0.187 -4.333 0.000 -1.177 -0.444
Last Activity_Other 1.4206 0.232 6.118 0.000 0.966 1.876
Last Activity_SMS Sent 1.9672 0.113 17.472 0.000 1.747 2.188
Specialization_Hospitality Management -0.4955 0.307 -1.615 0.106 -1.097 0.106
Specialization_Others -1.2263 0.124 -9.854 0.000 -1.470 -0.982
Occupation_Other 0.3594 0.763 0.471 0.638 -1.137 1.856
Occupation_Student 0.4779 0.230 2.076 0.038 0.027 0.929
Occupation_Working Professional 2.6580 0.196 13.590 0.000 2.275 3.041
In [ ]:
calculate_vif(X_train_3)
Out[ ]:
Feature VIF
2 Page Views Per Visit 5.04
4 Lead Origin_Lead Add Form 4.75
7 Lead Source_Reference 4.50
3 Lead Origin_Landing Page Submission 3.86
0 TotalVisits 3.46
9 Last Activity_Email Opened 2.84
14 Specialization_Others 2.69
12 Last Activity_SMS Sent 2.51
6 Lead Source_Olark Chat 2.30
1 Total Time Spent on Website 2.18
10 Last Activity_Olark Chat Conversation 1.82
8 Do Not Email_Yes 1.21
17 Occupation_Working Professional 1.21
11 Last Activity_Other 1.11
16 Occupation_Student 1.03
5 Lead Origin_Lead Import 1.02
13 Specialization_Hospitality Management 1.02
15 Occupation_Other 1.00
In [ ]:
# Dropping 'Occupation_Other' because it has high p-value
X_train_3 = X_train_3.drop('Occupation_Other', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_3).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6329
Model Family: Binomial Df Model: 17
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2642.1
Date: Tue, 18 Oct 2022 Deviance: 5284.2
Time: 23:40:03 Pearson chi2: 6.94e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3922
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.9304 0.171 -11.294 0.000 -2.265 -1.595
TotalVisits 2.9950 0.535 5.598 0.000 1.946 4.044
Total Time Spent on Website 4.6352 0.168 27.527 0.000 4.305 4.965
Page Views Per Visit -2.1602 0.578 -3.736 0.000 -3.294 -1.027
Lead Origin_Landing Page Submission -1.1534 0.128 -8.987 0.000 -1.405 -0.902
Lead Origin_Lead Add Form 5.3643 0.612 8.759 0.000 4.164 6.565
Lead Origin_Lead Import 0.7318 0.488 1.499 0.134 -0.225 1.689
Lead Source_Olark Chat 1.0789 0.134 8.072 0.000 0.817 1.341
Lead Source_Reference -2.0380 0.644 -3.164 0.002 -3.300 -0.776
Do Not Email_Yes -1.3904 0.177 -7.867 0.000 -1.737 -1.044
Last Activity_Email Opened 0.7884 0.109 7.222 0.000 0.574 1.002
Last Activity_Olark Chat Conversation -0.8121 0.187 -4.341 0.000 -1.179 -0.445
Last Activity_Other 1.4188 0.232 6.111 0.000 0.964 1.874
Last Activity_SMS Sent 1.9660 0.113 17.466 0.000 1.745 2.187
Specialization_Hospitality Management -0.4967 0.307 -1.619 0.105 -1.098 0.105
Specialization_Others -1.2283 0.124 -9.877 0.000 -1.472 -0.985
Occupation_Student 0.4770 0.230 2.072 0.038 0.026 0.928
Occupation_Working Professional 2.6571 0.196 13.585 0.000 2.274 3.040
In [ ]:
calculate_vif(X_train_3)
Out[ ]:
Feature VIF
2 Page Views Per Visit 5.04
4 Lead Origin_Lead Add Form 4.75
7 Lead Source_Reference 4.50
3 Lead Origin_Landing Page Submission 3.86
0 TotalVisits 3.46
9 Last Activity_Email Opened 2.84
14 Specialization_Others 2.69
12 Last Activity_SMS Sent 2.51
6 Lead Source_Olark Chat 2.30
1 Total Time Spent on Website 2.18
10 Last Activity_Olark Chat Conversation 1.82
8 Do Not Email_Yes 1.21
16 Occupation_Working Professional 1.21
11 Last Activity_Other 1.11
15 Occupation_Student 1.03
13 Specialization_Hospitality Management 1.02
5 Lead Origin_Lead Import 1.02
In [ ]:
# Dropping 'Lead Origin_Lead Import' because it has high p-value
X_train_3 = X_train_3.drop('Lead Origin_Lead Import', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_3).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6330
Model Family: Binomial Df Model: 16
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2643.1
Date: Tue, 18 Oct 2022 Deviance: 5286.3
Time: 23:40:03 Pearson chi2: 6.93e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3920
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.8868 0.168 -11.214 0.000 -2.217 -1.557
TotalVisits 2.9653 0.535 5.544 0.000 1.917 4.014
Total Time Spent on Website 4.6127 0.167 27.541 0.000 4.284 4.941
Page Views Per Visit -2.2170 0.578 -3.839 0.000 -3.349 -1.085
Lead Origin_Landing Page Submission -1.1823 0.127 -9.305 0.000 -1.431 -0.933
Lead Origin_Lead Add Form 5.3362 0.612 8.717 0.000 4.136 6.536
Lead Source_Olark Chat 1.0491 0.132 7.947 0.000 0.790 1.308
Lead Source_Reference -2.0475 0.644 -3.178 0.001 -3.310 -0.785
Do Not Email_Yes -1.3894 0.177 -7.864 0.000 -1.736 -1.043
Last Activity_Email Opened 0.7952 0.109 7.295 0.000 0.582 1.009
Last Activity_Olark Chat Conversation -0.8110 0.187 -4.335 0.000 -1.178 -0.444
Last Activity_Other 1.4189 0.232 6.114 0.000 0.964 1.874
Last Activity_SMS Sent 1.9689 0.112 17.504 0.000 1.748 2.189
Specialization_Hospitality Management -0.4994 0.307 -1.628 0.104 -1.101 0.102
Specialization_Others -1.2483 0.124 -10.072 0.000 -1.491 -1.005
Occupation_Student 0.4733 0.230 2.056 0.040 0.022 0.925
Occupation_Working Professional 2.6572 0.196 13.589 0.000 2.274 3.040
In [ ]:
calculate_vif(X_train_3)
Out[ ]:
Feature VIF
2 Page Views Per Visit 5.03
4 Lead Origin_Lead Add Form 4.74
6 Lead Source_Reference 4.50
3 Lead Origin_Landing Page Submission 3.86
0 TotalVisits 3.46
8 Last Activity_Email Opened 2.79
13 Specialization_Others 2.69
11 Last Activity_SMS Sent 2.49
5 Lead Source_Olark Chat 2.28
1 Total Time Spent on Website 2.17
9 Last Activity_Olark Chat Conversation 1.82
7 Do Not Email_Yes 1.21
15 Occupation_Working Professional 1.21
10 Last Activity_Other 1.11
14 Occupation_Student 1.03
12 Specialization_Hospitality Management 1.02
In [ ]:
# Dropping 'Specialization_Hospitality Management' because it has high p-value
X_train_3 = X_train_3.drop('Specialization_Hospitality Management', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_3).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6331
Model Family: Binomial Df Model: 15
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2644.5
Date: Tue, 18 Oct 2022 Deviance: 5288.9
Time: 23:40:04 Pearson chi2: 6.95e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3917
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.8944 0.168 -11.264 0.000 -2.224 -1.565
TotalVisits 2.9551 0.534 5.530 0.000 1.908 4.002
Total Time Spent on Website 4.6045 0.167 27.521 0.000 4.277 4.932
Page Views Per Visit -2.2149 0.577 -3.837 0.000 -3.346 -1.084
Lead Origin_Landing Page Submission -1.1786 0.127 -9.279 0.000 -1.428 -0.930
Lead Origin_Lead Add Form 5.3336 0.612 8.713 0.000 4.134 6.533
Lead Source_Olark Chat 1.0464 0.132 7.931 0.000 0.788 1.305
Lead Source_Reference -2.0620 0.644 -3.202 0.001 -3.324 -0.800
Do Not Email_Yes -1.3910 0.177 -7.871 0.000 -1.737 -1.045
Last Activity_Email Opened 0.7951 0.109 7.297 0.000 0.582 1.009
Last Activity_Olark Chat Conversation -0.8113 0.187 -4.339 0.000 -1.178 -0.445
Last Activity_Other 1.4168 0.232 6.104 0.000 0.962 1.872
Last Activity_SMS Sent 1.9665 0.112 17.495 0.000 1.746 2.187
Specialization_Others -1.2362 0.124 -9.998 0.000 -1.479 -0.994
Occupation_Student 0.4699 0.230 2.045 0.041 0.019 0.920
Occupation_Working Professional 2.6465 0.195 13.553 0.000 2.264 3.029
In [ ]:
calculate_vif(X_train_3)
Out[ ]:
Feature VIF
2 Page Views Per Visit 5.03
4 Lead Origin_Lead Add Form 4.74
6 Lead Source_Reference 4.50
3 Lead Origin_Landing Page Submission 3.85
0 TotalVisits 3.46
8 Last Activity_Email Opened 2.79
12 Specialization_Others 2.69
11 Last Activity_SMS Sent 2.49
5 Lead Source_Olark Chat 2.28
1 Total Time Spent on Website 2.17
9 Last Activity_Olark Chat Conversation 1.82
7 Do Not Email_Yes 1.21
14 Occupation_Working Professional 1.21
10 Last Activity_Other 1.11
13 Occupation_Student 1.03
In [ ]:
# Dropping 'Page Views Per Visit' because it has high VIF
X_train_3 = X_train_3.drop('Page Views Per Visit', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_3).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6332
Model Family: Binomial Df Model: 14
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2651.9
Date: Tue, 18 Oct 2022 Deviance: 5303.8
Time: 23:40:04 Pearson chi2: 6.95e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3903
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -2.0135 0.166 -12.160 0.000 -2.338 -1.689
TotalVisits 1.9283 0.476 4.054 0.000 0.996 2.861
Total Time Spent on Website 4.5917 0.167 27.491 0.000 4.264 4.919
Lead Origin_Landing Page Submission -1.2114 0.127 -9.563 0.000 -1.460 -0.963
Lead Origin_Lead Add Form 5.4710 0.611 8.960 0.000 4.274 6.668
Lead Source_Olark Chat 1.1839 0.127 9.296 0.000 0.934 1.434
Lead Source_Reference -2.0500 0.643 -3.187 0.001 -3.311 -0.789
Do Not Email_Yes -1.3885 0.177 -7.866 0.000 -1.734 -1.043
Last Activity_Email Opened 0.7506 0.108 6.947 0.000 0.539 0.962
Last Activity_Olark Chat Conversation -0.8459 0.187 -4.533 0.000 -1.212 -0.480
Last Activity_Other 1.3583 0.231 5.881 0.000 0.906 1.811
Last Activity_SMS Sent 1.9066 0.111 17.197 0.000 1.689 2.124
Specialization_Others -1.2119 0.123 -9.826 0.000 -1.454 -0.970
Occupation_Student 0.4636 0.230 2.013 0.044 0.012 0.915
Occupation_Working Professional 2.6392 0.195 13.513 0.000 2.256 3.022
In [ ]:
calculate_vif(X_train_3)
Out[ ]:
Feature VIF
3 Lead Origin_Lead Add Form 4.70
5 Lead Source_Reference 4.49
2 Lead Origin_Landing Page Submission 3.49
7 Last Activity_Email Opened 2.67
11 Specialization_Others 2.65
10 Last Activity_SMS Sent 2.38
0 TotalVisits 2.36
4 Lead Source_Olark Chat 2.17
1 Total Time Spent on Website 2.16
8 Last Activity_Olark Chat Conversation 1.79
6 Do Not Email_Yes 1.21
13 Occupation_Working Professional 1.20
9 Last Activity_Other 1.10
12 Occupation_Student 1.03
In [ ]:
model_3 = train_model(y_train, X_train_3)
In [ ]:
roc(model_3, X_train_3)

The area under the ROC curve is 0.88, which is good.
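
For reference, a plot like this can be produced with the sklearn functions imported at the top of the notebook. Below is a minimal sketch of such a helper; the notebook's own `roc` is defined earlier, so this version is an assumption, not its actual code.

In [ ]:
# Hedged sketch of a ROC helper (illustrative only).
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
import statsmodels.api as sm

def plot_roc(model, X, y):
    y_prob = model.predict(sm.add_constant(X))   # predicted probabilities
    fpr, tpr, _ = roc_curve(y, y_prob)
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y, y_prob):.2f}')
    plt.plot([0, 1], [0, 1], linestyle='--')     # chance diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.legend()
    plt.show()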

In [ ]:
train_perf_3 = model_performance(model_3, X_train_3)
train_perf_3
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.0 0.0 0.382385 1.000000 0.000000
0.1 0.1 0.599811 0.972806 0.368878
0.2 0.2 0.727430 0.924186 0.605612
0.3 0.3 0.792028 0.837660 0.763776
0.4 0.4 0.815346 0.772971 0.841582
0.5 0.5 0.807468 0.665843 0.895153
0.6 0.6 0.795809 0.593325 0.921173
0.7 0.7 0.780369 0.514627 0.944898
0.8 0.8 0.754530 0.410383 0.967602
0.9 0.9 0.710414 0.256696 0.991327
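
The table above classifies the training set at each probability cutoff from 0.0 to 0.9 and records the resulting metrics. A minimal sketch of the idea behind a helper like `model_performance` (the actual helper is defined earlier in the notebook; this version and its name are assumptions):

In [ ]:
# Hedged sketch of a cutoff-sweep table (illustrative only).
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix

def cutoff_table(model, X, y):
    y_prob = model.predict(sm.add_constant(X))
    rows = []
    for cutoff in np.arange(0.0, 1.0, 0.1):
        y_pred = (y_prob > cutoff).astype(int)
        tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()
        rows.append({'probability_cutoff': round(cutoff, 1),
                     'accuracy': (tp + tn) / (tp + tn + fp + fn),
                     'sensitivity': tp / (tp + fn),
                     'specificity': tn / (tn + fp)})
    return pd.DataFrame(rows).set_index('probability_cutoff', drop=False)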
In [ ]:
train_perf_3.plot(x='probability_cutoff', y=['accuracy','sensitivity','specificity'], figsize=(10,8))
plt.show()

The optimal cutoff is 0.4, where the sensitivity and specificity curves intersect.
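
The same point can be located programmatically as the cutoff where the gap between the two curves is smallest (a usage sketch on the table above):

In [ ]:
# Crossing point of sensitivity and specificity: the cutoff with the
# smallest absolute gap between the two curves. Yields 0.4 for Model 3.
(train_perf_3['sensitivity'] - train_perf_3['specificity']).abs().idxmin()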

In [ ]:
train_cutoff_3 = 0.4
train_perf_3.loc[[train_cutoff_3]]
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.4 0.4 0.815346 0.772971 0.841582

Model 4¶

In [ ]:
# Continuing from Model 3 with the aim of further reducing VIF
X_train_4 = X_train_3.copy()
In [ ]:
# Training Logistic Regression Model
train_model(y_train, X_train_4).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6332
Model Family: Binomial Df Model: 14
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2651.9
Date: Tue, 18 Oct 2022 Deviance: 5303.8
Time: 23:40:05 Pearson chi2: 6.95e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3903
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -2.0135 0.166 -12.160 0.000 -2.338 -1.689
TotalVisits 1.9283 0.476 4.054 0.000 0.996 2.861
Total Time Spent on Website 4.5917 0.167 27.491 0.000 4.264 4.919
Lead Origin_Landing Page Submission -1.2114 0.127 -9.563 0.000 -1.460 -0.963
Lead Origin_Lead Add Form 5.4710 0.611 8.960 0.000 4.274 6.668
Lead Source_Olark Chat 1.1839 0.127 9.296 0.000 0.934 1.434
Lead Source_Reference -2.0500 0.643 -3.187 0.001 -3.311 -0.789
Do Not Email_Yes -1.3885 0.177 -7.866 0.000 -1.734 -1.043
Last Activity_Email Opened 0.7506 0.108 6.947 0.000 0.539 0.962
Last Activity_Olark Chat Conversation -0.8459 0.187 -4.533 0.000 -1.212 -0.480
Last Activity_Other 1.3583 0.231 5.881 0.000 0.906 1.811
Last Activity_SMS Sent 1.9066 0.111 17.197 0.000 1.689 2.124
Specialization_Others -1.2119 0.123 -9.826 0.000 -1.454 -0.970
Occupation_Student 0.4636 0.230 2.013 0.044 0.012 0.915
Occupation_Working Professional 2.6392 0.195 13.513 0.000 2.256 3.022
In [ ]:
calculate_vif(X_train_4)
Out[ ]:
Feature VIF
3 Lead Origin_Lead Add Form 4.70
5 Lead Source_Reference 4.49
2 Lead Origin_Landing Page Submission 3.49
7 Last Activity_Email Opened 2.67
11 Specialization_Others 2.65
10 Last Activity_SMS Sent 2.38
0 TotalVisits 2.36
4 Lead Source_Olark Chat 2.17
1 Total Time Spent on Website 2.16
8 Last Activity_Olark Chat Conversation 1.79
6 Do Not Email_Yes 1.21
13 Occupation_Working Professional 1.20
9 Last Activity_Other 1.10
12 Occupation_Student 1.03
In [ ]:
# Dropping 'Lead Origin_Lead Add Form' because it has high VIF
X_train_4 = X_train_4.drop('Lead Origin_Lead Add Form', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_4).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6333
Model Family: Binomial Df Model: 13
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2789.6
Date: Tue, 18 Oct 2022 Deviance: 5579.1
Time: 23:40:05 Pearson chi2: 6.68e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3633
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.5483 0.157 -9.857 0.000 -1.856 -1.240
TotalVisits 0.7678 0.476 1.614 0.107 -0.165 1.700
Total Time Spent on Website 4.1829 0.160 26.077 0.000 3.869 4.497
Lead Origin_Landing Page Submission -1.4675 0.123 -11.897 0.000 -1.709 -1.226
Lead Source_Olark Chat 0.6251 0.117 5.327 0.000 0.395 0.855
Lead Source_Reference 2.8610 0.240 11.914 0.000 2.390 3.332
Do Not Email_Yes -1.2137 0.167 -7.280 0.000 -1.540 -0.887
Last Activity_Email Opened 0.7835 0.105 7.485 0.000 0.578 0.989
Last Activity_Olark Chat Conversation -0.8638 0.182 -4.737 0.000 -1.221 -0.506
Last Activity_Other 1.2855 0.226 5.697 0.000 0.843 1.728
Last Activity_SMS Sent 1.9797 0.107 18.424 0.000 1.769 2.190
Specialization_Others -1.1125 0.120 -9.289 0.000 -1.347 -0.878
Occupation_Student 0.4359 0.225 1.934 0.053 -0.006 0.878
Occupation_Working Professional 2.5712 0.193 13.316 0.000 2.193 2.950
In [ ]:
calculate_vif(X_train_4)
Out[ ]:
Feature VIF
2 Lead Origin_Landing Page Submission 3.49
6 Last Activity_Email Opened 2.64
10 Specialization_Others 2.59
0 TotalVisits 2.35
9 Last Activity_SMS Sent 2.31
1 Total Time Spent on Website 2.15
3 Lead Source_Olark Chat 2.11
7 Last Activity_Olark Chat Conversation 1.79
4 Lead Source_Reference 1.27
5 Do Not Email_Yes 1.21
12 Occupation_Working Professional 1.20
8 Last Activity_Other 1.10
11 Occupation_Student 1.03
In [ ]:
# Dropping 'TotalVisits' because it has high p-value
X_train_4 = X_train_4.drop('TotalVisits', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_4).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6334
Model Family: Binomial Df Model: 12
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2790.9
Date: Tue, 18 Oct 2022 Deviance: 5581.7
Time: 23:40:06 Pearson chi2: 6.70e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3630
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.4686 0.149 -9.867 0.000 -1.760 -1.177
Total Time Spent on Website 4.2058 0.160 26.309 0.000 3.892 4.519
Lead Origin_Landing Page Submission -1.4646 0.123 -11.882 0.000 -1.706 -1.223
Lead Source_Olark Chat 0.5737 0.113 5.091 0.000 0.353 0.795
Lead Source_Reference 2.7991 0.237 11.814 0.000 2.335 3.264
Do Not Email_Yes -1.2196 0.167 -7.322 0.000 -1.546 -0.893
Last Activity_Email Opened 0.7796 0.105 7.451 0.000 0.575 0.985
Last Activity_Olark Chat Conversation -0.8629 0.182 -4.735 0.000 -1.220 -0.506
Last Activity_Other 1.2957 0.226 5.744 0.000 0.854 1.738
Last Activity_SMS Sent 1.9710 0.107 18.379 0.000 1.761 2.181
Specialization_Others -1.1334 0.119 -9.519 0.000 -1.367 -0.900
Occupation_Student 0.4303 0.225 1.909 0.056 -0.011 0.872
Occupation_Working Professional 2.5641 0.193 13.297 0.000 2.186 2.942
In [ ]:
calculate_vif(X_train_4)
Out[ ]:
Feature VIF
1 Lead Origin_Landing Page Submission 3.07
5 Last Activity_Email Opened 2.58
9 Specialization_Others 2.53
8 Last Activity_SMS Sent 2.29
0 Total Time Spent on Website 2.05
2 Lead Source_Olark Chat 2.04
6 Last Activity_Olark Chat Conversation 1.77
3 Lead Source_Reference 1.26
4 Do Not Email_Yes 1.20
11 Occupation_Working Professional 1.20
7 Last Activity_Other 1.10
10 Occupation_Student 1.03
In [ ]:
# Dropping 'Occupation_Student' because it has high p-value
X_train_4 = X_train_4.drop('Occupation_Student', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_4).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6335
Model Family: Binomial Df Model: 11
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2792.6
Date: Tue, 18 Oct 2022 Deviance: 5585.3
Time: 23:40:06 Pearson chi2: 6.70e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3627
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.4593 0.149 -9.809 0.000 -1.751 -1.168
Total Time Spent on Website 4.2035 0.160 26.304 0.000 3.890 4.517
Lead Origin_Landing Page Submission -1.4620 0.123 -11.852 0.000 -1.704 -1.220
Lead Source_Olark Chat 0.5784 0.113 5.134 0.000 0.358 0.799
Lead Source_Reference 2.8147 0.237 11.893 0.000 2.351 3.279
Do Not Email_Yes -1.2192 0.167 -7.315 0.000 -1.546 -0.893
Last Activity_Email Opened 0.7817 0.105 7.473 0.000 0.577 0.987
Last Activity_Olark Chat Conversation -0.8590 0.182 -4.713 0.000 -1.216 -0.502
Last Activity_Other 1.2859 0.226 5.702 0.000 0.844 1.728
Last Activity_SMS Sent 1.9648 0.107 18.336 0.000 1.755 2.175
Specialization_Others -1.1347 0.119 -9.514 0.000 -1.368 -0.901
Occupation_Working Professional 2.5534 0.193 13.250 0.000 2.176 2.931
In [ ]:
calculate_vif(X_train_4)
Out[ ]:
Feature VIF
1 Lead Origin_Landing Page Submission 3.06
5 Last Activity_Email Opened 2.58
9 Specialization_Others 2.53
8 Last Activity_SMS Sent 2.29
0 Total Time Spent on Website 2.05
2 Lead Source_Olark Chat 2.04
6 Last Activity_Olark Chat Conversation 1.77
3 Lead Source_Reference 1.25
4 Do Not Email_Yes 1.20
10 Occupation_Working Professional 1.20
7 Last Activity_Other 1.10
In [ ]:
model_4 = train_model(y_train, X_train_4)
In [ ]:
roc(model_4, X_train_4)

The area under the ROC curve is 0.87, which is good.

In [ ]:
train_perf_4 = model_performance(model_4, X_train_4)
train_perf_4
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.0 0.0 0.382385 1.000000 0.000000
0.1 0.1 0.567827 0.976926 0.314541
0.2 0.2 0.703482 0.912237 0.574235
0.3 0.3 0.776272 0.841780 0.735714
0.4 0.4 0.795336 0.751133 0.822704
0.5 0.5 0.792816 0.629584 0.893878
0.6 0.6 0.782732 0.557066 0.922449
0.7 0.7 0.766977 0.478368 0.945663
0.8 0.8 0.741610 0.370004 0.971684
0.9 0.9 0.691665 0.207252 0.991582
In [ ]:
train_perf_4.plot(x='probability_cutoff', y=['accuracy','sensitivity','specificity'], figsize=(10,8))
plt.show()

The optimal cutoff is 0.4, where the sensitivity and specificity curves intersect.

In [ ]:
train_cutoff_4 = 0.4
train_perf_4.loc[[train_cutoff_4]]
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.4 0.4 0.795336 0.751133 0.822704

Assessment:
Accuracy dropped further, so reducing VIF did not add any value.

Model 5¶

In [ ]:
# Continuing without the Tags feature
X_train_5 = X_train[cols_without_tags]
In [ ]:
# Auto-selecting the top 10 features
X_train_5 = X_train_5[auto_select_features(10, X_train_5)]
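
`auto_select_features` is defined earlier in the notebook. Since RFE is imported at the top, a helper like it can be sketched as follows; this is an assumption, not the notebook's exact code (the notebook's helper is called without `y`, so it presumably captures `y_train` from its enclosing scope).

In [ ]:
# Hedged sketch of RFE-based feature selection (illustrative only).
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def select_top_features(n, X, y):
    rfe = RFE(estimator=LogisticRegression(max_iter=1000),
              n_features_to_select=n)
    rfe.fit(X, y)
    return X.columns[rfe.support_]   # names of the n retained features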
In [ ]:
# Training Logistic Regression Model
train_model(y_train, X_train_5).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6336
Model Family: Binomial Df Model: 10
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2771.8
Date: Tue, 18 Oct 2022 Deviance: 5543.7
Time: 23:40:09 Pearson chi2: 6.75e+03
No. Iterations: 21 Pseudo R-squ. (CS): 0.3668
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.6670 0.073 -22.908 0.000 -1.810 -1.524
TotalVisits 2.4061 0.502 4.797 0.000 1.423 3.389
Total Time Spent on Website 4.2586 0.153 27.773 0.000 3.958 4.559
Page Views Per Visit -2.6841 0.537 -5.003 0.000 -3.736 -1.632
Lead Origin_Landing Page Submission -0.5771 0.079 -7.268 0.000 -0.733 -0.421
Lead Origin_Lead Add Form 3.5124 0.213 16.462 0.000 3.094 3.931
Do Not Email_Yes -1.5075 0.163 -9.259 0.000 -1.827 -1.188
Last Activity_Olark Chat Conversation -1.2917 0.160 -8.084 0.000 -1.605 -0.979
Last Activity_SMS Sent 1.3288 0.073 18.238 0.000 1.186 1.472
Occupation_Housewife 22.5833 1.63e+04 0.001 0.999 -3.2e+04 3.2e+04
Occupation_Working Professional 2.8006 0.188 14.927 0.000 2.433 3.168
In [ ]:
calculate_vif(X_train_5)
Out[ ]:
Feature VIF
2 Page Views Per Visit 4.58
0 TotalVisits 3.44
3 Lead Origin_Landing Page Submission 2.83
1 Total Time Spent on Website 2.05
7 Last Activity_SMS Sent 1.46
9 Occupation_Working Professional 1.15
4 Lead Origin_Lead Add Form 1.14
5 Do Not Email_Yes 1.08
6 Last Activity_Olark Chat Conversation 1.02
8 Occupation_Housewife 1.00
In [ ]:
# Dropping 'Occupation_Housewife': its high p-value and huge coefficient/standard error again suggest quasi-complete separation
X_train_5 = X_train_5.drop('Occupation_Housewife', axis=1)
In [ ]:
# Re-train model
train_model(y_train, X_train_5).summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6337
Model Family: Binomial Df Model: 9
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2776.7
Date: Tue, 18 Oct 2022 Deviance: 5553.3
Time: 23:40:09 Pearson chi2: 6.76e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3659
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.6639 0.073 -22.883 0.000 -1.806 -1.521
TotalVisits 2.3841 0.502 4.754 0.000 1.401 3.367
Total Time Spent on Website 4.2657 0.153 27.832 0.000 3.965 4.566
Page Views Per Visit -2.6836 0.536 -5.003 0.000 -3.735 -1.632
Lead Origin_Landing Page Submission -0.5726 0.079 -7.218 0.000 -0.728 -0.417
Lead Origin_Lead Add Form 3.5228 0.213 16.521 0.000 3.105 3.941
Do Not Email_Yes -1.5116 0.163 -9.284 0.000 -1.831 -1.192
Last Activity_Olark Chat Conversation -1.2954 0.160 -8.107 0.000 -1.609 -0.982
Last Activity_SMS Sent 1.3230 0.073 18.170 0.000 1.180 1.466
Occupation_Working Professional 2.7967 0.188 14.906 0.000 2.429 3.164
In [ ]:
calculate_vif(X_train_5)
Out[ ]:
Feature VIF
2 Page Views Per Visit 4.58
0 TotalVisits 3.44
3 Lead Origin_Landing Page Submission 2.83
1 Total Time Spent on Website 2.05
7 Last Activity_SMS Sent 1.46
8 Occupation_Working Professional 1.15
4 Lead Origin_Lead Add Form 1.14
5 Do Not Email_Yes 1.08
6 Last Activity_Olark Chat Conversation 1.02
In [ ]:
model_5 = train_model(y_train, X_train_5)
In [ ]:
roc(model_5, X_train_5)

The area under the ROC curve is 0.87, which is good.

In [ ]:
train_perf_5 = model_performance(model_5, X_train_5)
train_perf_5
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.0 0.0 0.382385 1.000000 0.000000
0.1 0.1 0.552229 0.980222 0.287245
0.2 0.2 0.731527 0.884631 0.636735
0.3 0.3 0.782732 0.837248 0.748980
0.4 0.4 0.806208 0.772559 0.827041
0.5 0.5 0.797069 0.646889 0.890051
0.6 0.6 0.786513 0.568603 0.921429
0.7 0.7 0.772018 0.488669 0.947449
0.8 0.8 0.749330 0.390194 0.971684
0.9 0.9 0.699070 0.228677 0.990306
In [ ]:
train_perf_5.plot(x='probability_cutoff', y=['accuracy','sensitivity','specificity'], figsize=(10,8))
plt.show()

The optimal cutoff is 0.4, where the sensitivity and specificity curves intersect.

In [ ]:
train_cutoff_5 = 0.4
train_perf_5.loc[[train_cutoff_5]]
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.4 0.4 0.806208 0.772559 0.827041

Assessment:
With such a small number of predictors, Model 5 looks promising.

Model Evaluation¶

In [ ]:
# Feature scaling on the test set (transform only, reusing the scaler fitted on the training set to avoid data leakage)
df_test[num_vars] = scaler.transform(df_test[num_vars])
In [ ]:
# Extract target variable
y_test = df_test.pop('Converted')
X_test = df_test

Model 1¶

In [ ]:
test_perf_1 = model_performance(model_1, X_test[X_train_1.columns], y_test)
test_perf_1
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.0 0.0 0.370085 1.000000 0.000000
0.1 0.1 0.773613 0.976167 0.654609
0.2 0.2 0.826167 0.930487 0.764877
0.3 0.3 0.865491 0.896723 0.847141
0.4 0.4 0.880926 0.872890 0.885648
0.5 0.5 0.882396 0.846077 0.903734
0.6 0.6 0.868798 0.767627 0.928238
0.7 0.7 0.852995 0.676266 0.956826
0.8 0.8 0.836090 0.603774 0.972579
0.9 0.9 0.784638 0.447865 0.982497
In [ ]:
test_perf_1.plot(x='probability_cutoff', y=['accuracy','sensitivity','specificity'], figsize=(10,8))
plt.show()
In [ ]:
test_cutoff_1 = 0.4
test_perf_1.loc[[test_cutoff_1]]
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.4 0.4 0.880926 0.87289 0.885648

Model 2¶

In [ ]:
test_perf_2 = model_performance(model_2, X_test[X_train_2.columns], y_test)
test_perf_2
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.0 0.0 0.370085 1.000000 0.000000
0.1 0.1 0.588754 0.973188 0.362894
0.2 0.2 0.728041 0.921549 0.614352
0.3 0.3 0.785373 0.839126 0.753792
0.4 0.4 0.810731 0.789474 0.823221
0.5 0.5 0.808894 0.681231 0.883897
0.6 0.6 0.804484 0.617676 0.914236
0.7 0.7 0.789783 0.534260 0.939907
0.8 0.8 0.759280 0.411122 0.963827
0.9 0.9 0.713708 0.248262 0.987165
In [ ]:
test_perf_2.plot(x='probability_cutoff', y=['accuracy','sensitivity','specificity'], figsize=(10,8))
plt.show()
In [ ]:
test_cutoff_2 = 0.4
test_perf_2.loc[[test_cutoff_2]]
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.4 0.4 0.810731 0.789474 0.823221

Model 3¶

In [ ]:
test_perf_3 = model_performance(model_3, X_test[X_train_3.columns], y_test)
test_perf_3
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.0 0.0 0.370085 1.000000 0.000000
0.1 0.1 0.590592 0.973188 0.365811
0.2 0.2 0.726939 0.922542 0.612019
0.3 0.3 0.782800 0.839126 0.749708
0.4 0.4 0.809996 0.789474 0.822054
0.5 0.5 0.809261 0.682224 0.883897
0.6 0.6 0.803749 0.615690 0.914236
0.7 0.7 0.789416 0.533267 0.939907
0.8 0.8 0.761117 0.415094 0.964411
0.9 0.9 0.714076 0.249255 0.987165
In [ ]:
test_perf_3.plot(x='probability_cutoff', y=['accuracy','sensitivity','specificity'], figsize=(10,8))
plt.show()
In [ ]:
test_cutoff_3 = 0.4
test_perf_3.loc[[test_cutoff_3]]
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.4 0.4 0.809996 0.789474 0.822054

Model 4¶

In [ ]:
test_perf_4 = model_performance(model_4, X_test[X_train_4.columns], y_test)
test_perf_4
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.0 0.0 0.370085 1.000000 0.000000
0.1 0.1 0.554576 0.973188 0.308635
0.2 0.2 0.712606 0.919563 0.591015
0.3 0.3 0.773613 0.846077 0.731039
0.4 0.4 0.789048 0.758689 0.806884
0.5 0.5 0.790518 0.637537 0.880397
0.6 0.6 0.787211 0.577954 0.910152
0.7 0.7 0.778758 0.503476 0.940490
0.8 0.8 0.750092 0.376365 0.969662
0.9 0.9 0.696435 0.200596 0.987748
In [ ]:
test_perf_4.plot(x='probability_cutoff', y=['accuracy','sensitivity','specificity'], figsize=(10,8))
plt.show()
In [ ]:
test_cutoff_4 = 0.4
test_perf_4.loc[[test_cutoff_4]]
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.4 0.4 0.789048 0.758689 0.806884

Model 5¶

In [ ]:
test_perf_5 = model_performance(model_5, X_test[X_train_5.columns], y_test)
test_perf_5
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.0 0.0 0.370085 1.000000 0.000000
0.1 0.1 0.533627 0.972195 0.275963
0.2 0.2 0.729144 0.884806 0.637690
0.3 0.3 0.783535 0.848064 0.745624
0.4 0.4 0.805954 0.790467 0.815053
0.5 0.5 0.797133 0.654419 0.880980
0.6 0.6 0.791253 0.584906 0.912485
0.7 0.7 0.783903 0.509434 0.945158
0.8 0.8 0.763322 0.413108 0.969078
0.9 0.9 0.711871 0.239325 0.989498
In [ ]:
test_perf_5.plot(x='probability_cutoff', y=['accuracy','sensitivity','specificity'], figsize=(10,8))
plt.show()
In [ ]:
test_cutoff_5 = 0.4
test_perf_5.loc[[test_cutoff_5]]
Out[ ]:
probability_cutoff accuracy sensitivity specificity
0.4 0.4 0.805954 0.790467 0.815053
In [ ]:
test_perf_5.loc[test_cutoff_5, 'accuracy']
Out[ ]:
0.8059536934950385

Final Model¶

In [ ]:
model_names = ['Model 1', 'Model 2', 'Model 3', 'Model 4', 'Model 5']
models = [model_1, model_2, model_3, model_4, model_5]
train_perfs = [train_perf_1, train_perf_2, train_perf_3, train_perf_4, train_perf_5]
test_perfs = [test_perf_1, test_perf_2, test_perf_3, test_perf_4, test_perf_5]
train_cutoffs = [train_cutoff_1, train_cutoff_2, train_cutoff_3, train_cutoff_4, train_cutoff_5]
test_cutoffs = [test_cutoff_1, test_cutoff_2, test_cutoff_3, test_cutoff_4, test_cutoff_5]
In [ ]:
num_predictors = []
train_accuracies = []
test_accuracies = []

for model, train_perf, test_perf, train_cutoff, test_cutoff in zip(models, train_perfs, test_perfs, train_cutoffs, test_cutoffs):
  p = len(model.params) - 1  # subtract 1 so the intercept isn't counted as a predictor
  train_accuracy = train_perf.loc[train_cutoff, 'accuracy']
  test_accuracy = test_perf.loc[test_cutoff, 'accuracy']
  num_predictors.append(p)
  train_accuracies.append(train_accuracy)
  test_accuracies.append(test_accuracy)

result_df = pd.DataFrame({'Model': model_names, 'No. of Predictors': num_predictors, 'Accuracy (Train)': train_accuracies, 'Accuracy (Test)': test_accuracies})
result_df
Out[ ]:
Model No. of Predictors Accuracy (Train) Accuracy (Test)
0 Model 1 12 0.881991 0.880926
1 Model 2 13 0.814873 0.810731
2 Model 3 14 0.815346 0.809996
3 Model 4 11 0.795336 0.789048
4 Model 5 9 0.806208 0.805954

Assessment:

  • None of the models show overfitting behaviour: train and test accuracies are close.
  • Although Model 1 is statistically the best-performing model, it can't be chosen because it relies on tags created by the sales team, which aren't available for fresh leads.
  • Model 5 performs decently with the lowest number of predictors.
In [ ]:
# Final Model
final_model = model_5
In [ ]:
final_model.summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: Converted No. Observations: 6347
Model: GLM Df Residuals: 6337
Model Family: Binomial Df Model: 9
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -2776.7
Date: Tue, 18 Oct 2022 Deviance: 5553.3
Time: 23:40:13 Pearson chi2: 6.76e+03
No. Iterations: 6 Pseudo R-squ. (CS): 0.3659
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
const -1.6639 0.073 -22.883 0.000 -1.806 -1.521
TotalVisits 2.3841 0.502 4.754 0.000 1.401 3.367
Total Time Spent on Website 4.2657 0.153 27.832 0.000 3.965 4.566
Page Views Per Visit -2.6836 0.536 -5.003 0.000 -3.735 -1.632
Lead Origin_Landing Page Submission -0.5726 0.079 -7.218 0.000 -0.728 -0.417
Lead Origin_Lead Add Form 3.5228 0.213 16.521 0.000 3.105 3.941
Do Not Email_Yes -1.5116 0.163 -9.284 0.000 -1.831 -1.192
Last Activity_Olark Chat Conversation -1.2954 0.160 -8.107 0.000 -1.609 -0.982
Last Activity_SMS Sent 1.3230 0.073 18.170 0.000 1.180 1.466
Occupation_Working Professional 2.7967 0.188 14.906 0.000 2.429 3.164

Final Model Metrics¶

In [ ]:
# Confusion Matrix
cm = confusion_matrix(y_test, final_model.predict(sm.add_constant(X_test[X_train_5.columns])).map(lambda x: 1 if x > test_cutoff_5 else 0))
cm
Out[ ]:
array([[1397,  317],
       [ 211,  796]])
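
sklearn's confusion_matrix places true labels on rows and predicted labels on columns, so cm[0,0] holds true negatives, cm[0,1] false positives, cm[1,0] false negatives and cm[1,1] true positives. The cells can also be unpacked by name:

In [ ]:
# ravel() flattens the 2x2 matrix in the order TN, FP, FN, TP
tn, fp, fn, tp = cm.ravel()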
In [ ]:
total = cm.sum()  # total number of test observations
In [ ]:
# Accuracy Score
accuracy = (cm[0,0]+cm[1,1])/total
accuracy
Out[ ]:
0.8059536934950385
In [ ]:
# Sensitivity or Recall
sensitivity = cm[1,1]/(cm[1,0]+cm[1,1])
sensitivity
Out[ ]:
0.7904667328699106
In [ ]:
# Specificity
specificity = cm[0,0]/(cm[0,0]+cm[0,1])
specificity
Out[ ]:
0.8150525087514586
In [ ]:
# Precision
precision = cm[1,1]/(cm[0,1]+cm[1,1])
precision
Out[ ]:
0.7151841868823001
In [ ]:
# False Positive Rate (FPR)
fpr = cm[0,1]/(cm[0,0]+cm[0,1])
fpr
Out[ ]:
0.18494749124854143
In [ ]:
# FPR equals 1-specificity
1 - specificity
Out[ ]:
0.18494749124854137

Conclusion¶

  • Total Time Spent on Website, Lead Origin_Lead Add Form and Occupation_Working Professional have high positive coefficients: the higher their values, the higher the chance of lead conversion.
  • Page Views Per Visit, Do Not Email_Yes and Last Activity_Olark Chat Conversation have high negative coefficients: the higher their values, the lower the chance of lead conversion.
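
Because this is a logit model, exponentiating a coefficient gives an odds ratio, which makes these statements concrete: for example, exp(2.7967) ≈ 16 means a working professional has roughly 16 times the conversion odds of the baseline, all else equal. A short sketch:

In [ ]:
# Odds ratios: exp(coef) is the multiplicative change in conversion odds
# for a one-unit increase in the (min-max scaled) predictor.
import numpy as np
np.exp(final_model.params).sort_values(ascending=False)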

Lead Score¶

In [ ]:
lead_df = y_test.to_frame()
lead_df['Lead Score'] = round(final_model.predict(sm.add_constant(X_test[X_train_5.columns])) * 100)
lead_df.head()
Out[ ]:
Converted Lead Score
8689 0 16.0
2853 1 63.0
725 1 62.0
6574 0 15.0
4135 1 68.0

A lead with a score higher than 40 is a hot lead. Targeting these leads would result in a conversion rate of more than 80%, as the CEO demanded.
Each customer can be identified by the row index.
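
For example, the hot leads can be pulled straight out of the scored frame (a usage sketch; the threshold of 40 follows the rule above):

In [ ]:
# Usage sketch: hot leads (score > 40), most promising first
hot_leads = lead_df[lead_df['Lead Score'] > 40].sort_values('Lead Score', ascending=False)
hot_leads.head()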