Lead-Scoring-Case-Study
13 Oct 2024
Lead-Scoring-Case-Study using Logistic Regression in Python
Designed and Developed by
- Project Lead: Vignesh Kumar (oversee dev & project delivery, develop solution, provide requirements)
- Contributors:
- Vinod Yadav (PPT)
- Ujjwal Verma (dev, code comments)
Problem Statement
A company named X Education sells online courses to industry professionals. Although X Education gets a lot of leads, its lead conversion rate is very poor. For example, if they acquire 100 leads in a day, only about 30 of them are converted. To make this process more efficient, the company wishes to identify the most promising leads, also known as ‘Hot Leads’. If they successfully identify this set of leads, the lead conversion rate should go up as the sales team will now be focusing more on communicating with the potential leads rather than making calls to everyone.
Expectations
- Build a logistic regression model to predict whether a lead gets converted or not.
- Address additional problems presented by the company, provided in a separate doc file.
About the Dataset
- The dataset consists of around 9,000 data points with various attributes.
- The target variable is the column ‘Converted’, where 1 means the lead was converted and 0 means it wasn’t.
- Details of the features are provided in the data dictionary.
- Note: Some categorical variables have a level called ‘Select’, which is as good as a null value (see the short sketch below).
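For illustration only (a minimal sketch, not the approach taken in the notebook, which replaces ‘Select’ during preprocessing), the level could be flagged as missing already at load time, assuming the file name Leads.csv used later:
import pandas as pd
# Treat the placeholder level 'Select' as NaN while reading the file (illustrative sketch).
leads = pd.read_csv('Leads.csv', na_values=['Select'])
print(leads.isna().mean().sort_values(ascending=False).head())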
Lead Scoring Case Study¶
- Supervised Learning Algorithm - Logistic Regression [ Classification ]
- Programming Language - Python
- Project Lead : Vignesh Kumar (oversee project delivery, develop solution, provide requirements)
- Contributors : Ujjwal Verma (dev, code comments), Vinod Yadav (PPT)
Initialization - Preprocessing - Visualization¶
1. Package Imports and Data Initialization¶
%pip install fast_ml ## Required for constant feature identification package
# These fast_ml helpers are used to identify constant and quasi-constant features.
from fast_ml.utilities import display_all
from fast_ml.feature_selection import get_constant_features
Requirement already satisfied: fast_ml in /home/vk/anaconda3/envs/dsfull/lib/python3.10/site-packages (3.68) Note: you may need to restart the kernel to use updated packages.
# Import the core libraries: numpy, pandas, matplotlib, seaborn.
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns; sns.set_theme(color_codes=True)
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
%matplotlib inline
# Set custom display properties in pandas
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 900)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Import the necessary machine learning packages (sklearn, statsmodels) for performing logistic regression
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, recall_score,precision_score, roc_auc_score, confusion_matrix, f1_score, roc_curve, precision_recall_curve
# Import the dataset from Leads.csv
lead_score_df = pd.read_csv('Leads.csv')
lead_score_df.head(1)
Prospect ID | Lead Number | Lead Origin | Lead Source | Do Not Email | Do Not Call | Converted | TotalVisits | Total Time Spent on Website | Page Views Per Visit | Last Activity | Country | Specialization | How did you hear about X Education | What is your current occupation | What matters most to you in choosing a course | Search | Magazine | Newspaper Article | X Education Forums | Newspaper | Digital Advertisement | Through Recommendations | Receive More Updates About Our Courses | Tags | Lead Quality | Update me on Supply Chain Content | Get updates on DM Content | Lead Profile | City | Asymmetrique Activity Index | Asymmetrique Profile Index | Asymmetrique Activity Score | Asymmetrique Profile Score | I agree to pay the amount through cheque | A free copy of Mastering The Interview | Last Notable Activity | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7927b2df-8bba-4d29-b9a2-b6e0beafe620 | 660737 | API | Olark Chat | No | No | 0 | 0.000 | 0 | 0.000 | Page Visited on Website | NaN | Select | Select | Unemployed | Better Career Prospects | No | No | No | No | No | No | No | No | Interested in other courses | Low in Relevance | No | No | Select | Select | 02.Medium | 02.Medium | 15.000 | 15.000 | No | No | Modified |
# looking at the information of the dataset
lead_score_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9240 entries, 0 to 9239 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Prospect ID 9240 non-null object 1 Lead Number 9240 non-null int64 2 Lead Origin 9240 non-null object 3 Lead Source 9204 non-null object 4 Do Not Email 9240 non-null object 5 Do Not Call 9240 non-null object 6 Converted 9240 non-null int64 7 TotalVisits 9103 non-null float64 8 Total Time Spent on Website 9240 non-null int64 9 Page Views Per Visit 9103 non-null float64 10 Last Activity 9137 non-null object 11 Country 6779 non-null object 12 Specialization 7802 non-null object 13 How did you hear about X Education 7033 non-null object 14 What is your current occupation 6550 non-null object 15 What matters most to you in choosing a course 6531 non-null object 16 Search 9240 non-null object 17 Magazine 9240 non-null object 18 Newspaper Article 9240 non-null object 19 X Education Forums 9240 non-null object 20 Newspaper 9240 non-null object 21 Digital Advertisement 9240 non-null object 22 Through Recommendations 9240 non-null object 23 Receive More Updates About Our Courses 9240 non-null object 24 Tags 5887 non-null object 25 Lead Quality 4473 non-null object 26 Update me on Supply Chain Content 9240 non-null object 27 Get updates on DM Content 9240 non-null object 28 Lead Profile 6531 non-null object 29 City 7820 non-null object 30 Asymmetrique Activity Index 5022 non-null object 31 Asymmetrique Profile Index 5022 non-null object 32 Asymmetrique Activity Score 5022 non-null float64 33 Asymmetrique Profile Score 5022 non-null float64 34 I agree to pay the amount through cheque 9240 non-null object 35 A free copy of Mastering The Interview 9240 non-null object 36 Last Notable Activity 9240 non-null object dtypes: float64(4), int64(3), object(30) memory usage: 2.6+ MB
----------------------------------------------------------------------¶
2. Data Preprocessing - Part 1¶
Custom Functions for Preprocessing and EDA¶
# Classify columns as categorical/discrete ('int_cat') or continuous ('float_ts') and return the result as a dict
def classify_feature_dtype(df, cols):
d_categories = {'int_cat': [], "float_ts":[] }
for col in cols:
if (len(df[col].unique()) < 20) and (df[col].dtype != np.float64):
d_categories['int_cat'].append(col)
else:
if not isinstance(df[col][df[col].notna()].unique()[0], str):
d_categories['float_ts'].append(col)
else:
d_categories['int_cat'].append(col)
return d_categories
# Print all statistical information for a given set of columns
def show_stats(df, cols):
for col in list(cols):
print("Total Nulls: {0},\nMode: {1}".format(df[col].isna().sum(), df[col].mode()[0]))
if len(df[col].unique()) < 50:
print("\nUnique: {0}\n".format(df[col].unique()))
if (df[col].dtype == int) or (df[col].dtype == float):
print("Median : {0}, \nVariance: {1}, \n\nDescribe: {2} \n".format(df[col].median(), df[col].var(), df[col].describe()))
print("ValueCounts: {0} \n\n\n".format((df[col].value_counts(normalize=True) * 100).head(5)))
print("------------------------------------------------------------------")
# Return the percentage of null values in each column of a dataframe
def check_cols_null_pct(df):
df_non_na = df.count() / len(df) # Ratio of non null values
df_na_pct = (1 - df_non_na) * 100 # Find the Percentage of null values
return df_na_pct.sort_values(ascending=False) # Sort the resulting values in descending order
# Generate charts for each column as part of the univariate analysis, based on the feature type.
# Takes a dataframe, columns, an optional target column, the feature type, and an optional label dict as args.
def univariate_plots(df, cols, target=None, ftype=None, l_dict = None):
for col in cols:
# Plots for categorical features: pie chart, countplot, and a barplot against the target.
if ftype == "categorical":
fig, axs = plt.subplots(1, 3, figsize=(20, 6))
col_idx = 0
axs[col_idx].pie(x=df[col].value_counts().head(12), labels=df[col].value_counts().head(12).index.str[:10], autopct="%1.1f%%",
radius=1, textprops={"fontsize": 10, "color": "Black"}, startangle=90, rotatelabels=False, )
axs[col_idx].set_title("PieChart of {0}".format(col), y=1); plt.xticks(rotation=45); plt.ylabel("Percentage")
fig.subplots_adjust(wspace=0.5, hspace=0.3)
col_idx += 1
sns.countplot(data=df, y=col, order=df[col].value_counts().head(15).index, palette="viridis", ax=axs[col_idx])
if (l_dict is not None) and (l_dict.get(col) is not None):
axs[col_idx].legend([ f'{k} - {v}' for k,v in l_dict[col].items()])
axs[col_idx].set_title("Countplot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col); plt.ylabel("Count")
fig.subplots_adjust(wspace=0.5, hspace=0.3)
col_idx += 1
ax = sns.barplot(data=df, x=df[col].str[:10], y=target, order=df[col].value_counts().index.str[:10], palette="viridis", ax=axs[col_idx], errwidth=0)
for i in ax.containers:
ax.bar_label(i,)
axs[col_idx].set_title('Barplot against target'); plt.xticks(rotation=90); plt.xlabel(col)
fig.subplots_adjust(wspace=0.5, hspace=0.3)
plt.suptitle("Univariate analysis of {0}".format(col), fontsize=12, y=0.95)
plt.tight_layout()
plt.subplots_adjust(top=0.85)
plt.show();
plt.clf()
#generate plots and graphs for numerical types. (generates boxplot, histplot, kdeplot, scatterplot)
elif ftype == "non_categorical":
fig, axs = plt.subplots(1, 4, figsize=(20, 6))
col_idx = 0
sns.boxplot(data=df, y=col, palette="viridis", flierprops=dict(marker="o", markersize=6, markerfacecolor="red", markeredgecolor="black"),
medianprops=dict(linestyle="-", linewidth=3, color="#FF9900"), whiskerprops=dict(linestyle="-", linewidth=2, color="black"),
capprops=dict(linestyle="-", linewidth=2, color="black"), ax=axs[col_idx])
axs[col_idx].set_title("Boxplot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
fig.subplots_adjust(wspace=0.5, hspace=0.3)
col_idx += 1
axs[col_idx].hist(data=df, x=col, label=col)
axs[col_idx].set_title("Histogram of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
fig.subplots_adjust(wspace=0.5, hspace=0.3)
col_idx += 1
sns.kdeplot(df[col], shade=True, ax=axs[col_idx])
axs[col_idx].set_title("KDE plot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
fig.subplots_adjust(wspace=0.5, hspace=0.3)
col_idx += 1
sns.scatterplot(df[col], ax=axs[col_idx])
axs[col_idx].set_title("Scatterplot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
fig.subplots_adjust(wspace=0.5, hspace=0.3)
plt.suptitle("Univariate analysis of {0}".format(col), fontsize=12, y=0.95)
plt.tight_layout()
plt.subplots_adjust(top=0.85)
plt.show()
plt.clf()
# Perform Outlier analysis on the given dataframe.
# Find Lower threshold, Upper threshold and IQR values.
# Return the Result as a dataframe.
# find_outlier=True returns only the rows that fall outside the thresholds, whereas find_outlier=False returns the thresholds for all columns
def get_extremeval_threshld(df, find_outlier=False):
outlier_df = pd.DataFrame(columns=[i for i in df.columns if find_outlier == True], data=None)
for col in df.columns:
thirdq, firstq = df[col].quantile(0.75), df[col].quantile(0.25)
iqr = 1.5 * (thirdq - firstq)
extvalhigh, extvallow = iqr + thirdq, firstq - iqr
if find_outlier == True:
dfout = df.loc[(df[col] > extvalhigh) | (df[col] < extvallow)]
dfout = dfout.assign(name=col, thresh_low=extvallow, thresh_high=extvalhigh)
else:
dfout = pd.DataFrame([[col, extvallow, extvalhigh]], columns=['name', 'thresh_low', 'thresh_high'])
outlier_df = pd.concat([outlier_df, dfout])
# outlier_df = outlier_df.reset_index(drop=True)
outlier_df = outlier_df.set_index('name',drop=True)
return outlier_df
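As a quick sanity check of the 1.5 × IQR rule applied above: for a column with Q1 = 1 and Q3 = 5, the whisker span is 1.5 × (5 − 1) = 6, so the thresholds are −5 and 11 (this matches the totalvisits thresholds computed later). A toy sketch with hypothetical values:
# Toy illustration of the IQR thresholds used by get_extremeval_threshld (hypothetical values).
s = pd.Series([0, 1, 3, 5, 40])
q1, q3 = s.quantile(0.25), s.quantile(0.75)   # 1.0, 5.0
span = 1.5 * (q3 - q1)                        # 6.0
print(q1 - span, q3 + span)                   # -5.0 11.0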
Data Cleaning, Column Renaming, Missing Value Imputation, Feature Selection¶
# Validate uniqueness of the index, the column names and the ID columns (no duplicate rows)
print(f"{lead_score_df.index.is_unique}, {lead_score_df.columns.is_unique}, {lead_score_df['Prospect ID'].is_unique}, {lead_score_df['Lead Number'].is_unique}")
True, True, True, True
# drop unnecessary columns
lead_score_df = lead_score_df.drop(columns=['Prospect ID','Lead Number', 'I agree to pay the amount through cheque', 'Last Notable Activity'])
# rename columns that are too long
lead_score_df = lead_score_df.rename(columns={'Total Time Spent on Website':'ttime_on_site', 'Page Views Per Visit':'pg_view_pv', 'How did you hear about X Education':'info_abt_X_Edu', 'What is your current occupation':'curr_occupation',
'What matters most to you in choosing a course':'reason_behind_course', 'Receive More Updates About Our Courses':'more_course_updates', 'Update me on Supply Chain Content':'supply_chain_info', 'Get updates on DM Content':'get_dm',
'Asymmetrique Activity Index':'asym_activ_idx', 'Asymmetrique Profile Index':'asym_prof_idx', 'Asymmetrique Activity Score':'asym_activ_score', 'Asymmetrique Profile Score':'asym_prof_score',
'A free copy of Mastering The Interview':'avail_free_copy'})
# Replace spaces in column names with underscores and convert them to lower case
lead_score_df.columns = lead_score_df.columns.str.replace(pat=' ',repl='_', regex=True)
lead_score_df.columns = lead_score_df.columns.str.lower()
# Check the first row and the data types of the data frame
lead_score_df.head(1)
lead_score_df.dtypes
lead_origin | lead_source | do_not_email | do_not_call | converted | totalvisits | ttime_on_site | pg_view_pv | last_activity | country | specialization | info_abt_x_edu | curr_occupation | reason_behind_course | search | magazine | newspaper_article | x_education_forums | newspaper | digital_advertisement | through_recommendations | more_course_updates | tags | lead_quality | supply_chain_info | get_dm | lead_profile | city | asym_activ_idx | asym_prof_idx | asym_activ_score | asym_prof_score | avail_free_copy | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | API | Olark Chat | No | No | 0 | 0.000 | 0 | 0.000 | Page Visited on Website | NaN | Select | Select | Unemployed | Better Career Prospects | No | No | No | No | No | No | No | No | Interested in other courses | Low in Relevance | No | No | Select | Select | 02.Medium | 02.Medium | 15.000 | 15.000 | No |
lead_origin object lead_source object do_not_email object do_not_call object converted int64 totalvisits float64 ttime_on_site int64 pg_view_pv float64 last_activity object country object specialization object info_abt_x_edu object curr_occupation object reason_behind_course object search object magazine object newspaper_article object x_education_forums object newspaper object digital_advertisement object through_recommendations object more_course_updates object tags object lead_quality object supply_chain_info object get_dm object lead_profile object city object asym_activ_idx object asym_prof_idx object asym_activ_score float64 asym_prof_score float64 avail_free_copy object dtype: object
print(f'{lead_score_df.shape}, {lead_score_df.size}')
(9240, 33), 304920
Constant Feature Identification¶
# Check for constant features that have only one value.
# In the given data set there are several features with only a single category value.
# These are called constant features; they carry little information for a machine learning model, hence we drop them.
constant_features = get_constant_features(lead_score_df)
constant_features.head(10)
"','".join(constant_features['Var'].to_list())
Desc | Var | Value | Perc | |
---|---|---|---|---|
0 | Constant | magazine | No | 100.000 |
1 | Constant | more_course_updates | No | 100.000 |
2 | Constant | supply_chain_info | No | 100.000 |
3 | Constant | get_dm | No | 100.000 |
4 | Quasi Constant | x_education_forums | No | 99.989 |
5 | Quasi Constant | newspaper | No | 99.989 |
6 | Quasi Constant | do_not_call | No | 99.978 |
7 | Quasi Constant | newspaper_article | No | 99.978 |
8 | Quasi Constant | digital_advertisement | No | 99.957 |
9 | Quasi Constant | through_recommendations | No | 99.924 |
"magazine','more_course_updates','supply_chain_info','get_dm','x_education_forums','newspaper','do_not_call','newspaper_article','digital_advertisement','through_recommendations','search"
# drop all the constant_features
lead_score_df = lead_score_df.drop(['magazine', 'more_course_updates', 'supply_chain_info', 'get_dm', 'x_education_forums',
'newspaper', 'do_not_call', 'newspaper_article', 'digital_advertisement', 'through_recommendations', 'search',], axis=1)
# Inspect the values of the reason_behind_course column.
# Since it is also almost constant (a single dominant value), we drop this column too.
lead_score_df['reason_behind_course'].value_counts(normalize=True)*100
lead_score_df = lead_score_df.drop(['reason_behind_course'], axis=1)
# Drop the country column too, since it is heavily dominated by India and contributes little to the model,
# and then check whether any null values are present in the remaining columns.
lead_score_df[['country']].value_counts(normalize=True)*100
lead_score_df = lead_score_df.drop(['country'], axis=1)
reason_behind_course Better Career Prospects 99.954 Flexibility & Convenience 0.031 Other 0.015 Name: proportion, dtype: float64
country India 95.766 United States 1.018 United Arab Emirates 0.782 Singapore 0.354 Saudi Arabia 0.310 United Kingdom 0.221 Australia 0.192 Qatar 0.148 Bahrain 0.103 Hong Kong 0.103 France 0.089 Oman 0.089 unknown 0.074 Kuwait 0.059 Nigeria 0.059 South Africa 0.059 Germany 0.059 Canada 0.059 Sweden 0.044 Uganda 0.030 Philippines 0.030 Asia/Pacific Region 0.030 Italy 0.030 Ghana 0.030 China 0.030 Belgium 0.030 Bangladesh 0.030 Netherlands 0.030 Malaysia 0.015 Liberia 0.015 Russia 0.015 Kenya 0.015 Indonesia 0.015 Sri Lanka 0.015 Switzerland 0.015 Tanzania 0.015 Denmark 0.015 Vietnam 0.015 Name: proportion, dtype: float64
Missing Value Imputation¶
# Check the null value percentage.
# After checking the null value percentage for all the features,
# we can see that several features have more than 45% null values.
null_pct = check_cols_null_pct(lead_score_df)
null_pct[null_pct > 0]
lead_quality 51.591 asym_activ_idx 45.649 asym_prof_score 45.649 asym_activ_score 45.649 asym_prof_idx 45.649 tags 36.288 lead_profile 29.318 curr_occupation 29.113 info_abt_x_edu 23.885 specialization 15.563 city 15.368 totalvisits 1.483 pg_view_pv 1.483 last_activity 1.115 lead_source 0.390 dtype: float64
# Check the null value percentage again.
# Since the 'Select' level is effectively a null value for certain columns, the real null percentage of those features is even higher.
# Therefore we drop all the features that have more than 40% null values.
null_pct = check_cols_null_pct(lead_score_df)
lead_score_df = lead_score_df.drop(null_pct[null_pct > 40].index, axis=1)
null_pct = check_cols_null_pct(lead_score_df)
null_pct[null_pct > 0]
tags 36.288 lead_profile 29.318 curr_occupation 29.113 info_abt_x_edu 23.885 specialization 15.563 city 15.368 totalvisits 1.483 pg_view_pv 1.483 last_activity 1.115 lead_source 0.390 dtype: float64
show_stats(lead_score_df,['tags','specialization'])
Total Nulls: 3353, Mode: Will revert after reading the email Unique: ['Interested in other courses' 'Ringing' 'Will revert after reading the email' nan 'Lost to EINS' 'In confusion whether part time or DLP' 'Busy' 'switched off' 'in touch with EINS' 'Already a student' 'Diploma holder (Not Eligible)' 'Graduation in progress' 'Closed by Horizzon' 'number not provided' 'opp hangup' 'Not doing further education' 'invalid number' 'wrong number given' 'Interested in full time MBA' 'Still Thinking' 'Lost to Others' 'Shall take in the next coming month' 'Lateral student' 'Interested in Next batch' 'Recognition issue (DEC approval)' 'Want to take admission but has financial problems' 'University not recognized'] ValueCounts: tags Will revert after reading the email 35.196 Ringing 20.435 Interested in other courses 8.714 Already a student 7.899 Closed by Horizzon 6.081 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 1438, Mode: Select Unique: ['Select' 'Business Administration' 'Media and Advertising' nan 'Supply Chain Management' 'IT Projects Management' 'Finance Management' 'Travel and Tourism' 'Human Resource Management' 'Marketing Management' 'Banking, Investment And Insurance' 'International Business' 'E-COMMERCE' 'Operations Management' 'Retail Management' 'Services Excellence' 'Hospitality Management' 'Rural and Agribusiness' 'Healthcare Management' 'E-Business'] ValueCounts: specialization Select 24.891 Finance Management 12.510 Human Resource Management 10.869 Marketing Management 10.741 Operations Management 6.447 Name: proportion, dtype: float64 ------------------------------------------------------------------
# Replace 'Select' and impute the null values of the 'tags', 'specialization', 'info_abt_x_edu', 'lead_profile', 'city', 'curr_occupation' columns with 'unknown'
lead_score_df[['tags','specialization','info_abt_x_edu', 'lead_profile', 'city', 'curr_occupation']] = lead_score_df[['tags','specialization','info_abt_x_edu', 'lead_profile', 'city', 'curr_occupation']].replace(to_replace=['select', 'Select', np.nan], value='unknown')
# validate select str is replaced
[i for i in lead_score_df.columns if 'select' in (lead_score_df[i].astype(str).str.lower()).str.findall('select').value_counts().index.map(''.join).to_list()]
[]
null_pct = check_cols_null_pct(lead_score_df)
null_pct[null_pct > 0]
totalvisits 1.483 pg_view_pv 1.483 last_activity 1.115 lead_source 0.390 dtype: float64
# convert dtypes
# Convert the data type of certain features from object to category type
obj_cols = lead_score_df.select_dtypes(include='object').columns
lead_score_df[obj_cols] = lead_score_df[obj_cols].astype(dtype='category')
# impute missing categorical values using mode, if a particular value in that column has higher frequency say > 60%
for i in lead_score_df.select_dtypes(include='category'):
temp = lead_score_df[i].value_counts(normalize=True, ascending=False) * 100
if temp.iloc[0] > 60:
if lead_score_df[i].isna().sum()>0:
print(i)
lead_score_df[i] = lead_score_df[i].fillna(temp.index[0])
# Now only four columns remain with very few null values
null_pct = check_cols_null_pct(lead_score_df)
null_pct[null_pct > 0]
totalvisits 1.483 pg_view_pv 1.483 last_activity 1.115 lead_source 0.390 dtype: float64
----------------------------------------------------------------------¶
3. Data Visualization - EDA¶
# univariate plots
# Now we perform univariate analysis on both categorical and numerical variables
dtype_dict = classify_feature_dtype(lead_score_df, lead_score_df.columns )
univariate_plots(lead_score_df, dtype_dict['float_ts'], ftype='non_categorical', target='converted')
Observations¶
- There are outliers in the totalvisits and pg_view_pv (page views per visit) features.
- The target feature "converted" has more 0's than 1's.
- The box plot for totalvisits shows outliers; the majority of values fall between 0 and 30.
- The box plot for pg_view_pv shows outliers; the majority of values fall between 0 and 20.
# univariate plots
cols = dtype_dict['int_cat'].copy()
cols.remove('converted')
univariate_plots(lead_score_df, cols, ftype='categorical', target='converted')
Observations¶
The categorical analysis¶
- More than 50 percent of the leads originated via Landing Page Submission, followed by API with around 38% of the leads.
- Among lead origins, Lead Add Form has a noticeably higher conversion rate; the remaining origins have broadly similar conversion probabilities.
- For lead source, around 30% are from Google, 27% from Direct Traffic and 19% from Olark Chat.
- For the do-not-email feature, around 90% chose No while around 7% chose Yes, so the majority of them are open to communication from the edtech platform.
- The people who said No also have a higher conversion rate.
- For the last activity feature, around 38% of users have Email Opened, followed by SMS Sent, so the majority of users are active on email conversations.
- The majority of the users are from India, so we can assume that edtech has higher popularity/interest in India.
- Among users who reported a specialization, Finance Management is the most common, followed by Human Resource Management and Marketing Management; almost all specializations have a similar conversion rate.
- The majority of the users are unemployed, followed by working professionals.
- Working professionals have a higher conversion rate, although their count is low compared to unemployed users.
- The majority of the users chose Better Career Prospects as the reason for opting for the course.
- The majority of the leads have been tagged as "Will revert after reading the email".
- The maximum number of leads are from Mumbai, followed by Thane & Outskirts; almost all city categories have similar conversion probabilities.
- Around 58% of the leads opted not to avail the free copy of 'Mastering The Interview', while around 31% opted for it.
# Bivariate plots
# The pair plot view shows that there is a slight correlation between total visits and page view per visit
sns.pairplot(lead_score_df)
plt.show();
# multivariate plots
# The heat map shows that the majority of the features have no correlation or a correlation of less than 0.5
plt.figure(figsize = (20, 4)) # Size of the figure
sns.heatmap(lead_score_df.select_dtypes(exclude='category').corr(), annot = True)
plt.show();
Observations:
- There is not much correlation between the numeric variables
3.1 Other Bivariate - Multivariate plots (computationally intensive)¶
# Boxplots - numerical features against target
axs = 131
plt.figure(figsize=(26, 6))
for i in list(set(dtype_dict['float_ts']) - set(['date','converted','lead_number'])):
plt.subplot(axs)
sns.boxplot(y=i, x='converted', data=lead_score_df, palette='tab10')
axs += 1
plt.show();
# Generate bivariate boxplot combinations for all the categorical vs continuous columns
x_lst = list(dtype_dict['int_cat']) # x_list variable contains all the categorical Columns
y_lst = list(dtype_dict['float_ts'])# y_list contains all Continuous feature type columns
axs = 1
for x_col in x_lst:
plt.figure(figsize=(26,72))
for y_col in y_lst:
plt.subplot(18,4,axs)
sns.boxplot(x=x_col, y=y_col, data=lead_score_df, palette='tab10')
plt.xticks(rotation=90)
axs += 1
plt.show();
axs = 1
plt.show();
Observations:
- Leads with a converted value of 1 have spent more time on the site.
- Leads that originated via Landing Page Submission spent more time on the site and also have higher totalvisits.
- Users who opted for email also spent more time on the site.
- Leads whose totalvisits are high also show higher conversion and enrolled for the course.
- Unemployed users spent more time on the site and also opted for the course.
----------------------------------------------------------------------¶
4. Data Preprocessing - Part 2¶
Outlier Analysis and Capping¶
# During EDA we identified outliers in the pg_view_pv, totalvisits and ttime_on_site columns; we therefore cap those values at the lower and upper cut-offs of the IQR range
ex_val_df = get_extremeval_threshld(df=lead_score_df.select_dtypes(exclude=['category','object']) )
ex_val_df
lead_score_df.describe(percentiles=[.05,.1,.2,.5,.8,.9])
thresh_low | thresh_high | |
---|---|---|
name | ||
converted | -1.500 | 2.500 |
totalvisits | -5.000 | 11.000 |
ttime_on_site | -1374.000 | 2322.000 |
pg_view_pv | -2.000 | 6.000 |
converted | totalvisits | ttime_on_site | pg_view_pv | |
---|---|---|---|---|
count | 9240.000 | 9103.000 | 9240.000 | 9103.000 |
mean | 0.385 | 3.445 | 487.698 | 2.363 |
std | 0.487 | 4.855 | 548.021 | 2.161 |
min | 0.000 | 0.000 | 0.000 | 0.000 |
5% | 0.000 | 0.000 | 0.000 | 0.000 |
10% | 0.000 | 0.000 | 0.000 | 0.000 |
20% | 0.000 | 0.000 | 0.000 | 0.000 |
50% | 0.000 | 3.000 | 248.000 | 2.000 |
80% | 1.000 | 5.000 | 1087.200 | 4.000 |
90% | 1.000 | 7.000 | 1380.000 | 5.000 |
max | 1.000 | 251.000 | 2272.000 | 55.000 |
# Cap outliers in pg_view_pv by clamping both extremes to thresh_low / thresh_high
lower_cutoff = ex_val_df.loc['pg_view_pv','thresh_low']
lead_score_df['pg_view_pv'] = np.where((lead_score_df['pg_view_pv'] < lower_cutoff), lower_cutoff, lead_score_df['pg_view_pv'])
upper_cutoff = ex_val_df.loc['pg_view_pv','thresh_high']
lead_score_df['pg_view_pv'] = np.where((lead_score_df['pg_view_pv'] > upper_cutoff), upper_cutoff, lead_score_df['pg_view_pv'])
# Cap outliers in totalvisits by clamping both extremes to thresh_low / thresh_high
lower_cutoff = ex_val_df.loc['totalvisits','thresh_low']
lead_score_df['totalvisits'] = np.where((lead_score_df['totalvisits'] < lower_cutoff), lower_cutoff, lead_score_df['totalvisits'])
upper_cutoff = ex_val_df.loc['totalvisits','thresh_high']
lead_score_df['totalvisits'] = np.where((lead_score_df['totalvisits'] > upper_cutoff), upper_cutoff, lead_score_df['totalvisits'])
# Cap outliers in ttime_on_site by clamping both extremes to thresh_low / thresh_high
lower_cutoff = ex_val_df.loc['ttime_on_site','thresh_low']
lead_score_df['ttime_on_site'] = np.where((lead_score_df['ttime_on_site'] < lower_cutoff), lower_cutoff, lead_score_df['ttime_on_site'])
upper_cutoff = ex_val_df.loc['ttime_on_site','thresh_high']
lead_score_df['ttime_on_site'] = np.where((lead_score_df['ttime_on_site'] > upper_cutoff), upper_cutoff, lead_score_df['ttime_on_site'])
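The three blocks above apply the same capping logic column by column; an equivalent, more compact form (a sketch reusing the ex_val_df thresholds computed above) is pandas clip:
# Equivalent capping with Series.clip: same thresholds, same result; NaNs are left untouched.
for c in ['pg_view_pv', 'totalvisits', 'ttime_on_site']:
    lead_score_df[c] = lead_score_df[c].clip(lower=ex_val_df.loc[c, 'thresh_low'],
                                             upper=ex_val_df.loc[c, 'thresh_high'])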
lead_score_df.describe()
converted | totalvisits | ttime_on_site | pg_view_pv | |
---|---|---|---|---|
count | 9240.000 | 9103.000 | 9240.000 | 9103.000 |
mean | 0.385 | 3.221 | 487.698 | 2.259 |
std | 0.487 | 2.882 | 548.021 | 1.793 |
min | 0.000 | 0.000 | 0.000 | 0.000 |
25% | 0.000 | 1.000 | 12.000 | 1.000 |
50% | 0.000 | 3.000 | 248.000 | 2.000 |
75% | 1.000 | 5.000 | 936.000 | 3.000 |
max | 1.000 | 11.000 | 2272.000 | 6.000 |
# our null values have significantly reduced
null_pct = check_cols_null_pct(lead_score_df)
null_pct[null_pct>0]
lead_score_df.shape
totalvisits 1.483 pg_view_pv 1.483 last_activity 1.115 lead_source 0.390 dtype: float64
(9240, 15)
# Drop the rows that have nulls in the 'last_activity', 'lead_source', 'totalvisits', 'pg_view_pv' columns, as they are small in number
lead_score_df = lead_score_df.dropna(subset=['last_activity','lead_source','totalvisits','pg_view_pv'])
### there are no null values
null_pct = check_cols_null_pct(lead_score_df)
null_pct[null_pct>0]
Series([], dtype: float64)
lead_score_df.shape
lead_score_df.dtypes
(9074, 15)
lead_origin category lead_source category do_not_email category converted int64 totalvisits float64 ttime_on_site float64 pg_view_pv float64 last_activity category specialization category info_abt_x_edu category curr_occupation category tags category lead_profile category city category avail_free_copy category dtype: object
Univariate analysis Post Missing value Imputation and Outlier Capping¶
dtype_dict = classify_feature_dtype(lead_score_df, lead_score_df.columns )
univariate_plots(lead_score_df, dtype_dict['float_ts'], ftype='non_categorical', target='converted')
cols = dtype_dict['int_cat'].copy()
cols.remove('converted')
univariate_plots(lead_score_df, cols, ftype='categorical', target='converted')
# replace Yes, No with 1 and 0
lead_score_df = lead_score_df.replace(to_replace=['Yes', 'No'], value=[1, 0])
lead_score_df.describe(include=np.number)
lead_score_df.describe(exclude=np.number)
converted | totalvisits | ttime_on_site | pg_view_pv | |
---|---|---|---|---|
count | 9074.000 | 9074.000 | 9074.000 | 9074.000 |
mean | 0.379 | 3.231 | 482.887 | 2.266 |
std | 0.485 | 2.881 | 545.257 | 1.791 |
min | 0.000 | 0.000 | 0.000 | 0.000 |
25% | 0.000 | 1.000 | 11.000 | 1.000 |
50% | 0.000 | 3.000 | 246.000 | 2.000 |
75% | 1.000 | 5.000 | 922.750 | 3.200 |
max | 1.000 | 11.000 | 2272.000 | 6.000 |
lead_origin | lead_source | do_not_email | last_activity | specialization | info_abt_x_edu | curr_occupation | tags | lead_profile | city | avail_free_copy | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 9074 | 9074 | 9074 | 9074 | 9074 | 9074 | 9074 | 9074 | 9074 | 9074 | 9074 |
unique | 4 | 21 | 2 | 17 | 19 | 10 | 7 | 27 | 6 | 7 | 2 |
top | Landing Page Submission | 0 | Email Opened | unknown | unknown | Unemployed | unknown | unknown | unknown | 0 | |
freq | 4885 | 2868 | 8358 | 3432 | 3282 | 7086 | 5476 | 3327 | 6757 | 3575 | 6186 |
----------------------------------------------------------------------¶
5. Data Imbalance & Conversion Ratio¶
# Data Imbalance
# The ratio of converted to non-converted leads is around 61%, so we decide not to rebalance the classes
imbalance_ratio = sum(lead_score_df['converted'] == 1)/sum(lead_score_df['converted'] == 0) * 100
print(f'{round(imbalance_ratio, 2)}%')
60.92%
# Conversion Ratio
# The conversion ratio is around 38%, which shows that the majority of leads fail to convert
converted = (sum(lead_score_df['converted'])/len(lead_score_df['converted'].index))*100
print(f'{round(converted, 2)}%')
37.86%
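Both figures can also be read from a single normalized value count of the target (a small sketch over the same column):
# Class distribution of the target: roughly 62% not converted vs 38% converted.
class_pct = lead_score_df['converted'].value_counts(normalize=True) * 100
print(class_pct.round(2))
print(f'{round(class_pct.loc[1] / class_pct.loc[0] * 100, 2)}%')   # converted-to-non-converted ratio, ~60.92%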
-----------------------------------------------------¶
# errorline # do no remove this line
-----------------------------------------------------¶
Dummy Encoding¶
# we perform dummy encoding
new_ls_df = pd.get_dummies(lead_score_df, columns=lead_score_df.select_dtypes('category').columns.difference(['tags','specialization','info_abt_x_edu', 'lead_profile', 'city', 'curr_occupation']), drop_first=True, dtype=float)
new_ls_df.head(1)
new_ls_df.shape
converted | totalvisits | ttime_on_site | pg_view_pv | specialization | info_abt_x_edu | curr_occupation | tags | lead_profile | city | avail_free_copy_1 | do_not_email_1 | last_activity_Converted to Lead | last_activity_Email Bounced | last_activity_Email Link Clicked | last_activity_Email Marked Spam | last_activity_Email Opened | last_activity_Email Received | last_activity_Form Submitted on Website | last_activity_Had a Phone Conversation | last_activity_Olark Chat Conversation | last_activity_Page Visited on Website | last_activity_Resubscribed to emails | last_activity_SMS Sent | last_activity_Unreachable | last_activity_Unsubscribed | last_activity_View in browser link Clicked | last_activity_Visited Booth in Tradeshow | lead_origin_Landing Page Submission | lead_origin_Lead Add Form | lead_origin_Lead Import | lead_origin_Quick Add Form | lead_source_Direct Traffic | lead_source_Facebook | lead_source_Google | lead_source_Live Chat | lead_source_NC_EDM | lead_source_Olark Chat | lead_source_Organic Search | lead_source_Pay per Click Ads | lead_source_Press_Release | lead_source_Reference | lead_source_Referral Sites | lead_source_Social Media | lead_source_WeLearn | lead_source_Welingak Website | lead_source_bing | lead_source_blog | lead_source_google | lead_source_testone | lead_source_welearnblog_Home | lead_source_youtubechannel | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.000 | 0.000 | 0.000 | unknown | unknown | Unemployed | Interested in other courses | unknown | unknown | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
(9074, 52)
# Create dummy variables separately for the 'tags', 'specialization', 'info_abt_x_edu', 'lead_profile', 'city', 'curr_occupation' columns since they contain the level 'unknown' (the former 'Select'/null values),
# which is uninformative, so we drop that level explicitly after encoding
new_ls_df = pd.get_dummies(new_ls_df, columns=['tags', 'specialization', 'info_abt_x_edu', 'lead_profile', 'city', 'curr_occupation'], dtype=float)
# Drop the variables for which the dummy variables have been created
new_ls_df = new_ls_df.drop(new_ls_df.filter(like='unknown',axis=1).columns, axis=1)
new_ls_df.shape
(9074, 116)
new_ls_df.head(2)
converted | totalvisits | ttime_on_site | pg_view_pv | avail_free_copy_1 | do_not_email_1 | last_activity_Converted to Lead | last_activity_Email Bounced | last_activity_Email Link Clicked | last_activity_Email Marked Spam | last_activity_Email Opened | last_activity_Email Received | last_activity_Form Submitted on Website | last_activity_Had a Phone Conversation | last_activity_Olark Chat Conversation | last_activity_Page Visited on Website | last_activity_Resubscribed to emails | last_activity_SMS Sent | last_activity_Unreachable | last_activity_Unsubscribed | last_activity_View in browser link Clicked | last_activity_Visited Booth in Tradeshow | lead_origin_Landing Page Submission | lead_origin_Lead Add Form | lead_origin_Lead Import | lead_origin_Quick Add Form | lead_source_Direct Traffic | lead_source_Facebook | lead_source_Google | lead_source_Live Chat | lead_source_NC_EDM | lead_source_Olark Chat | lead_source_Organic Search | lead_source_Pay per Click Ads | lead_source_Press_Release | lead_source_Reference | lead_source_Referral Sites | lead_source_Social Media | lead_source_WeLearn | lead_source_Welingak Website | lead_source_bing | lead_source_blog | lead_source_google | lead_source_testone | lead_source_welearnblog_Home | lead_source_youtubechannel | tags_Already a student | tags_Busy | tags_Closed by Horizzon | tags_Diploma holder (Not Eligible) | tags_Graduation in progress | tags_In confusion whether part time or DLP | tags_Interested in full time MBA | tags_Interested in Next batch | tags_Interested in other courses | tags_Lateral student | tags_Lost to EINS | tags_Lost to Others | tags_Not doing further education | tags_Recognition issue (DEC approval) | tags_Ringing | tags_Shall take in the next coming month | tags_Still Thinking | tags_University not recognized | tags_Want to take admission but has financial problems | tags_Will revert after reading the email | tags_in touch with EINS | tags_invalid number | tags_number not provided | tags_opp hangup | tags_switched off | tags_wrong number given | specialization_Banking, Investment And Insurance | specialization_Business Administration | specialization_E-Business | specialization_E-COMMERCE | specialization_Finance Management | specialization_Healthcare Management | specialization_Hospitality Management | specialization_Human Resource Management | specialization_IT Projects Management | specialization_International Business | specialization_Marketing Management | specialization_Media and Advertising | specialization_Operations Management | specialization_Retail Management | specialization_Rural and Agribusiness | specialization_Services Excellence | specialization_Supply Chain Management | specialization_Travel and Tourism | info_abt_x_edu_Advertisements | info_abt_x_edu_Email | info_abt_x_edu_Multiple Sources | info_abt_x_edu_Online Search | info_abt_x_edu_Other | info_abt_x_edu_SMS | info_abt_x_edu_Social Media | info_abt_x_edu_Student of SomeSchool | info_abt_x_edu_Word Of Mouth | lead_profile_Dual Specialization Student | lead_profile_Lateral Student | lead_profile_Other Leads | lead_profile_Potential Lead | lead_profile_Student of SomeSchool | city_Mumbai | city_Other Cities | city_Other Cities of Maharashtra | city_Other Metro Cities | city_Thane & Outskirts | city_Tier II Cities | curr_occupation_Businessman | curr_occupation_Housewife | curr_occupation_Other | curr_occupation_Student | curr_occupation_Unemployed | curr_occupation_Working Professional | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 |
1 | 0 | 5.000 | 674.000 | 2.500 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 |
corr = new_ls_df[new_ls_df.select_dtypes(exclude=['category','object']).columns].corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(11, 9))
with sns.axes_style("white"):
ax = sns.heatmap(corr, mask=mask, vmax=.3, square=True)
corr_0 = new_ls_df.corr(numeric_only=True).abs()
corr_0 = corr_0.unstack()
correlation_0 = corr_0.sort_values()
correlation_0 = correlation_0.dropna()
# correlation_0
correlation_0 = correlation_0[correlation_0 != 1.0]
correlation_target_zero = correlation_0.reset_index()
correlation_target_zero.sort_values(by=0, ascending=False).head(10)
level_0 | level_1 | 0 | |
---|---|---|---|
2761 | lead_origin_Lead Import | lead_source_Facebook | 0.984 |
2988 | lead_source_Facebook | lead_origin_Lead Import | 0.984 |
3899 | lead_source_Reference | lead_origin_Lead Add Form | 0.866 |
2655 | lead_origin_Lead Add Form | lead_source_Reference | 0.866 |
343 | pg_view_pv | totalvisits | 0.753 |
116 | totalvisits | pg_view_pv | 0.753 |
5231 | tags_Already a student | lead_profile_Student of SomeSchool | 0.669 |
11673 | lead_profile_Student of SomeSchool | tags_Already a student | 0.669 |
7296 | tags_Will revert after reading the email | converted | 0.653 |
63 | converted | tags_Will revert after reading the email | 0.653 |
----------------------------------------------------------------------¶
Train and Test Split¶
- Split the dataset into training and testing sets.
# Split the data into independent variables and the target variable
X = new_ls_df.drop(['converted'], axis=1)
y = new_ls_df['converted']
# Split the dataset into 70% train and 30% test, and set the random state to 100
# np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, test_size=0.30, random_state=100)
# Check the shape of the train dataset and the test dataset
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
(6351, 115) (2723, 115) (6351,) (2723,)
----------------------------------------------------------------------¶
Feature Scaling¶
- Now there are a few numeric variables present in the dataset which have different scales
# After the split we perform standard scaling: fit the scaler on the training data and transform it (the test set should only be transformed with the same fitted scaler)
# to_scale = ['totalvisits', 'ttime_on_site', 'pg_view_pv']
to_scale = list(X.columns)
scaler = StandardScaler()
X_train[to_scale] = scaler.fit_transform(X_train[to_scale])
X_train.head(5)
totalvisits | ttime_on_site | pg_view_pv | avail_free_copy_1 | do_not_email_1 | last_activity_Converted to Lead | last_activity_Email Bounced | last_activity_Email Link Clicked | last_activity_Email Marked Spam | last_activity_Email Opened | last_activity_Email Received | last_activity_Form Submitted on Website | last_activity_Had a Phone Conversation | last_activity_Olark Chat Conversation | last_activity_Page Visited on Website | last_activity_Resubscribed to emails | last_activity_SMS Sent | last_activity_Unreachable | last_activity_Unsubscribed | last_activity_View in browser link Clicked | last_activity_Visited Booth in Tradeshow | lead_origin_Landing Page Submission | lead_origin_Lead Add Form | lead_origin_Lead Import | lead_origin_Quick Add Form | lead_source_Direct Traffic | lead_source_Facebook | lead_source_Google | lead_source_Live Chat | lead_source_NC_EDM | lead_source_Olark Chat | lead_source_Organic Search | lead_source_Pay per Click Ads | lead_source_Press_Release | lead_source_Reference | lead_source_Referral Sites | lead_source_Social Media | lead_source_WeLearn | lead_source_Welingak Website | lead_source_bing | lead_source_blog | lead_source_google | lead_source_testone | lead_source_welearnblog_Home | lead_source_youtubechannel | tags_Already a student | tags_Busy | tags_Closed by Horizzon | tags_Diploma holder (Not Eligible) | tags_Graduation in progress | tags_In confusion whether part time or DLP | tags_Interested in full time MBA | tags_Interested in Next batch | tags_Interested in other courses | tags_Lateral student | tags_Lost to EINS | tags_Lost to Others | tags_Not doing further education | tags_Recognition issue (DEC approval) | tags_Ringing | tags_Shall take in the next coming month | tags_Still Thinking | tags_University not recognized | tags_Want to take admission but has financial problems | tags_Will revert after reading the email | tags_in touch with EINS | tags_invalid number | tags_number not provided | tags_opp hangup | tags_switched off | tags_wrong number given | specialization_Banking, Investment And Insurance | specialization_Business Administration | specialization_E-Business | specialization_E-COMMERCE | specialization_Finance Management | specialization_Healthcare Management | specialization_Hospitality Management | specialization_Human Resource Management | specialization_IT Projects Management | specialization_International Business | specialization_Marketing Management | specialization_Media and Advertising | specialization_Operations Management | specialization_Retail Management | specialization_Rural and Agribusiness | specialization_Services Excellence | specialization_Supply Chain Management | specialization_Travel and Tourism | info_abt_x_edu_Advertisements | info_abt_x_edu_Email | info_abt_x_edu_Multiple Sources | info_abt_x_edu_Online Search | info_abt_x_edu_Other | info_abt_x_edu_SMS | info_abt_x_edu_Social Media | info_abt_x_edu_Student of SomeSchool | info_abt_x_edu_Word Of Mouth | lead_profile_Dual Specialization Student | lead_profile_Lateral Student | lead_profile_Other Leads | lead_profile_Potential Lead | lead_profile_Student of SomeSchool | city_Mumbai | city_Other Cities | city_Other Cities of Maharashtra | city_Other Metro Cities | city_Thane & Outskirts | city_Tier II Cities | curr_occupation_Businessman | curr_occupation_Housewife | curr_occupation_Other | curr_occupation_Student | curr_occupation_Unemployed | curr_occupation_Working Professional | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3009 | -0.431 | -0.160 | -0.155 | 1.473 | -0.293 | -0.216 | -0.189 | -0.170 | -0.018 | 1.294 | -0.018 | -0.109 | -0.052 | -0.343 | -0.276 | -0.013 | -0.668 | -0.106 | -0.079 | -0.031 | -0.013 | 0.930 | -0.271 | -0.062 | 0.000 | 1.611 | -0.063 | -0.686 | -0.018 | -0.013 | -0.479 | -0.382 | -0.013 | -0.013 | -0.233 | -0.116 | -0.018 | 0.000 | -0.127 | -0.022 | -0.013 | -0.025 | -0.013 | -0.013 | -0.013 | -0.227 | -0.139 | -0.189 | -0.093 | -0.109 | -0.025 | -0.116 | -0.025 | -0.249 | -0.018 | -0.135 | -0.033 | -0.123 | -0.013 | -0.386 | -0.013 | -0.025 | -0.018 | -0.025 | -0.541 | -0.036 | -0.096 | -0.055 | -0.064 | -0.160 | -0.070 | -0.195 | -0.205 | -0.081 | -0.110 | 2.929 | -0.134 | -0.112 | -0.323 | -0.207 | -0.142 | -0.317 | -0.151 | -0.241 | -0.110 | -0.094 | -0.062 | -0.198 | -0.150 | -0.084 | -0.055 | -0.131 | -0.316 | -0.142 | -0.055 | -0.087 | -0.188 | -0.198 | -0.045 | -0.047 | -0.234 | -0.454 | -0.160 | 1.354 | -0.284 | -0.225 | -0.209 | -0.293 | -0.090 | -0.031 | -0.033 | -0.040 | -0.151 | -1.239 | -0.287 |
1012 | -0.431 | -0.540 | -0.155 | -0.679 | 3.417 | -0.216 | 5.279 | -0.170 | -0.018 | -0.773 | -0.018 | -0.109 | -0.052 | -0.343 | -0.276 | -0.013 | -0.668 | -0.106 | -0.079 | -0.031 | -0.013 | 0.930 | -0.271 | -0.062 | 0.000 | 1.611 | -0.063 | -0.686 | -0.018 | -0.013 | -0.479 | -0.382 | -0.013 | -0.013 | -0.233 | -0.116 | -0.018 | 0.000 | -0.127 | -0.022 | -0.013 | -0.025 | -0.013 | -0.013 | -0.013 | -0.227 | -0.139 | -0.189 | -0.093 | -0.109 | -0.025 | -0.116 | -0.025 | -0.249 | -0.018 | -0.135 | -0.033 | -0.123 | -0.013 | -0.386 | -0.013 | -0.025 | -0.018 | -0.025 | -0.541 | -0.036 | -0.096 | -0.055 | -0.064 | -0.160 | -0.070 | -0.195 | -0.205 | -0.081 | -0.110 | -0.341 | -0.134 | -0.112 | -0.323 | -0.207 | -0.142 | -0.317 | -0.151 | -0.241 | -0.110 | -0.094 | -0.062 | -0.198 | -0.150 | -0.084 | -0.055 | -0.131 | -0.316 | -0.142 | -0.055 | -0.087 | -0.188 | -0.198 | -0.045 | -0.047 | -0.234 | -0.454 | -0.160 | 1.354 | -0.284 | -0.225 | -0.209 | -0.293 | -0.090 | -0.031 | -0.033 | -0.040 | -0.151 | -1.239 | 3.489 |
9226 | -1.125 | -0.889 | -1.266 | -0.679 | -0.293 | -0.216 | -0.189 | -0.170 | -0.018 | -0.773 | -0.018 | -0.109 | -0.052 | -0.343 | -0.276 | -0.013 | 1.498 | -0.106 | -0.079 | -0.031 | -0.013 | -1.075 | -0.271 | -0.062 | 0.000 | -0.621 | -0.063 | -0.686 | -0.018 | -0.013 | 2.089 | -0.382 | -0.013 | -0.013 | -0.233 | -0.116 | -0.018 | 0.000 | -0.127 | -0.022 | -0.013 | -0.025 | -0.013 | -0.013 | -0.013 | -0.227 | -0.139 | -0.189 | -0.093 | -0.109 | -0.025 | -0.116 | -0.025 | -0.249 | -0.018 | -0.135 | -0.033 | -0.123 | -0.013 | 2.592 | -0.013 | -0.025 | -0.018 | -0.025 | -0.541 | -0.036 | -0.096 | -0.055 | -0.064 | -0.160 | -0.070 | -0.195 | -0.205 | -0.081 | -0.110 | -0.341 | -0.134 | -0.112 | -0.323 | -0.207 | -0.142 | -0.317 | -0.151 | -0.241 | -0.110 | -0.094 | -0.062 | -0.198 | -0.150 | -0.084 | -0.055 | -0.131 | -0.316 | -0.142 | -0.055 | -0.087 | -0.188 | -0.198 | -0.045 | -0.047 | -0.234 | -0.454 | -0.160 | -0.739 | -0.284 | -0.225 | -0.209 | -0.293 | -0.090 | -0.031 | -0.033 | -0.040 | -0.151 | 0.807 | -0.287 |
4750 | -0.431 | 1.643 | -0.155 | -0.679 | -0.293 | -0.216 | -0.189 | -0.170 | -0.018 | -0.773 | -0.018 | -0.109 | -0.052 | -0.343 | -0.276 | -0.013 | 1.498 | -0.106 | -0.079 | -0.031 | -0.013 | 0.930 | -0.271 | -0.062 | 0.000 | 1.611 | -0.063 | -0.686 | -0.018 | -0.013 | -0.479 | -0.382 | -0.013 | -0.013 | -0.233 | -0.116 | -0.018 | 0.000 | -0.127 | -0.022 | -0.013 | -0.025 | -0.013 | -0.013 | -0.013 | -0.227 | -0.139 | -0.189 | -0.093 | -0.109 | -0.025 | -0.116 | -0.025 | -0.249 | -0.018 | -0.135 | -0.033 | -0.123 | -0.013 | -0.386 | -0.013 | -0.025 | -0.018 | -0.025 | -0.541 | -0.036 | -0.096 | -0.055 | -0.064 | -0.160 | -0.070 | -0.195 | -0.205 | -0.081 | -0.110 | -0.341 | -0.134 | -0.112 | -0.323 | -0.207 | -0.142 | 3.154 | -0.151 | -0.241 | -0.110 | -0.094 | -0.062 | -0.198 | -0.150 | -0.084 | -0.055 | 7.639 | -0.316 | -0.142 | -0.055 | -0.087 | -0.188 | -0.198 | -0.045 | -0.047 | -0.234 | -0.454 | -0.160 | -0.739 | 3.525 | -0.225 | -0.209 | -0.293 | -0.090 | -0.031 | -0.033 | -0.040 | -0.151 | -1.239 | -0.287 |
7987 | 0.609 | 2.018 | 0.123 | -0.679 | -0.293 | -0.216 | -0.189 | -0.170 | -0.018 | -0.773 | -0.018 | -0.109 | -0.052 | -0.343 | -0.276 | -0.013 | 1.498 | -0.106 | -0.079 | -0.031 | -0.013 | 0.930 | -0.271 | -0.062 | 0.000 | 1.611 | -0.063 | -0.686 | -0.018 | -0.013 | -0.479 | -0.382 | -0.013 | -0.013 | -0.233 | -0.116 | -0.018 | 0.000 | -0.127 | -0.022 | -0.013 | -0.025 | -0.013 | -0.013 | -0.013 | -0.227 | -0.139 | -0.189 | -0.093 | -0.109 | -0.025 | -0.116 | -0.025 | -0.249 | -0.018 | 7.397 | -0.033 | -0.123 | -0.013 | -0.386 | -0.013 | -0.025 | -0.018 | -0.025 | -0.541 | -0.036 | -0.096 | -0.055 | -0.064 | -0.160 | -0.070 | -0.195 | -0.205 | -0.081 | -0.110 | 2.929 | -0.134 | -0.112 | -0.323 | -0.207 | -0.142 | -0.317 | -0.151 | -0.241 | -0.110 | -0.094 | -0.062 | -0.198 | -0.150 | -0.084 | -0.055 | -0.131 | -0.316 | -0.142 | -0.055 | -0.087 | -0.188 | 5.057 | -0.045 | -0.047 | -0.234 | -0.454 | -0.160 | -0.739 | -0.284 | 4.445 | -0.209 | -0.293 | -0.090 | -0.031 | -0.033 | -0.040 | -0.151 | 0.807 | -0.287 |
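For later evaluation the test split would be transformed with the same fitted scaler rather than refit; a minimal sketch, assuming the X_test, to_scale and scaler defined above (kept on a copy so the original frame is untouched):
# Transform (do not refit) the test features using the scaler fitted on the training data.
X_test_scaled = X_test.copy()
X_test_scaled[to_scale] = scaler.transform(X_test_scaled[to_scale])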
----------------------------------------------------------------------¶
Model Building¶
Custom Functions for Model Training¶
# We create custom functions for model building, since we reuse them across the model iterations.
# The train-and-predict function trains the model, predicts on the same data, and returns the fitted model, the predicted probabilities and the predicted labels based on a cutoff.
# The metrics function returns the confusion matrix and the accuracy score.
# The VIF function returns the VIF score for each feature.
def logreg_train_pred_fn(fX_train, fy_train, fcol, fcutoff):
fX_train_sm = sm.add_constant(fX_train[fcol])
flogm = sm.GLM(fy_train, fX_train_sm, family = sm.families.Binomial())
fres = flogm.fit()
fy_train_pred = fres.predict(fX_train_sm)
fy_train_pred = fy_train_pred.values.reshape(-1)
fy_train_pred_final = pd.DataFrame({'Converted':fy_train.values, 'Conv_Prob':fy_train_pred})
fy_train_pred_final['ID'] = fy_train.index
fy_train_pred_final['predicted'] = fy_train_pred_final.Conv_Prob.map(lambda x: 1 if x > fcutoff else 0)
return fres, fy_train_pred,fy_train_pred_final
def logreg_metrics_fn(fy_train_pred_final):
fconfusion = confusion_matrix(fy_train_pred_final.Converted, fy_train_pred_final.predicted )
faccuracy = accuracy_score(fy_train_pred_final.Converted, fy_train_pred_final.predicted)
return fconfusion, faccuracy
def logreg_VIF_score_fn(fX_train, fcol):
fvif = pd.DataFrame()
fvif['Features'] = fX_train[fcol].columns
fvif['VIF'] = [variance_inflation_factor(fX_train[fcol].values, i) for i in range(fX_train[fcol].shape[1])]
fvif['VIF'] = round(fvif['VIF'], 2)
fvif = fvif.sort_values(by = "VIF", ascending = False)
return fvif
Base Model¶
# Logistic regression model
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
res = logm1.fit()
# res.summary()
RFE - Recursive Feature Elimination¶
# Since the dataset has a lot of features, we perform RFE to eliminate insignificant features
logreg = LogisticRegression()
rfe = RFE(estimator=logreg, n_features_to_select=15) # running RFE with 15 variables as output
rfe = rfe.fit(X_train, y_train)
# Let's take a look at which features have been selected by RFE
rfe_feature_Ranking = list(zip(X_train.columns, rfe.support_, rfe.ranking_))
rfe_sorted = sorted(rfe_feature_Ranking, key=lambda x : x[2])
rfe_sorted[:15]
[('ttime_on_site', True, 1), ('last_activity_SMS Sent', True, 1), ('lead_origin_Landing Page Submission', True, 1), ('lead_source_Welingak Website', True, 1), ('tags_Already a student', True, 1), ('tags_Closed by Horizzon', True, 1), ('tags_Interested in other courses', True, 1), ('tags_Lost to EINS', True, 1), ('tags_Ringing', True, 1), ('tags_Will revert after reading the email', True, 1), ('tags_invalid number', True, 1), ('tags_switched off', True, 1), ('tags_wrong number given', True, 1), ('curr_occupation_Unemployed', True, 1), ('curr_occupation_Working Professional', True, 1)]
# Put all the columns selected by RFE in the variable 'col'
col = X_train.columns[rfe.support_]
# X_train.columns[~rfe.support_]
Model 1¶
# We now iterate on the model, dropping insignificant features until we reach an optimum result
cutoff = 0.5
res, y_train_pred,y_train_pred_final = logreg_train_pred_fn(X_train, y_train, col, cutoff)
confusion, accuracy = logreg_metrics_fn(y_train_pred_final)
vif = logreg_VIF_score_fn(X_train, col)
print('Model Summary:') # Model Summary:
res.summary()
print('\nVIF Score:') # VIF Score:
vif
Model Summary:
Dep. Variable: | converted | No. Observations: | 6351 |
---|---|---|---|
Model: | GLM | Df Residuals: | 6335 |
Model Family: | Binomial | Df Model: | 15 |
Link Function: | Logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -1244.6 |
Date: | Sun, 27 Oct 2024 | Deviance: | 2489.1 |
Time: | 08:12:55 | Pearson chi2: | 8.96e+03 |
No. Iterations: | 23 | Pseudo R-squ. (CS): | 0.6098 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -1.0033 | 178.985 | -0.006 | 0.996 | -351.808 | 349.802 |
ttime_on_site | 0.9779 | 0.055 | 17.775 | 0.000 | 0.870 | 1.086 |
last_activity_SMS Sent | 1.0486 | 0.054 | 19.397 | 0.000 | 0.943 | 1.155 |
lead_origin_Landing Page Submission | -0.6088 | 0.058 | -10.470 | 0.000 | -0.723 | -0.495 |
lead_source_Welingak Website | 0.5220 | 0.092 | 5.682 | 0.000 | 0.342 | 0.702 |
tags_Already a student | -0.8063 | 0.156 | -5.185 | 0.000 | -1.111 | -0.502 |
tags_Closed by Horizzon | 1.0797 | 0.132 | 8.196 | 0.000 | 0.821 | 1.338 |
tags_Interested in other courses | -0.6854 | 0.081 | -8.507 | 0.000 | -0.843 | -0.527 |
tags_Lost to EINS | 0.7695 | 0.098 | 7.877 | 0.000 | 0.578 | 0.961 |
tags_Ringing | -1.3190 | 0.087 | -15.086 | 0.000 | -1.490 | -1.148 |
tags_Will revert after reading the email | 1.8542 | 0.086 | 21.609 | 0.000 | 1.686 | 2.022 |
tags_invalid number | -2.2802 | 1516.845 | -0.002 | 0.999 | -2975.241 | 2970.680 |
tags_switched off | -0.6183 | 0.084 | -7.331 | 0.000 | -0.784 | -0.453 |
tags_wrong number given | -1.7134 | 1485.914 | -0.001 | 0.999 | -2914.051 | 2910.625 |
curr_occupation_Unemployed | 0.7247 | 0.056 | 12.881 | 0.000 | 0.614 | 0.835 |
curr_occupation_Working Professional | 0.4284 | 0.095 | 4.490 | 0.000 | 0.241 | 0.615 |
VIF Score:
Features | VIF | |
---|---|---|
13 | curr_occupation_Unemployed | 1.960 |
9 | tags_Will revert after reading the email | 1.950 |
14 | curr_occupation_Working Professional | 1.680 |
8 | tags_Ringing | 1.560 |
6 | tags_Interested in other courses | 1.240 |
0 | ttime_on_site | 1.210 |
5 | tags_Closed by Horizzon | 1.180 |
4 | tags_Already a student | 1.160 |
1 | last_activity_SMS Sent | 1.150 |
2 | lead_origin_Landing Page Submission | 1.150 |
11 | tags_switched off | 1.130 |
3 | lead_source_Welingak Website | 1.080 |
7 | tags_Lost to EINS | 1.050 |
10 | tags_invalid number | 1.050 |
12 | tags_wrong number given | 1.030 |
There are quite a few variables with a p-value greater than 0.05. We will need to take care of them.
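As a quick cross-check (a minimal sketch, not part of the original iteration flow), the fitted result object exposes these p-values directly:
# list the features from the fitted Model 1 result whose p-value exceeds 0.05
high_p = res.pvalues[res.pvalues > 0.05].sort_values(ascending=False)
high_p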
Model 2¶
# 'tags_wrong number given' has a high p-value, therefore we drop it.
# after dropping we rebuild the model and review the stats
col = col.drop('tags_wrong number given')
col
cutoff = 0.5
res, y_train_pred,y_train_pred_final = logreg_train_pred_fn(X_train, y_train, col, cutoff)
confusion, accuracy = logreg_metrics_fn(y_train_pred_final)
vif = logreg_VIF_score_fn(X_train, col)
print('Model Summary:') # Model Summary:
res.summary()
print('\nVIF Score:') # VIF Score:
vif
# print('\nY_Predicted Values:') # Y_Predicted Values:
# y_train_pred
# print('\nY_Predicted Cutoff:') # Y_Predicted Cutoff:
# y_train_pred_final
print('\nConfusion_Matrix:') # Confusion Matrix:
confusion
print(f'\nAccuracy: {accuracy}\n') # Accuracy Score:
Index(['ttime_on_site', 'last_activity_SMS Sent', 'lead_origin_Landing Page Submission', 'lead_source_Welingak Website', 'tags_Already a student', 'tags_Closed by Horizzon', 'tags_Interested in other courses', 'tags_Lost to EINS', 'tags_Ringing', 'tags_Will revert after reading the email', 'tags_invalid number', 'tags_switched off', 'curr_occupation_Unemployed', 'curr_occupation_Working Professional'], dtype='object')
Model Summary:
Dep. Variable: | converted | No. Observations: | 6351 |
---|---|---|---|
Model: | GLM | Df Residuals: | 6336 |
Model Family: | Binomial | Df Model: | 14 |
Link Function: | Logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -1263.2 |
Date: | Sun, 27 Oct 2024 | Deviance: | 2526.4 |
Time: | 08:12:55 | Pearson chi2: | 8.79e+03 |
No. Iterations: | 23 | Pseudo R-squ. (CS): | 0.6075 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -0.8942 | 145.863 | -0.006 | 0.995 | -286.781 | 284.992 |
ttime_on_site | 0.9733 | 0.054 | 17.874 | 0.000 | 0.867 | 1.080 |
last_activity_SMS Sent | 1.0284 | 0.053 | 19.260 | 0.000 | 0.924 | 1.133 |
lead_origin_Landing Page Submission | -0.6234 | 0.058 | -10.777 | 0.000 | -0.737 | -0.510 |
lead_source_Welingak Website | 0.5291 | 0.092 | 5.762 | 0.000 | 0.349 | 0.709 |
tags_Already a student | -0.7912 | 0.155 | -5.089 | 0.000 | -1.096 | -0.486 |
tags_Closed by Horizzon | 1.0891 | 0.132 | 8.270 | 0.000 | 0.831 | 1.347 |
tags_Interested in other courses | -0.6686 | 0.080 | -8.325 | 0.000 | -0.826 | -0.511 |
tags_Lost to EINS | 0.7727 | 0.098 | 7.919 | 0.000 | 0.581 | 0.964 |
tags_Ringing | -1.2828 | 0.087 | -14.761 | 0.000 | -1.453 | -1.112 |
tags_Will revert after reading the email | 1.8771 | 0.086 | 21.923 | 0.000 | 1.709 | 2.045 |
tags_invalid number | -2.2705 | 1519.358 | -0.001 | 0.999 | -2980.158 | 2975.617 |
tags_switched off | -0.6011 | 0.084 | -7.140 | 0.000 | -0.766 | -0.436 |
curr_occupation_Unemployed | 0.6840 | 0.056 | 12.300 | 0.000 | 0.575 | 0.793 |
curr_occupation_Working Professional | 0.4163 | 0.096 | 4.349 | 0.000 | 0.229 | 0.604 |
VIF Score:
Features | VIF | |
---|---|---|
9 | tags_Will revert after reading the email | 1.930 |
12 | curr_occupation_Unemployed | 1.930 |
13 | curr_occupation_Working Professional | 1.670 |
8 | tags_Ringing | 1.540 |
6 | tags_Interested in other courses | 1.230 |
0 | ttime_on_site | 1.210 |
5 | tags_Closed by Horizzon | 1.170 |
1 | last_activity_SMS Sent | 1.150 |
2 | lead_origin_Landing Page Submission | 1.150 |
4 | tags_Already a student | 1.150 |
11 | tags_switched off | 1.120 |
3 | lead_source_Welingak Website | 1.080 |
7 | tags_Lost to EINS | 1.050 |
10 | tags_invalid number | 1.050 |
Confusion_Matrix:
array([[3757, 148], [ 313, 2133]])
Accuracy: 0.9274130058258542
Model 3¶
# 'tags_invalid number' has a high p-value, therefore we drop it.
# we rebuild the model and review the stats
col = col.drop('tags_invalid number')
col
cutoff = 0.5
res, y_train_pred,y_train_pred_final = logreg_train_pred_fn(X_train, y_train, col, cutoff)
confusion, accuracy = logreg_metrics_fn(y_train_pred_final)
vif = logreg_VIF_score_fn(X_train, col)
print('Model Summary:') # Model Summary:
res.summary()
print('\nVIF Score:') # VIF Score:
vif
# print('\nY_Predicted Values:') # Y_Predicted Values:
# y_train_pred
# print('\nY_Predicted Cutoff:') # Y_Predicted Cutoff:
# y_train_pred_final
print('\nConfusion_Matrix:') # Confusion Matrix:
confusion
print(f'\nAccuracy: {accuracy}\n') # Accuracy Score:
Index(['ttime_on_site', 'last_activity_SMS Sent', 'lead_origin_Landing Page Submission', 'lead_source_Welingak Website', 'tags_Already a student', 'tags_Closed by Horizzon', 'tags_Interested in other courses', 'tags_Lost to EINS', 'tags_Ringing', 'tags_Will revert after reading the email', 'tags_switched off', 'curr_occupation_Unemployed', 'curr_occupation_Working Professional'], dtype='object')
Model Summary:
Dep. Variable: | converted | No. Observations: | 6351 |
---|---|---|---|
Model: | GLM | Df Residuals: | 6337 |
Model Family: | Binomial | Df Model: | 13 |
Link Function: | Logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -1284.1 |
Date: | Sun, 27 Oct 2024 | Deviance: | 2568.3 |
Time: | 08:12:55 | Pearson chi2: | 8.78e+03 |
No. Iterations: | 8 | Pseudo R-squ. (CS): | 0.6049 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -0.6964 | 0.079 | -8.866 | 0.000 | -0.850 | -0.542 |
ttime_on_site | 0.9925 | 0.054 | 18.340 | 0.000 | 0.886 | 1.099 |
last_activity_SMS Sent | 1.0110 | 0.053 | 19.180 | 0.000 | 0.908 | 1.114 |
lead_origin_Landing Page Submission | -0.6454 | 0.057 | -11.233 | 0.000 | -0.758 | -0.533 |
lead_source_Welingak Website | 0.5408 | 0.092 | 5.893 | 0.000 | 0.361 | 0.721 |
tags_Already a student | -0.7737 | 0.156 | -4.975 | 0.000 | -1.079 | -0.469 |
tags_Closed by Horizzon | 1.1040 | 0.132 | 8.384 | 0.000 | 0.846 | 1.362 |
tags_Interested in other courses | -0.6503 | 0.080 | -8.100 | 0.000 | -0.808 | -0.493 |
tags_Lost to EINS | 0.7786 | 0.098 | 7.985 | 0.000 | 0.587 | 0.970 |
tags_Ringing | -1.2450 | 0.087 | -14.376 | 0.000 | -1.415 | -1.075 |
tags_Will revert after reading the email | 1.9114 | 0.085 | 22.371 | 0.000 | 1.744 | 2.079 |
tags_switched off | -0.5829 | 0.084 | -6.923 | 0.000 | -0.748 | -0.418 |
curr_occupation_Unemployed | 0.6352 | 0.055 | 11.553 | 0.000 | 0.527 | 0.743 |
curr_occupation_Working Professional | 0.4025 | 0.096 | 4.175 | 0.000 | 0.214 | 0.592 |
VIF Score:
Features | VIF | |
---|---|---|
9 | tags_Will revert after reading the email | 1.890 |
11 | curr_occupation_Unemployed | 1.870 |
12 | curr_occupation_Working Professional | 1.660 |
8 | tags_Ringing | 1.500 |
6 | tags_Interested in other courses | 1.220 |
0 | ttime_on_site | 1.210 |
5 | tags_Closed by Horizzon | 1.170 |
1 | last_activity_SMS Sent | 1.150 |
2 | lead_origin_Landing Page Submission | 1.150 |
4 | tags_Already a student | 1.140 |
10 | tags_switched off | 1.120 |
3 | lead_source_Welingak Website | 1.080 |
7 | tags_Lost to EINS | 1.050 |
Confusion_Matrix:
array([[3759, 146], [ 320, 2126]])
Accuracy: 0.9266257282317745
# taking values for TP , TN, FP, FN
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives
# Calculate the sensitivity
Sensitivity = TP/(TP+FN)
# Calculate the specificity
Specificity = TN/(TN+FP)
# Calculate Precision
Precision = TP/(TP+FP)
# Calculate Recall
Recall = TP/(TP+FN)
print(f'Sensitivity - {round(Sensitivity,3)}\nspecificity - {round(Specificity,3)}\nPrecision - {round(Precision,3)}\nRecall - {round(Recall,3)}')
Sensitivity - 0.869 specificity - 0.963 Precision - 0.936 Recall - 0.869
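Since this confusion-matrix breakdown is repeated after every cutoff change, a small helper could wrap it; the function below is a hypothetical refactor sketch, not one used elsewhere in this notebook.
# hypothetical helper: unpack a 2x2 confusion matrix (rows = actual, columns = predicted)
def confusion_metrics_fn(fconfusion):
    fTN, fFP, fFN, fTP = fconfusion.ravel()
    fsensitivity = fTP / (fTP + fFN)   # sensitivity is the same as recall for the positive class
    fspecificity = fTN / (fTN + fFP)
    fprecision = fTP / (fTP + fFP)
    return fsensitivity, fspecificity, fprecision
# Sensitivity, Specificity, Precision = confusion_metrics_fn(confusion)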
Finding Optimal Cutoff Point¶
- The 0.5 cutoff was arbitrary, used only to loosely check the model performance. To get good results, we need to optimise the threshold. So first let's plot an ROC curve to see what AUC we get.
# Let's create columns with different probability cutoffs
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
y_train_pred_final[i]= y_train_pred_final.Conv_Prob.map(lambda x: 1 if x > i else 0)
y_train_pred_final.head()
Converted | Conv_Prob | ID | predicted | 0.000 | 0.100 | 0.200 | 0.300 | 0.400 | 0.500 | 0.600 | 0.700 | 0.800 | 0.900 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.028 | 3009 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0.083 | 1012 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0.040 | 9226 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 0.608 | 4750 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 |
4 | 1 | 1.000 | 7987 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
# Now let's calculate accuracy sensitivity and specificity for various probability cutoffs.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
cm1 = confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i] )
total1=sum(sum(cm1))
accuracy = (cm1[0,0]+cm1[1,1])/total1
speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
print(cutoff_df)
prob | accuracy | sensi | speci | 
---|---|---|---|---|
0.000 | 0.000 | 0.385 | 1.000 | 0.000 |
0.100 | 0.100 | 0.846 | 0.974 | 0.765 |
0.200 | 0.200 | 0.900 | 0.933 | 0.880 |
0.300 | 0.300 | 0.916 | 0.917 | 0.914 |
0.400 | 0.400 | 0.923 | 0.887 | 0.945 |
0.500 | 0.500 | 0.927 | 0.869 | 0.963 |
0.600 | 0.600 | 0.924 | 0.852 | 0.969 |
0.700 | 0.700 | 0.910 | 0.805 | 0.977 |
0.800 | 0.800 | 0.904 | 0.781 | 0.982 |
0.900 | 0.900 | 0.889 | 0.728 | 0.991 |
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()
<Axes: xlabel='prob'>
As you can see, around 0.30 we get optimal values for all three metrics, so let's choose 0.30 as our cutoff.
y_train_pred_final['final_predicted'] = y_train_pred_final.Conv_Prob.map( lambda x: 1 if x > 0.30 else 0)
y_train_pred_final.head()
Converted | Conv_Prob | ID | predicted | 0.000 | 0.100 | 0.200 | 0.300 | 0.400 | 0.500 | 0.600 | 0.700 | 0.800 | 0.900 | final_predicted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.028 | 3009 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0.083 | 1012 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0.040 | 9226 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 0.608 | 4750 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 |
4 | 1 | 1.000 | 7987 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
# Let's check the overall accuracy.
accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)
confusion = confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
confusion
0.9156038419146592
array([[3571, 334], [ 202, 2244]])
# taking values for TP , TN, FP, FN
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives
# Calculate the sensitivity
Sensitivity = TP/(TP+FN)
# Calculate the specificity
Specificity = TN/(TN+FP)
# Calculate Precision
Precision = TP/(TP+FP)
# Calculate Recall
Recall = TP/(TP+FN)
print(f'Sensitivity - {round(Sensitivity,3)}\nspecificity - {round(Specificity,3)}\nPrecision - {round(Precision,3)}\nRecall - {round(Recall,3)}')
Sensitivity - 0.917 specificity - 0.914 Precision - 0.87 Recall - 0.917
ROC Curve¶
# create a function to draw the ROC curve with its labels and AUC
def draw_roc( actual, probs ):
fpr, tpr, thresholds = roc_curve( actual, probs, drop_intermediate = False )
auc_score = roc_auc_score( actual, probs )
plt.figure(figsize=(5, 5))
plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
return None
fpr, tpr, thresholds = roc_curve( y_train_pred_final.Converted, y_train_pred_final.Conv_Prob, drop_intermediate = False )
draw_roc(y_train_pred_final.Converted, y_train_pred_final.Conv_Prob)
The area under the ROC curve is 0.97, which is quite good, so we seem to have a good model.
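The same AUC can be read off directly with the imported scoring helper; shown here only as a cross-check of the value quoted above.
# cross-check the ROC AUC on the training predictions
roc_auc_score(y_train_pred_final.Converted, y_train_pred_final.Conv_Prob)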
Precision - Recall Curve¶
# compute the precision and recall scores using the built-in sklearn functions
precision_score(y_train_pred_final.Converted, y_train_pred_final.predicted)
recall_score(y_train_pred_final.Converted, y_train_pred_final.predicted)
0.9357394366197183
0.8691741618969746
# plot the Precision and Recall curves and find the cutoff from them
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Conv_Prob)
plt.plot(thresholds, p[:-1], "b")
plt.plot(thresholds, r[:-1], "r")
plt.title('Precision Recall Curve')
plt.show();
The precision-recall curve points to a higher cutoff than the sensitivity-specificity analysis, so we use that cutoff (0.38) to evaluate accuracy.
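Rather than reading the crossover off the plot by eye, it can also be located programmatically; this is a sketch using the p, r and thresholds arrays computed above, not part of the original workflow.
# threshold at which precision and recall are closest to each other (the curve crossover)
crossover_idx = np.argmin(np.abs(p[:-1] - r[:-1]))
print(f'Approximate crossover threshold: {thresholds[crossover_idx]:.2f}')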
# apply the chosen cutoff to the train-set predictions
y_train_pred_final['final_predicted'] = y_train_pred_final.Conv_Prob.map( lambda x: 1 if x > 0.38 else 0)
y_train_pred_final.head()
Converted | Conv_Prob | ID | predicted | 0.000 | 0.100 | 0.200 | 0.300 | 0.400 | 0.500 | 0.600 | 0.700 | 0.800 | 0.900 | final_predicted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.028 | 3009 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0.083 | 1012 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0.040 | 9226 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 0.608 | 4750 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 |
4 | 1 | 1.000 | 7987 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
# Let's check the overall accuracy.
accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)
confusion = confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
confusion
0.9209573295544009
array([[3669, 236], [ 266, 2180]])
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives
# Calculate the sensitivity
Sensitivity = TP/(TP+FN)
# Calculate the specificity
Specificity = TN/(TN+FP)
# Calculate Precision
Precision = TP/(TP+FP)
# Calculate Recall
Recall = TP/(TP+FN)
print(f'Sensitivity - {round(Sensitivity,3)}\nspecificity - {round(Specificity,3)}\nPrecision - {round(Precision,3)}\nRecall - {round(Recall,3)}')
Sensitivity - 0.891 specificity - 0.94 Precision - 0.902 Recall - 0.891
---------------------------------------------------------------¶
Predictions on the test set¶
Custom Functions for Test¶
# we use these functions to do the prediction on the test data
def logreg_test_pred_fn(fX_test, fy_test, fcol, fcutoff, fres):
fX_test_sm = sm.add_constant(fX_test[fcol])
fy_test_pred = fres.predict(fX_test_sm)
fy_test_pred = fy_test_pred.values.reshape(-1)
fy_test_pred_final = pd.DataFrame({'Converted':fy_test.values, 'Conv_Prob':fy_test_pred})
fy_test_pred_final['ID'] = fy_test.index
fy_test_pred_final['predicted'] = fy_test_pred_final.Conv_Prob.map(lambda x: 1 if x > fcutoff else 0)
return fres, fy_test_pred,fy_test_pred_final
# this function is used to generate metrics.
def logreg_test_metrics_fn(fy_test_pred_final):
fconfusion = confusion_matrix(fy_test_pred_final.Converted, fy_test_pred_final.predicted )
faccuracy = accuracy_score(fy_test_pred_final.Converted, fy_test_pred_final.predicted)
return fconfusion, faccuracy
# this function returns the VIF scores to check for multicollinearity
def logreg_test_VIF_score_fn(fX_test, fcol):
fvif = pd.DataFrame()
fvif['Features'] = fX_test[fcol].columns
fvif['VIF'] = [variance_inflation_factor(fX_test[fcol].values, i) for i in range(fX_test[fcol].shape[1])]
fvif['VIF'] = round(fvif['VIF'], 2)
fvif = fvif.sort_values(by = "VIF", ascending = False)
return fvif
Model Validation on Test Data¶
# scale the test data with the scaler fitted on the train data (transform only)
X_test[to_scale] = scaler.transform(X_test[to_scale])
X_test[col].head(2)
X_test.shape
ttime_on_site | last_activity_SMS Sent | lead_origin_Landing Page Submission | lead_source_Welingak Website | tags_Already a student | tags_Closed by Horizzon | tags_Interested in other courses | tags_Lost to EINS | tags_Ringing | tags_Will revert after reading the email | tags_switched off | curr_occupation_Unemployed | curr_occupation_Working Professional | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3271 | -0.601 | -0.668 | -1.075 | -0.127 | -0.227 | -0.189 | -0.249 | -0.135 | -0.386 | -0.541 | -0.160 | -1.239 | -0.287 |
1490 | 1.887 | -0.668 | 0.930 | -0.127 | -0.227 | -0.189 | -0.249 | -0.135 | -0.386 | 1.848 | -0.160 | -1.239 | 3.489 |
(2723, 115)
# the sensitivity-specificity analysis gave a cutoff of 0.30, so we use that as the threshold
cutoff = 0.30
res, y_test_pred, y_test_pred_final = logreg_test_pred_fn(X_test, y_test, col, cutoff, res)
confusion, accuracy = logreg_test_metrics_fn(y_test_pred_final)
vif = logreg_test_VIF_score_fn(X_test, col)
# print('\nY_Predicted Values:') # Y_Predicted Values:
# y_test_pred
# print('\nY_Predicted Cutoff:') # Y_Predicted Cutoff:
# y_test_pred_final
print('\nConfusion_Matrix:') # Confusion Matrix:
confusion
print(f'\nAccuracy: {accuracy}\n') # Accuracy Score:
Confusion_Matrix:
array([[1563, 171], [ 99, 890]])
Accuracy: 0.9008446566287184
# we measure confusion matrix metrics
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives
# Calculate the sensitivity
Sensitivity = TP/(TP+FN)
# Calculate the specificity
Specificity = TN/(TN+FP)
# Calculate Precision
Precision = TP/(TP+FP)
# Calculate Recall
Recall = TP/(TP+FN)
print(f'Sensitivity - {round(Sensitivity,3)}\nspecificity - {round(Specificity,3)}\nPrecision - {round(Precision,3)}\nRecall - {round(Recall,3)}')
Sensitivity - 0.9 specificity - 0.901 Precision - 0.839 Recall - 0.9
# the precision-recall analysis gave a cutoff of 0.38, so we use that as the threshold
cutoff = 0.38
res, y_test_pred, y_test_pred_final = logreg_test_pred_fn(X_test, y_test, col, cutoff, res)
confusion, accuracy = logreg_test_metrics_fn(y_test_pred_final)
vif = logreg_test_VIF_score_fn(X_test, col)
# print('\nY_Predicted Values:') # Y_Predicted Values:
# y_test_pred
# print('\nY_Predicted Cutoff:') # Y_Predicted Cutoff:
# y_test_pred_final
print('\nConfusion_Matrix:') # Confusion Matrix:
confusion
print(f'\nAccuracy: {accuracy}\n') # Accuracy Score:
Confusion_Matrix:
array([[1612, 122], [ 131, 858]])
Accuracy: 0.9070877708409842
# we measure confusion matrix metrics
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives
# Calculate the sensitivity
Sensitivity = TP/(TP+FN)
# Calculate the specificity
Specificity = TN/(TN+FP)
# Calculate Precision
Precision = TP/(TP+FP)
# Calculate Recall
Recall = TP/(TP+FN)
print(f'Sensitivity - {round(Sensitivity,3)}\nspecificity - {round(Specificity,3)}\nPrecision - {round(Precision,3)}\nRecall - {round(Recall,3)}')
Sensitivity - 0.868 specificity - 0.93 Precision - 0.876 Recall - 0.868
The overall test accuracy for Approach 01 is ~90%. The precision-recall curve yields a higher cutoff than the sensitivity-specificity analysis. Sensitivity and precision are in the range of 86-87%, while specificity is about 93%.
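As an optional robustness check (not part of the original notebook), the already-imported cross_val_score can confirm that accuracy in this range is not an artefact of one particular train split; the selected feature list col is assumed to be the Model 3 set, and sklearn's default 0.5 cutoff applies, so the numbers are indicative only.
# hedged sanity check: 5-fold cross-validated accuracy of an sklearn logistic regression on the selected features
cv_scores = cross_val_score(LogisticRegression(max_iter=1000), X_train[col], y_train, cv=5, scoring='accuracy')
print(f'CV accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}')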
-----------------------------------------------------¶
------------------------------------------------------¶
Approach - 02¶
- (Additional Feature Elimination)
# the dataset has features such as city and tags
# tags has a high proportion of null values
# city is dominated by a single high-frequency label, hence we drop both features
new_ls_df = lead_score_df.drop(['city', 'tags'], axis = 1)
# we also drop all those rows that have curr_occupation as unknown
new_ls_df = new_ls_df[ ~(new_ls_df['curr_occupation'] == 'unknown') ]
# the dataset size is now reduced compared to the original dataset
print(len(new_ls_df.index)/9240)
0.6916666666666667
Data Encoding¶
Dummy Encoding¶
# we perform dummy encoding
new_ls_df = pd.get_dummies(new_ls_df, columns=new_ls_df.select_dtypes('category').columns.difference(['tags','specialization','info_abt_x_edu', 'lead_profile', 'city', 'curr_occupation']), drop_first=True, dtype=float)
new_ls_df.head(1)
new_ls_df.shape
converted | totalvisits | ttime_on_site | pg_view_pv | specialization | info_abt_x_edu | curr_occupation | lead_profile | avail_free_copy_1 | do_not_email_1 | last_activity_Converted to Lead | last_activity_Email Bounced | last_activity_Email Link Clicked | last_activity_Email Marked Spam | last_activity_Email Opened | last_activity_Email Received | last_activity_Form Submitted on Website | last_activity_Had a Phone Conversation | last_activity_Olark Chat Conversation | last_activity_Page Visited on Website | last_activity_Resubscribed to emails | last_activity_SMS Sent | last_activity_Unreachable | last_activity_Unsubscribed | last_activity_View in browser link Clicked | last_activity_Visited Booth in Tradeshow | lead_origin_Landing Page Submission | lead_origin_Lead Add Form | lead_origin_Lead Import | lead_origin_Quick Add Form | lead_source_Direct Traffic | lead_source_Facebook | lead_source_Google | lead_source_Live Chat | lead_source_NC_EDM | lead_source_Olark Chat | lead_source_Organic Search | lead_source_Pay per Click Ads | lead_source_Press_Release | lead_source_Reference | lead_source_Referral Sites | lead_source_Social Media | lead_source_WeLearn | lead_source_Welingak Website | lead_source_bing | lead_source_blog | lead_source_google | lead_source_testone | lead_source_welearnblog_Home | lead_source_youtubechannel | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.000 | 0.000 | 0.000 | unknown | unknown | Unemployed | unknown | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
(6391, 50)
new_ls_df = pd.get_dummies(new_ls_df, columns=[ 'specialization', 'info_abt_x_edu', 'lead_profile' , 'curr_occupation'], dtype=float)
new_ls_df = new_ls_df.drop(new_ls_df.filter(like='unknown',axis=1).columns, axis=1)
new_ls_df.shape
(6391, 84)
new_ls_df.head(2)
converted | totalvisits | ttime_on_site | pg_view_pv | avail_free_copy_1 | do_not_email_1 | last_activity_Converted to Lead | last_activity_Email Bounced | last_activity_Email Link Clicked | last_activity_Email Marked Spam | last_activity_Email Opened | last_activity_Email Received | last_activity_Form Submitted on Website | last_activity_Had a Phone Conversation | last_activity_Olark Chat Conversation | last_activity_Page Visited on Website | last_activity_Resubscribed to emails | last_activity_SMS Sent | last_activity_Unreachable | last_activity_Unsubscribed | last_activity_View in browser link Clicked | last_activity_Visited Booth in Tradeshow | lead_origin_Landing Page Submission | lead_origin_Lead Add Form | lead_origin_Lead Import | lead_origin_Quick Add Form | lead_source_Direct Traffic | lead_source_Facebook | lead_source_Google | lead_source_Live Chat | lead_source_NC_EDM | lead_source_Olark Chat | lead_source_Organic Search | lead_source_Pay per Click Ads | lead_source_Press_Release | lead_source_Reference | lead_source_Referral Sites | lead_source_Social Media | lead_source_WeLearn | lead_source_Welingak Website | lead_source_bing | lead_source_blog | lead_source_google | lead_source_testone | lead_source_welearnblog_Home | lead_source_youtubechannel | specialization_Banking, Investment And Insurance | specialization_Business Administration | specialization_E-Business | specialization_E-COMMERCE | specialization_Finance Management | specialization_Healthcare Management | specialization_Hospitality Management | specialization_Human Resource Management | specialization_IT Projects Management | specialization_International Business | specialization_Marketing Management | specialization_Media and Advertising | specialization_Operations Management | specialization_Retail Management | specialization_Rural and Agribusiness | specialization_Services Excellence | specialization_Supply Chain Management | specialization_Travel and Tourism | info_abt_x_edu_Advertisements | info_abt_x_edu_Email | info_abt_x_edu_Multiple Sources | info_abt_x_edu_Online Search | info_abt_x_edu_Other | info_abt_x_edu_SMS | info_abt_x_edu_Social Media | info_abt_x_edu_Student of SomeSchool | info_abt_x_edu_Word Of Mouth | lead_profile_Dual Specialization Student | lead_profile_Lateral Student | lead_profile_Other Leads | lead_profile_Potential Lead | lead_profile_Student of SomeSchool | curr_occupation_Businessman | curr_occupation_Housewife | curr_occupation_Other | curr_occupation_Student | curr_occupation_Unemployed | curr_occupation_Working Professional | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 |
1 | 0 | 5.000 | 674.000 | 2.500 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 1.000 | 0.000 |
corr = new_ls_df[new_ls_df.select_dtypes(exclude=['category','object']).columns].corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(11, 9))
with sns.axes_style("white"):
ax = sns.heatmap(corr, mask=mask, vmax=.3, square=True)
corr_0 = new_ls_df.corr(numeric_only=True).abs()
corr_0 = corr_0.unstack()
correlation_0 = corr_0.sort_values()
correlation_0 = correlation_0.dropna()
# correlation_0
correlation_0 = correlation_0[correlation_0 != 1.0]
correlation_target_zero = correlation_0.reset_index()
correlation_target_zero.sort_values(by=0, ascending=False).head(10)
level_0 | level_1 | 0 | |
---|---|---|---|
1923 | lead_source_Facebook | lead_origin_Lead Import | 0.982 |
1772 | lead_origin_Lead Import | lead_source_Facebook | 0.982 |
2454 | lead_source_Reference | lead_origin_Lead Add Form | 0.863 |
1703 | lead_origin_Lead Add Form | lead_source_Reference | 0.863 |
5851 | curr_occupation_Working Professional | curr_occupation_Unemployed | 0.842 |
5775 | curr_occupation_Unemployed | curr_occupation_Working Professional | 0.842 |
78 | totalvisits | pg_view_pv | 0.739 |
229 | pg_view_pv | totalvisits | 0.739 |
537 | last_activity_Email Bounced | do_not_email_1 | 0.586 |
386 | do_not_email_1 | last_activity_Email Bounced | 0.586 |
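A compact way to flag such pairs above a chosen threshold (a sketch; the 0.80 threshold is an assumption, not a value used elsewhere in this notebook):
# keep only the upper triangle of the absolute correlation matrix so each pair appears once
corr_abs = new_ls_df.corr(numeric_only=True).abs()
upper_tri = corr_abs.where(np.triu(np.ones(corr_abs.shape, dtype=bool), k=1))
high_pairs = upper_tri.stack()
high_pairs[high_pairs > 0.80].sort_values(ascending=False)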
----------------------------------------------------------------------¶
Train and Test Split¶
# splitting the data into independent variables and the target variable
X = new_ls_df.drop(['converted'], axis=1)
y = new_ls_df['converted']
# Now we split the dataset into train and test set
# np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, test_size=0.30, random_state=100)
----------------------------------------------------------------------¶
Feature Scaling¶
# Post split we perform standard scaling: fit and transform on the train data set
# to_scale = ['totalvisits', 'ttime_on_site', 'pg_view_pv']
to_scale = list(X.columns)
scaler = StandardScaler()
X_train[to_scale] = scaler.fit_transform(X_train[to_scale], y_train)
X_train.head(5)
totalvisits | ttime_on_site | pg_view_pv | avail_free_copy_1 | do_not_email_1 | last_activity_Converted to Lead | last_activity_Email Bounced | last_activity_Email Link Clicked | last_activity_Email Marked Spam | last_activity_Email Opened | last_activity_Email Received | last_activity_Form Submitted on Website | last_activity_Had a Phone Conversation | last_activity_Olark Chat Conversation | last_activity_Page Visited on Website | last_activity_Resubscribed to emails | last_activity_SMS Sent | last_activity_Unreachable | last_activity_Unsubscribed | last_activity_View in browser link Clicked | last_activity_Visited Booth in Tradeshow | lead_origin_Landing Page Submission | lead_origin_Lead Add Form | lead_origin_Lead Import | lead_origin_Quick Add Form | lead_source_Direct Traffic | lead_source_Facebook | lead_source_Google | lead_source_Live Chat | lead_source_NC_EDM | lead_source_Olark Chat | lead_source_Organic Search | lead_source_Pay per Click Ads | lead_source_Press_Release | lead_source_Reference | lead_source_Referral Sites | lead_source_Social Media | lead_source_WeLearn | lead_source_Welingak Website | lead_source_bing | lead_source_blog | lead_source_google | lead_source_testone | lead_source_welearnblog_Home | lead_source_youtubechannel | specialization_Banking, Investment And Insurance | specialization_Business Administration | specialization_E-Business | specialization_E-COMMERCE | specialization_Finance Management | specialization_Healthcare Management | specialization_Hospitality Management | specialization_Human Resource Management | specialization_IT Projects Management | specialization_International Business | specialization_Marketing Management | specialization_Media and Advertising | specialization_Operations Management | specialization_Retail Management | specialization_Rural and Agribusiness | specialization_Services Excellence | specialization_Supply Chain Management | specialization_Travel and Tourism | info_abt_x_edu_Advertisements | info_abt_x_edu_Email | info_abt_x_edu_Multiple Sources | info_abt_x_edu_Online Search | info_abt_x_edu_Other | info_abt_x_edu_SMS | info_abt_x_edu_Social Media | info_abt_x_edu_Student of SomeSchool | info_abt_x_edu_Word Of Mouth | lead_profile_Dual Specialization Student | lead_profile_Lateral Student | lead_profile_Other Leads | lead_profile_Potential Lead | lead_profile_Student of SomeSchool | curr_occupation_Businessman | curr_occupation_Housewife | curr_occupation_Other | curr_occupation_Student | curr_occupation_Unemployed | curr_occupation_Working Professional | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
533 | -0.131 | 0.524 | 0.363 | 1.395 | -0.269 | -0.220 | -0.167 | -0.182 | 0.000 | 1.262 | -0.015 | -0.120 | -0.062 | -0.269 | -0.271 | 0.000 | -0.714 | -0.105 | -0.072 | -0.030 | -0.015 | 0.872 | -0.315 | -0.064 | 0.000 | -0.645 | -0.064 | -0.688 | -0.021 | 0.000 | -0.406 | 2.519 | 0.000 | -0.015 | -0.271 | -0.108 | -0.021 | -0.015 | -0.145 | -0.026 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | -0.212 | -0.229 | -0.084 | -0.120 | -0.363 | -0.142 | -0.115 | -0.338 | -0.205 | -0.146 | -0.343 | 5.874 | -0.251 | -0.111 | -0.099 | -0.067 | -0.211 | -0.157 | -0.088 | -0.064 | -0.132 | -0.321 | -0.154 | -0.037 | -0.094 | -0.205 | -0.204 | -0.058 | -0.060 | -0.281 | -0.574 | -0.194 | -0.040 | -0.030 | -0.047 | -0.182 | -2.458 | 2.915 |
846 | -1.163 | -0.942 | -1.331 | -0.717 | -0.269 | -0.220 | -0.167 | -0.182 | 0.000 | 1.262 | -0.015 | -0.120 | -0.062 | -0.269 | -0.271 | 0.000 | -0.714 | -0.105 | -0.072 | -0.030 | -0.015 | -1.147 | 3.178 | -0.064 | 0.000 | -0.645 | -0.064 | -0.688 | -0.021 | 0.000 | -0.406 | -0.397 | 0.000 | -0.015 | 3.697 | -0.108 | -0.021 | -0.015 | -0.145 | -0.026 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | -0.212 | -0.229 | -0.084 | -0.120 | 2.757 | -0.142 | -0.115 | -0.338 | -0.205 | -0.146 | -0.343 | -0.170 | -0.251 | -0.111 | -0.099 | -0.067 | -0.211 | -0.157 | -0.088 | -0.064 | -0.132 | -0.321 | -0.154 | -0.037 | -0.094 | -0.205 | -0.204 | -0.058 | -0.060 | -0.281 | 1.742 | -0.194 | -0.040 | -0.030 | -0.047 | -0.182 | 0.407 | -0.343 |
7546 | 1.932 | 0.554 | 2.056 | 1.395 | -0.269 | -0.220 | -0.167 | -0.182 | 0.000 | -0.792 | -0.015 | -0.120 | -0.062 | -0.269 | -0.271 | 0.000 | 1.402 | -0.105 | -0.072 | -0.030 | -0.015 | 0.872 | -0.315 | -0.064 | 0.000 | -0.645 | -0.064 | -0.688 | -0.021 | 0.000 | -0.406 | 2.519 | 0.000 | -0.015 | -0.271 | -0.108 | -0.021 | -0.015 | -0.145 | -0.026 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | -0.212 | -0.229 | -0.084 | 8.300 | -0.363 | -0.142 | -0.115 | -0.338 | -0.205 | -0.146 | -0.343 | -0.170 | -0.251 | -0.111 | -0.099 | -0.067 | -0.211 | -0.157 | -0.088 | -0.064 | -0.132 | -0.321 | -0.154 | -0.037 | -0.094 | -0.205 | -0.204 | -0.058 | -0.060 | -0.281 | 1.742 | -0.194 | -0.040 | -0.030 | -0.047 | -0.182 | 0.407 | -0.343 |
8631 | 0.556 | 0.655 | 1.492 | -0.717 | -0.269 | -0.220 | -0.167 | -0.182 | 0.000 | -0.792 | -0.015 | -0.120 | -0.062 | -0.269 | -0.271 | 0.000 | 1.402 | -0.105 | -0.072 | -0.030 | -0.015 | -1.147 | -0.315 | -0.064 | 0.000 | -0.645 | -0.064 | -0.688 | -0.021 | 0.000 | -0.406 | 2.519 | 0.000 | -0.015 | -0.271 | -0.108 | -0.021 | -0.015 | -0.145 | -0.026 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | -0.212 | -0.229 | -0.084 | -0.120 | -0.363 | -0.142 | -0.115 | -0.338 | -0.205 | -0.146 | -0.343 | -0.170 | -0.251 | -0.111 | -0.099 | -0.067 | -0.211 | -0.157 | -0.088 | -0.064 | -0.132 | -0.321 | -0.154 | -0.037 | -0.094 | -0.205 | -0.204 | -0.058 | -0.060 | -0.281 | -0.574 | -0.194 | -0.040 | -0.030 | -0.047 | -0.182 | 0.407 | -0.343 |
7824 | -0.819 | -0.613 | -0.766 | -0.717 | -0.269 | 4.551 | -0.167 | -0.182 | 0.000 | -0.792 | -0.015 | -0.120 | -0.062 | -0.269 | -0.271 | 0.000 | -0.714 | -0.105 | -0.072 | -0.030 | -0.015 | 0.872 | -0.315 | -0.064 | 0.000 | 1.551 | -0.064 | -0.688 | -0.021 | 0.000 | -0.406 | -0.397 | 0.000 | -0.015 | -0.271 | -0.108 | -0.021 | -0.015 | -0.145 | -0.026 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | -0.212 | -0.229 | -0.084 | -0.120 | -0.363 | -0.142 | -0.115 | -0.338 | -0.205 | -0.146 | -0.343 | -0.170 | 3.977 | -0.111 | -0.099 | -0.067 | -0.211 | -0.157 | -0.088 | -0.064 | -0.132 | -0.321 | -0.154 | -0.037 | -0.094 | -0.205 | -0.204 | -0.058 | -0.060 | -0.281 | -0.574 | -0.194 | -0.040 | -0.030 | -0.047 | -0.182 | 0.407 | -0.343 |
----------------------------------------------------------------------¶
Model Building¶
Custom Functions for Model Training¶
# We create custom functions for model building since we reuse the same steps again and again during iteration
# The train-and-predict function trains the model, predicts on the same data and returns the fitted model, the predicted probabilities and the predicted classes based on the cutoff
def logreg_train_pred_fn(fX_train, fy_train, fcol, fcutoff):
fX_train_sm = sm.add_constant(fX_train[fcol])
flogm = sm.GLM(fy_train, fX_train_sm, family = sm.families.Binomial())
fres = flogm.fit()
fy_train_pred = fres.predict(fX_train_sm)
fy_train_pred = fy_train_pred.values.reshape(-1)
fy_train_pred_final = pd.DataFrame({'Converted':fy_train.values, 'Conv_Prob':fy_train_pred})
fy_train_pred_final['ID'] = fy_train.index
fy_train_pred_final['predicted'] = fy_train_pred_final.Conv_Prob.map(lambda x: 1 if x > fcutoff else 0)
return fres, fy_train_pred,fy_train_pred_final
# The matrix function returns confusion matrix and accuracy score
def logreg_metrics_fn(fy_train_pred_final):
fconfusion = confusion_matrix(fy_train_pred_final.Converted, fy_train_pred_final.predicted )
faccuracy = accuracy_score(fy_train_pred_final.Converted, fy_train_pred_final.predicted)
return fconfusion, faccuracy
# The vif function returns the vif score for the features
def logreg_VIF_score_fn(fX_train, fcol):
fvif = pd.DataFrame()
fvif['Features'] = fX_train[fcol].columns
fvif['VIF'] = [variance_inflation_factor(fX_train[fcol].values, i) for i in range(fX_train[fcol].shape[1])]
fvif['VIF'] = round(fvif['VIF'], 2)
fvif = fvif.sort_values(by = "VIF", ascending = False)
return fvif
Base Model¶
# Logistic regression model
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
res = logm1.fit()
# res.summary()
RFE - Recursive Feature Elimination¶
# Since the dataset has a lot of features, we perform RFE to eliminate insignificant features
logreg = LogisticRegression()
rfe = RFE(estimator=logreg, n_features_to_select=15) # running RFE with 15 variables as output
rfe = rfe.fit(X_train, y_train)
rfe_feature_Ranking = list(zip(X_train.columns, rfe.support_, rfe.ranking_))
rfe_sorted = sorted(rfe_feature_Ranking, key=lambda x : x[2])
rfe_sorted[:15]
[('ttime_on_site', True, 1), ('do_not_email_1', True, 1), ('last_activity_Converted to Lead', True, 1), ('last_activity_Email Bounced', True, 1), ('last_activity_Email Opened', True, 1), ('last_activity_Olark Chat Conversation', True, 1), ('last_activity_Page Visited on Website', True, 1), ('lead_origin_Lead Add Form', True, 1), ('lead_source_Olark Chat', True, 1), ('lead_source_Welingak Website', True, 1), ('lead_profile_Dual Specialization Student', True, 1), ('lead_profile_Lateral Student', True, 1), ('lead_profile_Potential Lead', True, 1), ('lead_profile_Student of SomeSchool', True, 1), ('curr_occupation_Working Professional', True, 1)]
col = X_train.columns[rfe.support_]
# X_train.columns[~rfe.support_]
Model 1¶
# We now iterate on the model, dropping insignificant features until we reach an optimum result
cutoff = 0.5
res, y_train_pred,y_train_pred_final = logreg_train_pred_fn(X_train, y_train, col, cutoff)
confusion, accuracy = logreg_metrics_fn(y_train_pred_final)
vif = logreg_VIF_score_fn(X_train, col)
print('Model Summary:') # Model Summary:
res.summary()
print('\nVIF Score:') # VIF Score:
vif
Model Summary:
Dep. Variable: | converted | No. Observations: | 4473 |
---|---|---|---|
Model: | GLM | Df Residuals: | 4457 |
Model Family: | Binomial | Df Model: | 15 |
Link Function: | Logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -1931.5 |
Date: | Sun, 27 Oct 2024 | Deviance: | 3863.0 |
Time: | 08:13:02 | Pearson chi2: | 4.66e+03 |
No. Iterations: | 22 | Pseudo R-squ. (CS): | 0.4056 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 0.0787 | 85.559 | 0.001 | 0.999 | -167.613 | 167.770 |
ttime_on_site | 1.1035 | 0.048 | 23.063 | 0.000 | 1.010 | 1.197 |
do_not_email_1 | -0.3114 | 0.056 | -5.548 | 0.000 | -0.421 | -0.201 |
last_activity_Converted to Lead | -0.3903 | 0.049 | -7.920 | 0.000 | -0.487 | -0.294 |
last_activity_Email Bounced | -0.2913 | 0.072 | -4.069 | 0.000 | -0.432 | -0.151 |
last_activity_Email Opened | -0.3885 | 0.043 | -8.977 | 0.000 | -0.473 | -0.304 |
last_activity_Olark Chat Conversation | -0.4788 | 0.048 | -9.929 | 0.000 | -0.573 | -0.384 |
last_activity_Page Visited on Website | -0.2793 | 0.043 | -6.456 | 0.000 | -0.364 | -0.195 |
lead_origin_Lead Add Form | 0.9053 | 0.071 | 12.833 | 0.000 | 0.767 | 1.044 |
lead_source_Olark Chat | 0.5089 | 0.043 | 11.836 | 0.000 | 0.425 | 0.593 |
lead_source_Welingak Website | 0.3529 | 0.109 | 3.250 | 0.001 | 0.140 | 0.566 |
lead_profile_Dual Specialization Student | 1.3589 | 1024.498 | 0.001 | 0.999 | -2006.620 | 2009.337 |
lead_profile_Lateral Student | 1.3798 | 1027.316 | 0.001 | 0.999 | -2012.122 | 2014.882 |
lead_profile_Potential Lead | 0.6317 | 0.043 | 14.716 | 0.000 | 0.548 | 0.716 |
lead_profile_Student of SomeSchool | -0.5945 | 0.116 | -5.115 | 0.000 | -0.822 | -0.367 |
curr_occupation_Working Professional | 0.6817 | 0.057 | 11.905 | 0.000 | 0.569 | 0.794 |
VIF Score:
Features | VIF | |
---|---|---|
7 | lead_origin_Lead Add Form | 1.620 |
1 | do_not_email_1 | 1.570 |
3 | last_activity_Email Bounced | 1.510 |
9 | lead_source_Welingak Website | 1.330 |
0 | ttime_on_site | 1.300 |
8 | lead_source_Olark Chat | 1.290 |
4 | last_activity_Email Opened | 1.260 |
5 | last_activity_Olark Chat Conversation | 1.220 |
12 | lead_profile_Potential Lead | 1.170 |
6 | last_activity_Page Visited on Website | 1.110 |
14 | curr_occupation_Working Professional | 1.110 |
2 | last_activity_Converted to Lead | 1.100 |
13 | lead_profile_Student of SomeSchool | 1.050 |
10 | lead_profile_Dual Specialization Student | 1.010 |
11 | lead_profile_Lateral Student | 1.010 |
Model 2¶
# 'lead_profile_Lateral Student' has a high p-value, therefore we drop it.
# after dropping we rebuild the model and review the stats
col = col.drop('lead_profile_Lateral Student')
col
cutoff = 0.5
res, y_train_pred,y_train_pred_final = logreg_train_pred_fn(X_train, y_train, col, cutoff)
confusion, accuracy = logreg_metrics_fn(y_train_pred_final)
vif = logreg_VIF_score_fn(X_train, col)
print('Model Summary:') # Model Summary:
res.summary()
print('\nVIF Score:') # VIF Score:
vif
Index(['ttime_on_site', 'do_not_email_1', 'last_activity_Converted to Lead', 'last_activity_Email Bounced', 'last_activity_Email Opened', 'last_activity_Olark Chat Conversation', 'last_activity_Page Visited on Website', 'lead_origin_Lead Add Form', 'lead_source_Olark Chat', 'lead_source_Welingak Website', 'lead_profile_Dual Specialization Student', 'lead_profile_Potential Lead', 'lead_profile_Student of SomeSchool', 'curr_occupation_Working Professional'], dtype='object')
Model Summary:
Dep. Variable: | converted | No. Observations: | 4473 |
---|---|---|---|
Model: | GLM | Df Residuals: | 4458 |
Model Family: | Binomial | Df Model: | 14 |
Link Function: | Logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -1938.9 |
Date: | Sun, 27 Oct 2024 | Deviance: | 3877.8 |
Time: | 08:13:03 | Pearson chi2: | 4.68e+03 |
No. Iterations: | 22 | Pseudo R-squ. (CS): | 0.4036 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 0.0044 | 59.431 | 7.36e-05 | 1.000 | -116.478 | 116.487 |
ttime_on_site | 1.1074 | 0.048 | 23.175 | 0.000 | 1.014 | 1.201 |
do_not_email_1 | -0.3051 | 0.056 | -5.481 | 0.000 | -0.414 | -0.196 |
last_activity_Converted to Lead | -0.3907 | 0.049 | -7.926 | 0.000 | -0.487 | -0.294 |
last_activity_Email Bounced | -0.2950 | 0.072 | -4.125 | 0.000 | -0.435 | -0.155 |
last_activity_Email Opened | -0.3814 | 0.043 | -8.835 | 0.000 | -0.466 | -0.297 |
last_activity_Olark Chat Conversation | -0.4783 | 0.048 | -9.920 | 0.000 | -0.573 | -0.384 |
last_activity_Page Visited on Website | -0.2804 | 0.043 | -6.479 | 0.000 | -0.365 | -0.196 |
lead_origin_Lead Add Form | 0.9133 | 0.070 | 12.981 | 0.000 | 0.775 | 1.051 |
lead_source_Olark Chat | 0.5076 | 0.043 | 11.830 | 0.000 | 0.424 | 0.592 |
lead_source_Welingak Website | 0.3500 | 0.108 | 3.226 | 0.001 | 0.137 | 0.563 |
lead_profile_Dual Specialization Student | 1.3576 | 1024.557 | 0.001 | 0.999 | -2006.737 | 2009.452 |
lead_profile_Potential Lead | 0.6265 | 0.043 | 14.605 | 0.000 | 0.542 | 0.711 |
lead_profile_Student of SomeSchool | -0.5976 | 0.116 | -5.133 | 0.000 | -0.826 | -0.369 |
curr_occupation_Working Professional | 0.6840 | 0.057 | 11.970 | 0.000 | 0.572 | 0.796 |
VIF Score:
Features | VIF | |
---|---|---|
7 | lead_origin_Lead Add Form | 1.610 |
1 | do_not_email_1 | 1.570 |
3 | last_activity_Email Bounced | 1.510 |
9 | lead_source_Welingak Website | 1.330 |
0 | ttime_on_site | 1.300 |
8 | lead_source_Olark Chat | 1.290 |
4 | last_activity_Email Opened | 1.260 |
5 | last_activity_Olark Chat Conversation | 1.220 |
11 | lead_profile_Potential Lead | 1.160 |
6 | last_activity_Page Visited on Website | 1.110 |
13 | curr_occupation_Working Professional | 1.110 |
2 | last_activity_Converted to Lead | 1.100 |
12 | lead_profile_Student of SomeSchool | 1.050 |
10 | lead_profile_Dual Specialization Student | 1.010 |
Model 3¶
# 'lead_profile_Dual Specialization Student' has a high p-value, therefore we drop it.
# after dropping we rebuild the model and review the stats
col = col.drop('lead_profile_Dual Specialization Student')
col
cutoff = 0.5
res, y_train_pred,y_train_pred_final = logreg_train_pred_fn(X_train, y_train, col, cutoff)
confusion, accuracy = logreg_metrics_fn(y_train_pred_final)
vif = logreg_VIF_score_fn(X_train, col)
print('Model Summary:') # Model Summary:
res.summary()
print('\nVIF Score:') # VIF Score:
vif
# print('\nY_Predicted Values:') # Y_Predicted Values:
# y_train_pred
# print('\nY_Predicted Cutoff:') # Y_Predicted Cutoff:
# y_train_pred_final
print('\nConfusion Matrix:') # Confusion Matrix:
confusion
print(f'\nAccuracy Score: {accuracy}\n') # Accuracy Score:
Index(['ttime_on_site', 'do_not_email_1', 'last_activity_Converted to Lead', 'last_activity_Email Bounced', 'last_activity_Email Opened', 'last_activity_Olark Chat Conversation', 'last_activity_Page Visited on Website', 'lead_origin_Lead Add Form', 'lead_source_Olark Chat', 'lead_source_Welingak Website', 'lead_profile_Potential Lead', 'lead_profile_Student of SomeSchool', 'curr_occupation_Working Professional'], dtype='object')
Model Summary:
Dep. Variable: | converted | No. Observations: | 4473 |
---|---|---|---|
Model: | GLM | Df Residuals: | 4459 |
Model Family: | Binomial | Df Model: | 13 |
Link Function: | Logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -1947.8 |
Date: | Sun, 27 Oct 2024 | Deviance: | 3895.5 |
Time: | 08:13:03 | Pearson chi2: | 4.71e+03 |
No. Iterations: | 7 | Pseudo R-squ. (CS): | 0.4012 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -0.0661 | 0.049 | -1.346 | 0.178 | -0.162 | 0.030 |
ttime_on_site | 1.1102 | 0.048 | 23.248 | 0.000 | 1.017 | 1.204 |
do_not_email_1 | -0.2957 | 0.055 | -5.356 | 0.000 | -0.404 | -0.187 |
last_activity_Converted to Lead | -0.3917 | 0.049 | -7.942 | 0.000 | -0.488 | -0.295 |
last_activity_Email Bounced | -0.3009 | 0.071 | -4.211 | 0.000 | -0.441 | -0.161 |
last_activity_Email Opened | -0.3761 | 0.043 | -8.737 | 0.000 | -0.460 | -0.292 |
last_activity_Olark Chat Conversation | -0.4792 | 0.048 | -9.941 | 0.000 | -0.574 | -0.385 |
last_activity_Page Visited on Website | -0.2811 | 0.043 | -6.509 | 0.000 | -0.366 | -0.196 |
lead_origin_Lead Add Form | 0.9113 | 0.070 | 12.963 | 0.000 | 0.773 | 1.049 |
lead_source_Olark Chat | 0.5081 | 0.043 | 11.865 | 0.000 | 0.424 | 0.592 |
lead_source_Welingak Website | 0.3494 | 0.108 | 3.220 | 0.001 | 0.137 | 0.562 |
lead_profile_Potential Lead | 0.6214 | 0.043 | 14.496 | 0.000 | 0.537 | 0.705 |
lead_profile_Student of SomeSchool | -0.6004 | 0.116 | -5.154 | 0.000 | -0.829 | -0.372 |
curr_occupation_Working Professional | 0.6897 | 0.057 | 12.110 | 0.000 | 0.578 | 0.801 |
VIF Score:
Features | VIF | |
---|---|---|
7 | lead_origin_Lead Add Form | 1.610 |
1 | do_not_email_1 | 1.570 |
3 | last_activity_Email Bounced | 1.510 |
9 | lead_source_Welingak Website | 1.330 |
0 | ttime_on_site | 1.300 |
8 | lead_source_Olark Chat | 1.290 |
4 | last_activity_Email Opened | 1.260 |
5 | last_activity_Olark Chat Conversation | 1.220 |
10 | lead_profile_Potential Lead | 1.160 |
6 | last_activity_Page Visited on Website | 1.110 |
12 | curr_occupation_Working Professional | 1.110 |
2 | last_activity_Converted to Lead | 1.100 |
11 | lead_profile_Student of SomeSchool | 1.050 |
Confusion Matrix:
array([[1927, 422], [ 478, 1646]])
Accuracy Score: 0.7987927565392354
# taking values for TP , TN, FP, FN
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives
# Calculate the sensitivity
Sensitivity = TP/(TP+FN)
# Calculate the specificity
Specificity = TN/(TN+FP)
# Calculate Precision
Precision = TP/(TP+FP)
# Calculate Recall
Recall = TP/(TP+FN)
print(f'Sensitivity - {round(Sensitivity,3)}\nspecificity - {round(Specificity,3)}\nPrecision - {round(Precision,3)}\nRecall - {round(Recall,3)}')
Sensitivity - 0.775 specificity - 0.82 Precision - 0.796 Recall - 0.775
Finding Optimal Cutoff Point¶
# Let's create columns with different probability cutoffs
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
y_train_pred_final[i]= y_train_pred_final.Conv_Prob.map(lambda x: 1 if x > i else 0)
y_train_pred_final.head()
Converted | Conv_Prob | ID | predicted | 0.000 | 0.100 | 0.200 | 0.300 | 0.400 | 0.500 | 0.600 | 0.700 | 0.800 | 0.900 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.844 | 533 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 |
1 | 1 | 0.919 | 846 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
2 | 1 | 0.844 | 7546 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 |
3 | 0 | 0.589 | 8631 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 |
4 | 0 | 0.051 | 7824 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Now let's calculate accuracy sensitivity and specificity for various probability cutoffs.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
cm1 = confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i] )
total1=sum(sum(cm1))
accuracy = (cm1[0,0]+cm1[1,1])/total1
speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
print(cutoff_df)
prob | accuracy | sensi | speci | 
---|---|---|---|---|
0.000 | 0.000 | 0.475 | 1.000 | 0.000 |
0.100 | 0.100 | 0.588 | 0.990 | 0.225 |
0.200 | 0.200 | 0.718 | 0.947 | 0.511 |
0.300 | 0.300 | 0.770 | 0.897 | 0.656 |
0.400 | 0.400 | 0.798 | 0.825 | 0.774 |
0.500 | 0.500 | 0.799 | 0.775 | 0.820 |
0.600 | 0.600 | 0.782 | 0.680 | 0.875 |
0.700 | 0.700 | 0.765 | 0.585 | 0.927 |
0.800 | 0.800 | 0.730 | 0.480 | 0.957 |
0.900 | 0.900 | 0.674 | 0.330 | 0.986 |
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()
<Axes: xlabel='prob'>
y_train_pred_final['final_predicted'] = y_train_pred_final.Conv_Prob.map( lambda x: 1 if x > 0.43 else 0)
y_train_pred_final.head()
Converted | Conv_Prob | ID | predicted | 0.000 | 0.100 | 0.200 | 0.300 | 0.400 | 0.500 | 0.600 | 0.700 | 0.800 | 0.900 | final_predicted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.844 | 533 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 |
1 | 1 | 0.919 | 846 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
2 | 1 | 0.844 | 7546 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 |
3 | 0 | 0.589 | 8631 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
4 | 0 | 0.051 | 7824 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Let's check the overall accuracy.
accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)
confusion = confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
confusion
0.7985691929353901
array([[1855, 494], [ 407, 1717]])
# taking values for TP , TN, FP, FN
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives
# Calculate the sensitivity
Sensitivity = TP/(TP+FN)
# Calculate the specificity
Specificity = TN/(TN+FP)
# Calculate Precision
Precision = TP/(TP+FP)
# Calculate Recall
Recall = TP/(TP+FN)
print(f'Sensitivity - {round(Sensitivity,3)}\nspecificity - {round(Specificity,3)}\nPrecision - {round(Precision,3)}\nRecall - {round(Recall,3)}')
Sensitivity - 0.808 specificity - 0.79 Precision - 0.777 Recall - 0.808
----------------------------------------------------------------------¶
ROC Curve and Precision - Recall Curve¶
# create a function to draw the ROC curve with its labels and AUC
def draw_roc( actual, probs ):
fpr, tpr, thresholds = roc_curve( actual, probs, drop_intermediate = False )
auc_score = roc_auc_score( actual, probs )
plt.figure(figsize=(5, 5))
plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
return None
fpr, tpr, thresholds = roc_curve( y_train_pred_final.Converted, y_train_pred_final.Conv_Prob, drop_intermediate = False )
draw_roc(y_train_pred_final.Converted, y_train_pred_final.Conv_Prob)
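As an aside (a hedged sketch, not part of the original analysis), the fpr, tpr, and thresholds arrays returned above can also be used to pick a cutoff analytically by maximising Youden's J statistic (TPR - FPR):
# Threshold that maximises Youden's J = TPR - FPR on the training probabilities.
j_scores = tpr - fpr
best_idx = np.argmax(j_scores)
print(f'ROC-optimal cutoff ~ {thresholds[best_idx]:.2f} (TPR = {tpr[best_idx]:.3f}, FPR = {fpr[best_idx]:.3f})')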
# Check precision and recall using sklearn's built-in functions on the earlier 'predicted' column.
precision_score(y_train_pred_final.Converted, y_train_pred_final.predicted)
recall_score(y_train_pred_final.Converted, y_train_pred_final.predicted)
0.7959381044487428
0.7749529190207156
# Plot the precision and recall curves against the threshold to find a suitable cutoff.
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Conv_Prob)
plt.plot(thresholds, p[:-1], "b")
plt.plot(thresholds, r[:-1], "r")
plt.title('Precision Recall Curve')
plt.show();
The precision and recall curves intersect at approximately 0.45, so we use this cutoff to check the accuracy.
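A small sketch (using the p, r, and thresholds arrays computed above; not part of the original flow) to confirm the crossover numerically rather than reading it off the plot:
# Threshold where precision and recall are closest to each other.
crossover_idx = np.argmin(np.abs(p[:-1] - r[:-1]))
print(f'Precision ~ Recall at cutoff ~ {thresholds[crossover_idx]:.2f} (precision = {p[crossover_idx]:.3f}, recall = {r[crossover_idx]:.3f})')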
# Apply the precision-recall cutoff to generate predictions on the train dataset.
y_train_pred_final['final_predicted'] = y_train_pred_final.Conv_Prob.map( lambda x: 1 if x > 0.45 else 0)
y_train_pred_final.head()
Converted | Conv_Prob | ID | predicted | 0.000 | 0.100 | 0.200 | 0.300 | 0.400 | 0.500 | 0.600 | 0.700 | 0.800 | 0.900 | final_predicted | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.844 | 533 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 |
1 | 1 | 0.919 | 846 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
2 | 1 | 0.844 | 7546 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 |
3 | 0 | 0.589 | 8631 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
4 | 0 | 0.051 | 7824 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Let's check the overall accuracy.
accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)
confusion = confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
confusion
0.7990163201430807
array([[1873, 476], [ 423, 1701]])
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives
# Calculate the sensitivity
Sensitivity = TP/(TP+FN)
# Calculate the specificity
Specificity = TN/(TN+FP)
# Calculate Precision
Precision = TP/(TP+FP)
# Calculate Recall
Recall = TP/(TP+FN)
print(f'Sensitivity - {round(Sensitivity,3)}\nspecificity - {round(Specificity,3)}\nPrecision - {round(Precision,3)}\nRecall - {round(Recall,3)}')
Sensitivity - 0.801 specificity - 0.797 Precision - 0.781 Recall - 0.801
---------------------------------------------------------------¶
Predictions on the test set¶
Custom Functions for Test¶
# Helper function to generate predictions on the test data using the fitted model and chosen cutoff.
def logreg_test_pred_fn(fX_test, fy_test, fcol, fcutoff, fres):
fX_test_sm = sm.add_constant(fX_test[fcol])
fy_test_pred = fres.predict(fX_test_sm)
fy_test_pred = fy_test_pred.values.reshape(-1)
fy_test_pred_final = pd.DataFrame({'Converted':fy_test.values, 'Conv_Prob':fy_test_pred})
fy_test_pred_final['ID'] = fy_test.index
fy_test_pred_final['predicted'] = fy_test_pred_final.Conv_Prob.map(lambda x: 1 if x > fcutoff else 0)
return fres, fy_test_pred,fy_test_pred_final
# Helper function to compute the confusion matrix and accuracy for the test predictions.
def logreg_test_metrics_fn(fy_test_pred_final):
fconfusion = confusion_matrix(fy_test_pred_final.Converted, fy_test_pred_final.predicted )
faccuracy = accuracy_score(fy_test_pred_final.Converted, fy_test_pred_final.predicted)
return fconfusion, faccuracy
# Helper function to compute VIF scores and check for multicollinearity among the selected features.
def logreg_test_VIF_score_fn(fX_test, fcol):
fvif = pd.DataFrame()
fvif['Features'] = fX_test[fcol].columns
fvif['VIF'] = [variance_inflation_factor(fX_test[fcol].values, i) for i in range(fX_test[fcol].shape[1])]
fvif['VIF'] = round(fvif['VIF'], 2)
fvif = fvif.sort_values(by = "VIF", ascending = False)
return fvif
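Since the TP/TN/FP/FN block is repeated after every confusion matrix, the metrics helper could also be extended as in the hypothetical sketch below (logreg_test_full_metrics_fn is not part of the original code):
# Derive sensitivity, specificity, and precision directly from the confusion matrix,
# so the TP/TN/FP/FN unpacking does not need to be repeated for each cutoff.
def logreg_test_full_metrics_fn(fy_test_pred_final):
    fconfusion = confusion_matrix(fy_test_pred_final.Converted, fy_test_pred_final.predicted)
    faccuracy = accuracy_score(fy_test_pred_final.Converted, fy_test_pred_final.predicted)
    TN, FP, FN, TP = fconfusion.ravel()  # sklearn orders the counts as TN, FP, FN, TP
    fmetrics = {'accuracy': round(faccuracy, 3),
                'sensitivity': round(TP / (TP + FN), 3),
                'specificity': round(TN / (TN + FP), 3),
                'precision': round(TP / (TP + FP), 3)}
    return fconfusion, fmetrics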
Model Validation on Test Data¶
# Scale the test data using the scaler fitted on the training data.
X_test[to_scale] = scaler.transform(X_test[to_scale])
X_test[col].head(2)
X_test.shape
ttime_on_site | do_not_email_1 | last_activity_Converted to Lead | last_activity_Email Bounced | last_activity_Email Opened | last_activity_Olark Chat Conversation | last_activity_Page Visited on Website | lead_origin_Lead Add Form | lead_source_Olark Chat | lead_source_Welingak Website | lead_profile_Potential Lead | lead_profile_Student of SomeSchool | curr_occupation_Working Professional | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2938 | 1.011 | -0.269 | -0.220 | -0.167 | -0.792 | -0.269 | -0.271 | -0.315 | -0.406 | -0.145 | -0.574 | -0.194 | 2.915 |
4301 | -0.362 | 3.716 | -0.220 | -0.167 | -0.792 | -0.269 | -0.271 | -0.315 | 2.461 | -0.145 | 1.742 | -0.194 | -0.343 |
(1918, 83)
# The sensitivity-specificity analysis on the train set gave a cutoff of 0.43, so apply it to the test set.
cutoff = 0.43
res, y_test_pred, y_test_pred_final = logreg_test_pred_fn(X_test, y_test, col, cutoff, res)
confusion, accuracy = logreg_test_metrics_fn(y_test_pred_final)
vif = logreg_test_VIF_score_fn(X_test, col)
# print('\nY_Predicted Values:') # Y_Predicted Values:
# y_test_pred
# print('\nY_Predicted Cutoff:') # Y_Predicted Cutoff:
# y_test_pred_final
print('\nConfusion_Matrix:') # Confusion Matrix:
confusion
print(f'\nAccuracy: {accuracy}\n') # Accuracy Score:
Confusion_Matrix:
array([[782, 195], [187, 754]])
Accuracy: 0.8008342022940563
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives
# Calculate the sensitivity
Sensitivity = TP/(TP+FN)
# Calculate the specificity
Specificity = TN/(TN+FP)
# Calculate Precision
Precision = TP/(TP+FP)
# Calculate Recall
Recall = TP/(TP+FN)
print(f'Sensitivity - {round(Sensitivity,3)}\nspecificity - {round(Specificity,3)}\nPrecision - {round(Precision,3)}\nRecall - {round(Recall,3)}')
Sensitivity - 0.801 specificity - 0.8 Precision - 0.795 Recall - 0.801
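As an additional sanity check (not part of the original flow), the AUC on the test-set probabilities can be compared with the train ROC plotted earlier; a similar value suggests the model generalises well.
# AUC on the test-set predicted probabilities.
roc_auc_score(y_test_pred_final.Converted, y_test_pred_final.Conv_Prob)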
# The precision-recall analysis on the train set gave a cutoff of 0.45, so apply it to the test set.
cutoff = 0.45
res, y_test_pred, y_test_pred_final = logreg_test_pred_fn(X_test, y_test, col, cutoff, res)
confusion, accuracy = logreg_test_metrics_fn(y_test_pred_final)
vif = logreg_test_VIF_score_fn(X_test, col)
# print('\nY_Predicted Values:') # Y_Predicted Values:
# y_test_pred
# print('\nY_Predicted Cutoff:') # Y_Predicted Cutoff:
# y_test_pred_final
print('\nConfusion_Matrix:') # Confusion Matrix:
confusion
print(f'\nAccuracy: {accuracy}\n') # Accuracy Score:
Confusion_Matrix:
array([[789, 188], [195, 746]])
Accuracy: 0.8003128258602711
TP = confusion[1,1] # true positive
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives
# Calculate the sensitivity
Sensitivity = TP/(TP+FN)
# Calculate the specificity
Specificity = TN/(TN+FP)
# Calculate Precision
Precision = TP/(TP+FP)
# Calculate Recall
Recall = TP/(TP+FN)
print(f'Sensitivity - {round(Sensitivity,3)}\nspecificity - {round(Specificity,3)}\nPrecision - {round(Precision,3)}\nRecall - {round(Recall,3)}')
Sensitivity - 0.793 specificity - 0.808 Precision - 0.799 Recall - 0.793
The overall test accuracy for Approach 02 is ~80%. The precision-recall curve yields a slightly higher cutoff (0.45) than the sensitivity-specificity analysis (0.43), and at both cutoffs sensitivity, specificity, and precision all fall in the range of 79% to 80%.
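As an illustrative follow-up (assuming the y_test_pred_final frame built above; not part of the original notebook), the predicted conversion probabilities can be expressed as a 0-100 lead score so the sales team can prioritise the hottest leads:
# Convert conversion probabilities into a 0-100 lead score and list the highest-scoring leads.
y_test_pred_final['Lead_Score'] = (y_test_pred_final.Conv_Prob * 100).round().astype(int)
y_test_pred_final.sort_values('Lead_Score', ascending=False).head()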