1. Import all necessary libraries. LogisticRegression, Decision Tree classifier, Random Forest, Imblearn, RFE, PCA etc
import time as ct
start_time = ct.time()
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_theme(color_codes=True)
import matplotlib.pyplot as plt
import warnings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
%matplotlib inline
# Set custom display properties in pandas
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 900)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# %pip install fast_ml ## Required for constant feature identification package
# %conda install xgboost -y
# %conda install -c conda-forge imbalanced-learn -y
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTETomek
import xgboost as xgb
import statsmodels.api as sm
from sklearn.tree import plot_tree
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score, RocCurveDisplay, precision_recall_curve, f1_score, classification_report, accuracy_score
2. Initialize the telecom_churn dataset
telecom_df = pd.read_csv("./telecom_churn_data.csv")
# check columns names and datatype
<class 'pandas.core.frame.DataFrame'> RangeIndex: 99999 entries, 0 to 99998 Columns: 226 entries, mobile_number to sep_vbc_3g dtypes: float64(179), int64(35), object(12) memory usage: 172.4+ MB
Custom Functions¶
1. func combine_features: Combines similar features that are part of "Good phase" by stripping unique identifiers. It then takes the mean among the features and thereby creating a new derived feature.2. func find_outliers: Outlier Analysis using Boxplot IQR method.3. func check_col_null_pct: Check the columns null percentage and return the columns based on the given threshold value
def combine_features(df, cols, pat1='_6' , pat2='_7' ,to_append='_good_phase'):
drop_lst = []
new_cols = []
month_col = cols[cols.str.contains(pat='jun_|jul_')].str.replace(pat="(jun_|jul_)", repl="", regex=True).unique()
cols = cols.str.replace(pat="(_\d$|jun_|jul_)", repl="", regex=True).unique()
for col in cols:
if col in month_col:
new_col = col + to_append
col1 = 'jun_' + col
col2 = 'jul_' + col
new_col = col + to_append
col1 = col + pat1
col2 = col + pat2
df[new_col] = df[[col1, col2]].mean(axis=1)
return drop_lst, new_cols
def find_outliers(df):
temp_df = pd.DataFrame(columns= ['col', 'lower_threshold', 'iqr_q1', 'iqr', 'iqr_q3', 'upper_threshold', 'outliers_cnt'])
num_cols = df.select_dtypes(include=np.number).columns
for col in num_cols:
arr = df[col][df[col].notna()]
iqr_q3 = np.quantile(df[col], 0.75)
iqr_q1 = np.quantile(df[col], 0.25)
iqr = iqr_q3 - iqr_q1
iqr_upper_threshold = iqr_q3 + (1.5 * iqr)
iqr_lower_threshold = iqr_q1 - (1.5 * iqr)
outliers = arr[(arr > iqr_upper_threshold) | (arr < iqr_lower_threshold)]
to_add = pd.Series({'col': col, 'lower_threshold': iqr_lower_threshold, 'iqr_q1': iqr_q1, 'iqr': iqr, 'iqr_q3': iqr_q3, 'upper_threshold': iqr_upper_threshold, 'outliers_cnt': len(outliers)})
temp_df = pd.concat([temp_df,to_add.to_frame().T])
return temp_df
def check_col_null_pct(df, thresh=0, incl_all=1):
col_null_pct = (df.isna().sum()/len(df))*100
cols_na_abv_thresh = col_null_pct[col_null_pct > thresh]
cols_na_bel_thresh = col_null_pct[col_null_pct < thresh]
return cols_na_abv_thresh.sort_values(ascending=False) , cols_na_bel_thresh.sort_values(ascending=False)
Data Preprocessing¶
3. Check the shape and size of the dataset.
(99999, 226)
Index(['mobile_number', 'circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 'last_date_of_month_6', 'last_date_of_month_7', 'last_date_of_month_8', 'last_date_of_month_9', 'arpu_6', ... 'sachet_3g_9', 'fb_user_6', 'fb_user_7', 'fb_user_8', 'fb_user_9', 'aon', 'aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g', 'sep_vbc_3g'], dtype='object', length=226)
4. Check for any duplicate entries in the data set. Also check if there is any duplicates in mobile number column.
5. Check for column null percentage.
cols_with_gt40_na , cols_with_le40_na = check_col_null_pct(telecom_df, 40)
telecom_df = telecom_df[cols_with_le40_na.index.sort_values(ascending=True)]
cols_with_le40_na[cols_with_le40_na > 0]
loc_og_t2m_mou_9 7.745 loc_ic_t2m_mou_9 7.745 offnet_mou_9 7.745 std_ic_t2f_mou_9 7.745 roam_ic_mou_9 7.745 std_og_t2t_mou_9 7.745 roam_og_mou_9 7.745 std_ic_t2m_mou_9 7.745 loc_og_t2t_mou_9 7.745 std_ic_t2o_mou_9 7.745 loc_og_mou_9 7.745 std_ic_t2t_mou_9 7.745 isd_og_mou_9 7.745 loc_ic_t2f_mou_9 7.745 loc_og_t2c_mou_9 7.745 loc_og_t2f_mou_9 7.745 loc_ic_t2t_mou_9 7.745 std_og_t2m_mou_9 7.745 spl_ic_mou_9 7.745 std_ic_mou_9 7.745 ic_others_9 7.745 std_og_mou_9 7.745 isd_ic_mou_9 7.745 spl_og_mou_9 7.745 onnet_mou_9 7.745 std_og_t2c_mou_9 7.745 loc_ic_mou_9 7.745 og_others_9 7.745 std_og_t2f_mou_9 7.745 std_og_t2t_mou_8 5.378 std_og_t2c_mou_8 5.378 loc_ic_t2m_mou_8 5.378 loc_ic_t2f_mou_8 5.378 og_others_8 5.378 std_og_t2m_mou_8 5.378 loc_og_mou_8 5.378 std_og_mou_8 5.378 loc_ic_mou_8 5.378 std_og_t2f_mou_8 5.378 loc_og_t2c_mou_8 5.378 spl_og_mou_8 5.378 loc_ic_t2t_mou_8 5.378 isd_og_mou_8 5.378 loc_og_t2f_mou_8 5.378 std_ic_t2f_mou_8 5.378 ic_others_8 5.378 isd_ic_mou_8 5.378 spl_ic_mou_8 5.378 std_ic_mou_8 5.378 onnet_mou_8 5.378 std_ic_t2o_mou_8 5.378 roam_ic_mou_8 5.378 offnet_mou_8 5.378 loc_og_t2m_mou_8 5.378 roam_og_mou_8 5.378 std_ic_t2m_mou_8 5.378 loc_og_t2t_mou_8 5.378 std_ic_t2t_mou_8 5.378 date_of_last_rech_9 4.760 std_ic_t2f_mou_6 3.937 spl_og_mou_6 3.937 loc_ic_mou_6 3.937 ic_others_6 3.937 isd_ic_mou_6 3.937 std_ic_t2m_mou_6 3.937 spl_ic_mou_6 3.937 loc_ic_t2m_mou_6 3.937 loc_ic_t2t_mou_6 3.937 std_ic_mou_6 3.937 std_ic_t2t_mou_6 3.937 std_ic_t2o_mou_6 3.937 loc_ic_t2f_mou_6 3.937 og_others_6 3.937 roam_ic_mou_6 3.937 offnet_mou_6 3.937 roam_og_mou_6 3.937 std_og_t2f_mou_6 3.937 loc_og_t2f_mou_6 3.937 loc_og_t2t_mou_6 3.937 std_og_t2c_mou_6 3.937 std_og_t2t_mou_6 3.937 std_og_t2m_mou_6 3.937 loc_og_mou_6 3.937 onnet_mou_6 3.937 std_og_mou_6 3.937 loc_og_t2m_mou_6 3.937 loc_og_t2c_mou_6 3.937 isd_og_mou_6 3.937 std_ic_t2f_mou_7 3.859 std_ic_t2m_mou_7 3.859 roam_og_mou_7 3.859 loc_og_t2t_mou_7 3.859 isd_og_mou_7 3.859 roam_ic_mou_7 3.859 spl_og_mou_7 3.859 offnet_mou_7 3.859 onnet_mou_7 3.859 std_ic_mou_7 3.859 spl_ic_mou_7 3.859 isd_ic_mou_7 3.859 ic_others_7 3.859 loc_og_t2m_mou_7 3.859 std_ic_t2o_mou_7 3.859 std_ic_t2t_mou_7 3.859 std_og_t2c_mou_7 3.859 loc_og_t2c_mou_7 3.859 loc_ic_mou_7 3.859 loc_og_mou_7 3.859 loc_ic_t2f_mou_7 3.859 std_og_t2t_mou_7 3.859 loc_ic_t2m_mou_7 3.859 std_og_t2m_mou_7 3.859 std_og_t2f_mou_7 3.859 loc_ic_t2t_mou_7 3.859 std_og_mou_7 3.859 loc_og_t2f_mou_7 3.859 og_others_7 3.859 date_of_last_rech_8 3.622 date_of_last_rech_7 1.767 last_date_of_month_9 1.659 date_of_last_rech_6 1.607 last_date_of_month_8 1.100 loc_og_t2o_mou 1.018 std_og_t2o_mou 1.018 loc_ic_t2o_mou 1.018 last_date_of_month_7 0.601 dtype: float64
6. Check for any constant features or feature that has only one label.
from fast_ml import feature_selection as fs
const_features_df = fs.get_constant_features(telecom_df, threshold=100, dropna=True)
Desc | Var | Value | Perc | |
0 | Constant | circle_id | 109 | 100.000 |
1 | Constant | last_date_of_month_6 | 6/30/2014 | 100.000 |
2 | Constant | last_date_of_month_7 | 7/31/2014 | 100.000 |
3 | Constant | last_date_of_month_8 | 8/31/2014 | 100.000 |
4 | Constant | last_date_of_month_9 | 9/30/2014 | 100.000 |
5 | Constant | loc_ic_t2o_mou | 0.000 | 100.000 |
6 | Constant | loc_og_t2o_mou | 0.000 | 100.000 |
7 | Constant | std_ic_t2o_mou_6 | 0.000 | 100.000 |
8 | Constant | std_ic_t2o_mou_7 | 0.000 | 100.000 |
9 | Constant | std_ic_t2o_mou_8 | 0.000 | 100.000 |
10 | Constant | std_ic_t2o_mou_9 | 0.000 | 100.000 |
11 | Constant | std_og_t2c_mou_6 | 0.000 | 100.000 |
12 | Constant | std_og_t2c_mou_7 | 0.000 | 100.000 |
13 | Constant | std_og_t2c_mou_8 | 0.000 | 100.000 |
14 | Constant | std_og_t2c_mou_9 | 0.000 | 100.000 |
15 | Constant | std_og_t2o_mou | 0.000 | 100.000 |
- circle_id, last_date_of_month_6, last_date_of_month_7, last_date_of_month_8, last_date_of_month_9, loc_ic_t2o_mou, loc_og_t2o_mou, std_ic_t2o_mou_6, std_ic_t2o_mou_7, std_ic_t2o_mou_8, std_ic_t2o_mou_9, std_og_t2c_mou_6, std_og_t2c_mou_7, std_og_t2c_mou_8, std_og_t2c_mou_9, std_og_t2o_mou
telecom_df = telecom_df.drop(const_features_df['Var'].to_list(), axis=1)
(99999, 170)
7. Plot the bar chart for columns with less than 10% of NULL values.
_,cols_with_le10_na = check_col_null_pct(telecom_df, 10)
cols_with_le10_na[(cols_with_le10_na > 0)];
8. Convert Date column to day in numbers format.
telecom_df[['date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8','date_of_last_rech_9']] = telecom_df[['date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8','date_of_last_rech_9']].apply(lambda x: pd.to_datetime(x)
9. Check for any outliers in the data set.
find_outliers(telecom_df).sort_values(by=['outliers_cnt', 'lower_threshold', 'iqr_q1', 'iqr', 'iqr_q3', 'upper_threshold'], ascending=[False, True, False, False, False, False]).head(20)
col | lower_threshold | iqr_q1 | iqr | iqr_q3 | upper_threshold | outliers_cnt | |
0 | vol_2g_mb_8 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 22785 |
0 | vol_2g_mb_7 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 22525 |
0 | vol_2g_mb_6 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 22268 |
0 | vol_2g_mb_9 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 21720 |
0 | last_day_rch_amt_9 | -75.000 | 0.000 | 50.000 | 50.000 | 125.000 | 17714 |
0 | aug_vbc_3g | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 16686 |
0 | jul_vbc_3g | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 16097 |
0 | vol_3g_mb_8 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 15899 |
0 | vol_3g_mb_7 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 15421 |
0 | vol_3g_mb_9 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 15361 |
0 | jun_vbc_3g | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 15129 |
0 | vol_3g_mb_6 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 14582 |
0 | sachet_2g_9 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 14178 |
0 | sachet_2g_8 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 14067 |
0 | sachet_2g_7 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 13305 |
0 | sachet_2g_6 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 12903 |
0 | total_og_mou_7 | -460.330 | 43.010 | 335.560 | 378.570 | 881.910 | 8624 |
0 | total_og_mou_8 | -458.400 | 38.580 | 331.320 | 369.900 | 866.880 | 8596 |
0 | total_og_mou_9 | -466.445 | 25.510 | 327.970 | 353.480 | 845.435 | 8534 |
0 | total_og_mou_6 | -447.440 | 44.740 | 328.120 | 372.860 | 865.040 | 8443 |
10. Impute Missing Values: Since there are lot of missing values in the dataset, we therefore decide to impute it. Also most of the columns are skewed, hence we use "median" as a strategy to impute it.
telecom_df_bk = telecom_df.copy()
# telecom_df = telecom_df_bk
to_impute_df = telecom_df.select_dtypes(include = np.number)
si = SimpleImputer(strategy='median')
imputed_arr = si.fit_transform(to_impute_df)
df_imputed = pd.DataFrame(imputed_arr, columns = to_impute_df.columns)
telecom_df = telecom_df[telecom_df.columns.difference(to_impute_df.columns)]
telecom_df = pd.concat([telecom_df, df_imputed], axis=1)
11. Filter the high value customers who have recharged more than 70% of the average recharge value during the good phase.
# telecom_df['total_rech_amt_good_phase'] = telecom_df[['total_rech_amt_6','total_rech_amt_7']].sum(axis=1)
# telecom_df = telecom_df.drop(['total_rech_amt_6','total_rech_amt_7'], axis=1)
# telecom_df = telecom_df[(telecom_df['total_rech_amt_good_phase'] >= telecom_df['total_rech_amt_good_phase'].quantile(0.7))]
telecom_df['total_rech_amt_good_phase'] = telecom_df[['total_rech_amt_6','total_rech_amt_7']].mean(axis=1)
telecom_df = telecom_df.drop(['total_rech_amt_6','total_rech_amt_7'], axis=1)
telecom_df = telecom_df[(telecom_df['total_rech_amt_good_phase'] >= telecom_df['total_rech_amt_good_phase'].quantile(0.7))]
12. Tag the churned customers (1 or 0) by applying conditions on the following fourth month columns: total_ic_mou_9, total_og_mou_9, vol_2g_mb_9, vol_3g_mb_9
telecom_df['churn'] = telecom_df.apply(lambda x: 1 if ((x['total_ic_mou_9'] < 1) & (x['total_og_mou_9'] < 1) & (x['vol_2g_mb_9'] < 1 ) & (x['vol_3g_mb_9'] < 1) ) else 0, axis=1)
13. Rename columns_8 as action phase based on business requirements.
telecom_df.columns = telecom_df.columns.str.replace(pat='_8',repl='_action_phase')
telecom_df = telecom_df.rename(columns={'aug_vbc_3g': 'vbc_3g_action_phase'})
Index([], dtype='object')
(30011, 170)
14. Find all columns related to churn phase or cols with _9 in name. and drop it
sep_cols_to_drop = telecom_df.filter(like='_9').columns.to_list()
['arpu_9', 'date_of_last_rech_9', 'ic_others_9', 'isd_ic_mou_9', 'isd_og_mou_9', 'last_day_rch_amt_9', 'loc_ic_mou_9', 'loc_ic_t2f_mou_9', 'loc_ic_t2m_mou_9', 'loc_ic_t2t_mou_9', 'loc_og_mou_9', 'loc_og_t2c_mou_9', 'loc_og_t2f_mou_9', 'loc_og_t2m_mou_9', 'loc_og_t2t_mou_9', 'max_rech_amt_9', 'monthly_2g_9', 'monthly_3g_9', 'offnet_mou_9', 'og_others_9', 'onnet_mou_9', 'roam_ic_mou_9', 'roam_og_mou_9', 'sachet_2g_9', 'sachet_3g_9', 'spl_ic_mou_9', 'spl_og_mou_9', 'std_ic_mou_9', 'std_ic_t2f_mou_9', 'std_ic_t2m_mou_9', 'std_ic_t2t_mou_9', 'std_og_mou_9', 'std_og_t2f_mou_9', 'std_og_t2m_mou_9', 'std_og_t2t_mou_9', 'total_ic_mou_9', 'total_og_mou_9', 'total_rech_amt_9', 'total_rech_num_9', 'vol_2g_mb_9', 'vol_3g_mb_9', 'sep_vbc_3g']
telecom_df = telecom_df.drop(sep_cols_to_drop, axis=1)
(30011, 128)
15. Filter features that are identified by months _6 & _7, in order to combine those features as "good phase" and also check the datatypes
cols_to_combine = telecom_df.filter(regex='.*(jun_|jul_|_6|_7).*',axis=1).columns
<class 'pandas.core.frame.DataFrame'> Index: 30011 entries, 7 to 99997 Data columns (total 82 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 arpu_6 30011 non-null float64 1 arpu_7 30011 non-null float64 2 date_of_last_rech_6 30011 non-null float64 3 date_of_last_rech_7 30011 non-null float64 4 ic_others_6 30011 non-null float64 5 ic_others_7 30011 non-null float64 6 isd_ic_mou_6 30011 non-null float64 7 isd_ic_mou_7 30011 non-null float64 8 isd_og_mou_6 30011 non-null float64 9 isd_og_mou_7 30011 non-null float64 10 jul_vbc_3g 30011 non-null float64 11 jun_vbc_3g 30011 non-null float64 12 last_day_rch_amt_6 30011 non-null float64 13 last_day_rch_amt_7 30011 non-null float64 14 loc_ic_mou_6 30011 non-null float64 15 loc_ic_mou_7 30011 non-null float64 16 loc_ic_t2f_mou_6 30011 non-null float64 17 loc_ic_t2f_mou_7 30011 non-null float64 18 loc_ic_t2m_mou_6 30011 non-null float64 19 loc_ic_t2m_mou_7 30011 non-null float64 20 loc_ic_t2t_mou_6 30011 non-null float64 21 loc_ic_t2t_mou_7 30011 non-null float64 22 loc_og_mou_6 30011 non-null float64 23 loc_og_mou_7 30011 non-null float64 24 loc_og_t2c_mou_6 30011 non-null float64 25 loc_og_t2c_mou_7 30011 non-null float64 26 loc_og_t2f_mou_6 30011 non-null float64 27 loc_og_t2f_mou_7 30011 non-null float64 28 loc_og_t2m_mou_6 30011 non-null float64 29 loc_og_t2m_mou_7 30011 non-null float64 30 loc_og_t2t_mou_6 30011 non-null float64 31 loc_og_t2t_mou_7 30011 non-null float64 32 max_rech_amt_6 30011 non-null float64 33 max_rech_amt_7 30011 non-null float64 34 monthly_2g_6 30011 non-null float64 35 monthly_2g_7 30011 non-null float64 36 monthly_3g_6 30011 non-null float64 37 monthly_3g_7 30011 non-null float64 38 offnet_mou_6 30011 non-null float64 39 offnet_mou_7 30011 non-null float64 40 og_others_6 30011 non-null float64 41 og_others_7 30011 non-null float64 42 onnet_mou_6 30011 non-null float64 43 onnet_mou_7 30011 non-null float64 44 roam_ic_mou_6 30011 non-null float64 45 roam_ic_mou_7 30011 non-null float64 46 roam_og_mou_6 30011 non-null float64 47 roam_og_mou_7 30011 non-null float64 48 sachet_2g_6 30011 non-null float64 49 sachet_2g_7 30011 non-null float64 50 sachet_3g_6 30011 non-null float64 51 sachet_3g_7 30011 non-null float64 52 spl_ic_mou_6 30011 non-null float64 53 spl_ic_mou_7 30011 non-null float64 54 spl_og_mou_6 30011 non-null float64 55 spl_og_mou_7 30011 non-null float64 56 std_ic_mou_6 30011 non-null float64 57 std_ic_mou_7 30011 non-null float64 58 std_ic_t2f_mou_6 30011 non-null float64 59 std_ic_t2f_mou_7 30011 non-null float64 60 std_ic_t2m_mou_6 30011 non-null float64 61 std_ic_t2m_mou_7 30011 non-null float64 62 std_ic_t2t_mou_6 30011 non-null float64 63 std_ic_t2t_mou_7 30011 non-null float64 64 std_og_mou_6 30011 non-null float64 65 std_og_mou_7 30011 non-null float64 66 std_og_t2f_mou_6 30011 non-null float64 67 std_og_t2f_mou_7 30011 non-null float64 68 std_og_t2m_mou_6 30011 non-null float64 69 std_og_t2m_mou_7 30011 non-null float64 70 std_og_t2t_mou_6 30011 non-null float64 71 std_og_t2t_mou_7 30011 non-null float64 72 total_ic_mou_6 30011 non-null float64 73 total_ic_mou_7 30011 non-null float64 74 total_og_mou_6 30011 non-null float64 75 total_og_mou_7 30011 non-null float64 76 total_rech_num_6 30011 non-null float64 77 total_rech_num_7 30011 non-null float64 78 vol_2g_mb_6 30011 non-null float64 79 vol_2g_mb_7 30011 non-null float64 80 vol_3g_mb_6 30011 non-null float64 81 vol_3g_mb_7 30011 non-null float64 dtypes: float64(82) memory usage: 19.0 MB
16. Now that we have identified the features for "Good phase", using an aggr method we can go ahead combine those features. once features are combined as "Good Phase", we drop all those redundant features of months _6&_7.
drop_lst, new_cols = combine_features(df=telecom_df, cols=cols_to_combine, pat1='_6', pat2='_7',to_append='_good_phase')
telecom_df = telecom_df.drop(drop_lst, axis=1)
(30011, 87)
Index(['aon', 'arpu_action_phase', 'vbc_3g_action_phase', 'date_of_last_rech_action_phase', 'ic_others_action_phase', 'isd_ic_mou_action_phase', 'isd_og_mou_action_phase', 'last_day_rch_amt_action_phase', 'loc_ic_mou_action_phase', 'loc_ic_t2f_mou_action_phase', 'loc_ic_t2m_mou_action_phase', 'loc_ic_t2t_mou_action_phase', 'loc_og_mou_action_phase', 'loc_og_t2c_mou_action_phase', 'loc_og_t2f_mou_action_phase', 'loc_og_t2m_mou_action_phase', 'loc_og_t2t_mou_action_phase', 'max_rech_amt_action_phase', 'mobile_number', 'monthly_2g_action_phase', 'monthly_3g_action_phase', 'offnet_mou_action_phase', 'og_others_action_phase', 'onnet_mou_action_phase', 'roam_ic_mou_action_phase', 'roam_og_mou_action_phase', 'sachet_2g_action_phase', 'sachet_3g_action_phase', 'spl_ic_mou_action_phase', 'spl_og_mou_action_phase', 'std_ic_mou_action_phase', 'std_ic_t2f_mou_action_phase', 'std_ic_t2m_mou_action_phase', 'std_ic_t2t_mou_action_phase', 'std_og_mou_action_phase', 'std_og_t2f_mou_action_phase', 'std_og_t2m_mou_action_phase', 'std_og_t2t_mou_action_phase', 'total_ic_mou_action_phase', 'total_og_mou_action_phase', 'total_rech_amt_action_phase', 'total_rech_num_action_phase', 'vol_2g_mb_action_phase', 'vol_3g_mb_action_phase', 'total_rech_amt_good_phase', 'churn', 'arpu_good_phase', 'date_of_last_rech_good_phase', 'ic_others_good_phase', 'isd_ic_mou_good_phase', 'isd_og_mou_good_phase', 'vbc_3g_good_phase', 'last_day_rch_amt_good_phase', 'loc_ic_mou_good_phase', 'loc_ic_t2f_mou_good_phase', 'loc_ic_t2m_mou_good_phase', 'loc_ic_t2t_mou_good_phase', 'loc_og_mou_good_phase', 'loc_og_t2c_mou_good_phase', 'loc_og_t2f_mou_good_phase', 'loc_og_t2m_mou_good_phase', 'loc_og_t2t_mou_good_phase', 'max_rech_amt_good_phase', 'monthly_2g_good_phase', 'monthly_3g_good_phase', 'offnet_mou_good_phase', 'og_others_good_phase', 'onnet_mou_good_phase', 'roam_ic_mou_good_phase', 'roam_og_mou_good_phase', 'sachet_2g_good_phase', 'sachet_3g_good_phase', 'spl_ic_mou_good_phase', 'spl_og_mou_good_phase', 'std_ic_mou_good_phase', 'std_ic_t2f_mou_good_phase', 'std_ic_t2m_mou_good_phase', 'std_ic_t2t_mou_good_phase', 'std_og_mou_good_phase', 'std_og_t2f_mou_good_phase', 'std_og_t2m_mou_good_phase', 'std_og_t2t_mou_good_phase', 'total_ic_mou_good_phase', 'total_og_mou_good_phase', 'total_rech_num_good_phase', 'vol_2g_mb_good_phase', 'vol_3g_mb_good_phase'], dtype='object')
cols_with_null,_ = check_col_null_pct(telecom_df)
Series([], dtype: float64)
telecom_df = telecom_df.drop('mobile_number', axis=1)
Exploratory Data Analysis¶
Univariate analysis¶
# Histplot analysis for all the numeric columns
cols = telecom_df.select_dtypes(include=np.number).columns
fig, axs = plt.subplots(int(np.ceil(len(cols)/10)),10, figsize=(30, int(np.ceil(len(cols)/10))*2))
for idx, col in enumerate(cols):
t1 = axs.flatten()[idx]
# Boxplot analysis for all the numeric columns
cols = telecom_df.select_dtypes(include=np.number).columns
fig, axs = plt.subplots(int(np.ceil(len(cols)/8)),8, figsize=(30, int(np.ceil(len(cols)/8))*3))
for idx, col in enumerate(cols):
t1 = axs.flatten()[idx]
sns.boxplot(telecom_df[col], orient='vert', palette='viridis', ax=t1)
# Remove Outlier
per = telecom_df['aon'].quantile([0.05,0.95]).values
telecom_df['aon'][telecom_df['aon'] >= per[1]] = per[1]
Bivariate Analysis¶
cls = ['total_ic_mou_action_phase','date_of_last_rech_action_phase','last_day_rch_amt_action_phase','roam_ic_mou_action_phase', 'roam_og_mou_action_phase',
sns.pairplot(telecom_df[cls], palette='viridis');
# kdeplot analysis for all the numeric columns
cols = telecom_df.select_dtypes(include=np.number).columns
fig, axs = plt.subplots(int(np.ceil(len(cols)/6)),6, figsize=(30, int(np.ceil(len(cols)/6))*3))
for idx, col in enumerate(cols):
t1 = axs.flatten()[idx]
sns.kdeplot(telecom_df, x=col, hue='churn', palette='viridis', ax=t1)
Observations:AON : New users are high in number ARPU : Avergae revenue is sigma curve and around 700 is mid for date_of _last_rech_action_phase, date_of_last_rech_good_phase: Number of recharges are higher towards end of the month for isd_ic_mou, isd_og_mou: ISD calls are less in number, For many users the call duration is nearly 0 last_day_rch_amt : Mostly people recharge for 100rs on last day loc_ic_mou_action_phase : Avg local call durations is 100mins for majority of the people max_rech_amt_action_phase : Majority of the customers recharge for 100-150 rs during action phase monthly_2g_action_phase : 2g usage for majority of the customers is very low monthly_3g_action_phase : 3g usage for majority of the customers is very low Local calls within operator are high in numbers, than calls to other operators Roaming usage is also low ¶
Observations:Churn rate decreases significantly over the age of customers Chrun rate is very less for customers having arpu more than 1000 Churn rate is more for users doing recharge in 22-28 date last_day_rch_amt_action_phase : Churn rate is higher among customers who do recharge for lesser amount Churn rate is higher among customers whose consumption of services like incoming calls, outgoing calls are low max_rech_amt_action_phase : For lower recharge amount churn rate is higher total_rech_amt_action_phase : For lower recharge amount churn rate is higher date_of_last_rech_good_phase : Churn rate pattern is different than action phase last_day_rch_amt_good_phase : Pattern is similar to that of action phase
Multivariate analysis¶
# Check the Heatmap for numerical columns
corr_df = telecom_df[telecom_df.select_dtypes(include=np.number).columns].corr()
mask = np.zeros_like(corr_df)
mask[np.triu_indices_from(mask)] = True
fig, axs = plt.subplots(figsize=(20,10))
with sns.axes_style('white'):
axs = sns.heatmap(corr_df, mask=mask, square=True)
# Find top 20 correlations
corr_df = telecom_df.corr(numeric_only=True).abs()
corr_df = corr_df.unstack()
correlation = corr_df.sort_values()
correlation = corr_df.dropna()
correlation = correlation [correlation != 1.0]
correlation = correlation .reset_index()
correlation.sort_values(by=0, ascending=False).head(20)
level_0 | level_1 | 0 | |
3868 | arpu_good_phase | total_rech_amt_good_phase | 0.974 |
3699 | total_rech_amt_good_phase | arpu_good_phase | 0.974 |
3316 | total_rech_amt_action_phase | arpu_action_phase | 0.954 |
123 | arpu_action_phase | total_rech_amt_action_phase | 0.954 |
558 | isd_og_mou_action_phase | isd_og_mou_good_phase | 0.949 |
4171 | isd_og_mou_good_phase | isd_og_mou_action_phase | 0.949 |
6937 | total_ic_mou_good_phase | loc_ic_mou_good_phase | 0.892 |
4500 | loc_ic_mou_good_phase | total_ic_mou_good_phase | 0.892 |
3153 | total_ic_mou_action_phase | loc_ic_mou_action_phase | 0.887 |
716 | loc_ic_mou_action_phase | total_ic_mou_action_phase | 0.887 |
1905 | onnet_mou_action_phase | std_og_t2t_mou_action_phase | 0.857 |
3082 | std_og_t2t_mou_action_phase | onnet_mou_action_phase | 0.857 |
858 | loc_ic_t2m_mou_action_phase | loc_ic_mou_action_phase | 0.852 |
689 | loc_ic_mou_action_phase | loc_ic_t2m_mou_action_phase | 0.852 |
1734 | offnet_mou_action_phase | std_og_t2m_mou_action_phase | 0.850 |
2995 | std_og_t2m_mou_action_phase | offnet_mou_action_phase | 0.850 |
5689 | onnet_mou_good_phase | std_og_t2t_mou_good_phase | 0.847 |
6866 | std_og_t2t_mou_good_phase | onnet_mou_good_phase | 0.847 |
4473 | loc_ic_mou_good_phase | loc_ic_t2m_mou_good_phase | 0.846 |
4642 | loc_ic_t2m_mou_good_phase | loc_ic_mou_good_phase | 0.846 |
Top 5 Highly correlated features are: ¶
Column A | column B | |
arpu_good_phase | total_rech_amt_good_phase | 0.974 |
arpu_action_phase | total_rech_amt_action_phase | 0.954 |
isd_og_mou_action_phase | isd_og_mou_good_phase | 0.949 |
total_ic_mou_good_phase | loc_ic_mou_good_phase | 0.892 |
total_ic_mou_action_phase | loc_ic_mou_action_phase | 0.887 |
# evaluate the stats for all numeric columns
Train and Test Split¶
X = telecom_df.drop('churn', axis=1)
y = telecom_df['churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, stratify=y, random_state=100)
scaler = StandardScaler()
X_train[X_train.columns] = scaler.fit_transform(X_train, y_train)
X_test[X_test.columns] = scaler.transform(X_test)
Class Imbalance¶
# telecom_df.to_csv('test.csv')
There is an imbalance in the dataset. With only 8% representing the class label 1 (minority class).
X_train_ori, y_train_ori = X_train.copy(), y_train.copy()
Helper Function for Class Imbalance¶
# Helper function to call all the Imbalance techniques.
def handle_imbalance(X_tr, y_tr, technique='oversampling', random_state=100):
if technique == 'undersampling':
under_sample = RandomUnderSampler(random_state=random_state, sampling_strategy='majority')
# print(under_sample.get_params())
# print(under_sample._sampling_strategy_docstring)
X_train_udr, y_train_udr = under_sample.fit_resample(X_tr, y_tr)
return X_train_udr, y_train_udr
elif technique == 'tomek_links':
tomek_sample = TomekLinks()
# print(tomek_sample.get_params())
# print(tomek_sample._sampling_strategy_docstring)
X_train_tomek, y_train_tomek = tomek_sample.fit_resample(X_tr, y_tr)
return X_train_tomek, y_train_tomek
elif technique == 'oversampling':
over_sample = RandomOverSampler(random_state=random_state)
# print(over_sample.get_params())
# print(over_sample._sampling_strategy_docstring)
X_train_ovr, y_train_ovr = over_sample.fit_resample(X_tr, y_tr)
return X_train_ovr, y_train_ovr
elif technique == 'smote':
smote_sample = SMOTE(random_state=random_state, k_neighbors=5)
# print(smote_sample.get_params())
# print(smote_sample._sampling_strategy_docstring)
X_train_smote, y_train_smote = smote_sample.fit_resample(X_tr, y_tr)
return X_train_smote, y_train_smote
elif technique == 'adasyn':
adasyn_sample = ADASYN(random_state=random_state, n_neighbors=5)
# print(adasyn_sample.get_params())
# print(adasyn_sample._sampling_strategy_docstring)
X_train_adasyn, y_train_adasyn = adasyn_sample.fit_resample(X_tr, y_tr)
return X_train_adasyn, y_train_adasyn
elif technique == 'smote_tomek':
smote_tomek_sample = SMOTETomek(random_state=random_state)
# print(smote_tomek_sample.get_params())
X_train_smote_tomek, y_train_smote_tomek = smote_tomek_sample.fit_resample(X_tr, y_tr)
return X_train_smote_tomek, y_train_smote_tomek
Class Imbalance using Oversampling¶
X_train_ovr, y_train_ovr = handle_imbalance(X_train_ori, y_train_ori, technique='oversampling', random_state=100)
[(0, 19153), (1, 19153)]
Class Imbalance using SMOTE¶
X_train_smote, y_train_smote = handle_imbalance(X_train_ori, y_train_ori, technique='smote', random_state=100)
[(0, 19153), (1, 19153)]
Class Imbalance using ADASYN¶
X_train_adasyn, y_train_adasyn = handle_imbalance(X_train_ori, y_train_ori, technique='adasyn', random_state=100)
[(0, 19153), (1, 19113)]
Principal Component Analysis¶
def principal_component_analysis(X_tr, random_state=100):
pca = PCA(random_state=random_state)
var_ratio_df = pd.DataFrame({'feature':X_tr.columns, 'var_ratio': pca.explained_variance_ratio_})
components = pd.DataFrame(pca.components_)
var_ratio_df = pd.concat([var_ratio_df,components],axis=1)
fig, axs = plt.subplots(1,3, figsize=(25,5))
axs[0].bar(range(1,len(pca.explained_variance_ratio_)+1), pca.explained_variance_ratio_)
axs[0].set_xlabel('Component number')
axs[0].set_ylabel('Explained variance ratio')
axs[0].set_title('Bar plot')
axs[1].set_xlabel('Component number')
axs[1].set_ylabel('Explained variance ratio')
axs[1].set_title('Scree plot')
var_cumu = np.cumsum(pca.explained_variance_ratio_)
axs[2].vlines(x=47, ymax=1, ymin=0, colors="r", linestyles="--")
axs[2].hlines(y=0.95, xmax=100, xmin=0, colors="g", linestyles="--")
axs[2].set_ylabel("Cumulative variance explained")
axs[2].set_xlabel('Component number');
def incremental_pca(X_tr, X_te, n_components=45):
pca_incremental = IncrementalPCA(n_components, )
principal_components = pca_incremental.components_
feature_names = X_tr.columns
# Calculate the overall importance of each feature
feature_importance = {}
for i, component in enumerate(principal_components):
for j, weight in enumerate(component):
if feature_names[j] not in feature_importance:
feature_importance[feature_names[j]] = 0.0
feature_importance[feature_names[j]] += abs(weight)
# Sort features by their overall importance
sorted_feature_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
feature_lst = []
for feature, importance in sorted_feature_importance[:10]:
feature_lst.append(f"{feature}: {importance}")
df_tr_pca = pca_incremental.fit_transform(X_tr)
df_te_pca = pca_incremental.transform(X_te)
return df_tr_pca, df_te_pca, feature_names, feature_lst
Apply PCA (Dimensionality Reduction) on dataset generated by multiple class imbalance techniques¶
print("PCA using Random Oversampling")
X_train , y_train = X_train_ovr, y_train_ovr
principal_component_analysis(X_train, random_state=100)
df_train_ovr_pca, df_test_ovr_pca, feature_names, feature_lst = incremental_pca(X_train, X_test, n_components=45)
print("PCA using SMOTE")
X_train , y_train = X_train_smote, y_train_smote
principal_component_analysis(X_train, random_state=100)
df_train_smote_pca, df_test_smote_pca, feature_names, feature_lst = incremental_pca(X_train, X_test, n_components=45)
print("PCA using ADASYN")
X_train , y_train = X_train_adasyn, y_train_adasyn
principal_component_analysis(X_train, random_state=100)
df_train_adasyn_pca, df_test_adasyn_pca, feature_names, feature_lst = incremental_pca(X_train, X_test, n_components=45)
PCA using Random Oversampling
We applied PCA(dimensionality reduction) over various classing balance techniques and found out that, there is not much difference in the final outcome (reduction).
Machine Learning - Algorithms and Models¶
Custom Functions for Model Building¶
# resusable function to train using logistic regression Model
def model_training(fXt, fyt, fcutoff, ftest=False, fres=None):
fXt_sm = sm.add_constant(fXt)
if ftest == False:
lrm = sm.GLM(fyt, fXt_sm, family=sm.families.Binomial())
lrm =
fyt_pred = lrm.predict(fXt_sm)
lrm = fres
fyt_pred = lrm.predict(fXt_sm)
fyt_pred = fyt_pred.values.reshape(-1)
fyt_pred_final = pd.DataFrame({'Converted': fyt.values, 'Conv_Prob': fyt_pred})
fyt_pred_final['ID'] = fyt.index
fyt_pred_final['predicted'] = x: 1 if x > fcutoff else 0)
return lrm, fyt_pred, fyt_pred_final
# Metrics for logistic regression
def logreg_metrics_fn(fyt_pred_final):
fconfusion = confusion_matrix(fyt_pred_final.Converted, fyt_pred_final.predicted )
faccuracy = accuracy_score(fyt_pred_final.Converted, fyt_pred_final.predicted)
TP = fconfusion[1,1] # true positive
TN = fconfusion[0,0] # true negatives
FP = fconfusion[0,1] # false positives
FN = fconfusion[1,0] # false negatives
fSensi = TP/(TP+FN) # Calculate the sensitivity
fSpeci = TN/(TN+FP) # Calculate the specificity
fPreci = TP/(TP+FP) # Calculate Precision
fRecal = TP/(TP+FN) # Calculate Recall
return fconfusion, faccuracy, fSensi, fSpeci, fPreci, fRecal
# Function to generate VIF of scores.
def get_vif_score(fXt, cl):
vif = pd.DataFrame()
vif['Features'] = fXt[cl].columns
vif['VIF'] = [variance_inflation_factor(fXt[cl].values, i) for i in range(fXt[cl].shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
return vif
# A common function to generate most imp metrics for all the classification algorithms.
def generate_metrics(yt, yt_pred, yt_prob ):
f1_sc = f1_score(yt, yt_pred, )
recall = recall_score(yt, yt_pred, )
roc_score = roc_auc_score(yt, yt_prob)
conf_mat = confusion_matrix(yt, yt_pred)
accuracy = accuracy_score(yt, yt_pred, )
precision = precision_score(yt, yt_pred, )
cl_rep = classification_report(yt, yt_pred)
df = pd.DataFrame({"accuracy":[accuracy],"roc_score":[roc_score],"precision":[precision],"recall":[recall],"f1_score":[f1_sc],
"classification_report":[cl_rep], "confusion_matrix": [conf_mat]})
return df, accuracy,roc_score, precision,recall, f1_sc, cl_rep, conf_mat
# Function to generate final summary report comprising of all the models and its metrics scores
def generate_summary_report(df=None, model_name="", class_imb='', train_accuracy="", test_accuracy="", roc_score="", precision="", recall="", f1_score="", classification_rep="", conf_matrix="", feature_imp="", step='create'):
if step == 'create':
df = pd.DataFrame(columns=["model_name","class_imb","train_accuracy","test_accuracy","roc_score","precision","recall","f1_score", "classification_report", "confusion_matrix", "feature_imp"])
return df
elif step == 'add':
df.loc[len(df)] = pd.Series({"model_name": model_name,"class_imb":class_imb,"train_accuracy": train_accuracy, "test_accuracy": test_accuracy,"roc_score": roc_score,"precision": precision,"recall": recall,"f1_score":f1_score, "classification_report": classification_rep, "confusion_matrix": conf_matrix, "feature_imp":feature_imp})
return df
Logistic Regression - RFE¶
Model Building¶
X_train, y_train = X_train_ovr, y_train_ovr
logreg = LogisticRegression()
logreg =,y_train)
# Find the optimum no of top features and Its contribution to the overall score using the RFECV method.
rfecv = RFECV(estimator=logreg, cv=5), y_train)
RFECV(cv=5, estimator=LogisticRegression())In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with
RFECV(cv=5, estimator=LogisticRegression())
plt.figure(figsize=[30, 5])
plt.plot(range(1, len(X_train.columns)+1), rfecv.cv_results_['mean_test_score'])
From the chart we can observe that, the optimum no of features that contribute to overall score is 15 - 20 after which the graph Flattens out.
rfe = RFE(estimator=logreg, n_features_to_select= 15)
rfe =, y_train)
cols = rfe.get_feature_names_out()
rfe_df = pd.DataFrame({'feature':X_train.columns, 'rank': rfe.ranking_ , 'support': rfe.support_})
rfe_df.sort_values(by='rank', ascending=True).head(15)
feature | rank | support | |
47 | isd_ic_mou_good_phase | 1 | True |
63 | offnet_mou_good_phase | 1 | True |
37 | total_ic_mou_action_phase | 1 | True |
12 | loc_og_mou_action_phase | 1 | True |
72 | std_ic_mou_good_phase | 1 | True |
38 | total_og_mou_action_phase | 1 | True |
27 | spl_ic_mou_action_phase | 1 | True |
65 | onnet_mou_good_phase | 1 | True |
8 | loc_ic_mou_action_phase | 1 | True |
80 | total_ic_mou_good_phase | 1 | True |
5 | isd_ic_mou_action_phase | 1 | True |
19 | monthly_3g_action_phase | 1 | True |
81 | total_og_mou_good_phase | 1 | True |
51 | loc_ic_mou_good_phase | 1 | True |
29 | std_ic_mou_action_phase | 1 | True |
logreg = sm.GLM(y_train, sm.add_constant(X_train), family=sm.families.Binomial())
logreg =
# logreg.summary()
Logreg Model 1¶
logreg1, y_train_pred, y_train_pred_final = model_training(X_train[cols], y_train, 0.5)
vif_scores = get_vif_score(X_train, cols)
vif_scores[vif_scores['VIF'] > 5].head()
total_ic_mou_good_phase 0.000 std_ic_mou_good_phase 0.000 isd_ic_mou_good_phase 0.000 loc_ic_mou_good_phase 0.000 total_og_mou_good_phase 0.000 dtype: float64
Features | VIF | |
13 | total_ic_mou_good_phase | 442.120 |
9 | loc_ic_mou_good_phase | 327.360 |
6 | total_ic_mou_action_phase | 97.370 |
12 | std_ic_mou_good_phase | 70.810 |
1 | loc_ic_mou_action_phase | 68.930 |
Logreg Model 2¶
# 'total_ic_mou_good_phase' feature has very high VIF score. Hence we drop it.
cols = list(cols)
logreg2, y_train_pred, y_train_pred_final = model_training(X_train[cols], y_train, 0.5)
vif_scores = get_vif_score(X_train,cols)
vif_scores[vif_scores['VIF'] > 5].head()
std_ic_mou_good_phase 0.000 isd_ic_mou_good_phase 0.000 total_og_mou_good_phase 0.000 loc_ic_mou_good_phase 0.000 spl_ic_mou_action_phase 0.000 dtype: float64
Features | VIF | |
6 | total_ic_mou_action_phase | 81.080 |
1 | loc_ic_mou_action_phase | 58.200 |
13 | total_og_mou_good_phase | 26.800 |
10 | offnet_mou_good_phase | 14.800 |
11 | onnet_mou_good_phase | 14.070 |
Logreg Model 3¶
# 'total_ic_mou_action_phase' feature has very high VIF score. Hence we drop it.
logreg3, y_train_pred, y_train_pred_final = model_training(X_train[cols], y_train, 0.5)
vif_scores = get_vif_score(X_train,cols)
vif_scores[vif_scores['VIF'] > 5].head()
std_ic_mou_good_phase 0.000 isd_ic_mou_good_phase 0.000 isd_ic_mou_action_phase 0.000 std_ic_mou_action_phase 0.000 loc_ic_mou_good_phase 0.000 dtype: float64
Features | VIF | |
12 | total_og_mou_good_phase | 26.790 |
9 | offnet_mou_good_phase | 14.770 |
10 | onnet_mou_good_phase | 14.050 |
Logreg Model 4¶
# 'total_og_mou_good_phase' feature has very high VIF score. Hence we drop it.
logreg4, y_train_pred, y_train_pred_final = model_training(X_train[cols], y_train, 0.5)
vif_scores = get_vif_score(X_train,cols)
vif_scores[vif_scores['VIF'] > 5].head()
std_ic_mou_good_phase 0.000 isd_ic_mou_good_phase 0.000 isd_ic_mou_action_phase 0.000 std_ic_mou_action_phase 0.000 loc_ic_mou_good_phase 0.000 dtype: float64
Features | VIF |
Logreg Model 5¶
# Important features contributing to the model.
feature_imp = (logreg4.params.sort_values(ascending=False, key=abs)).reset_index()
feature_imp = feature_imp.rename(columns={'index':'columns', 0:'imp_score'})
columns | imp_score | |
0 | total_og_mou_action_phase | -1.225 |
1 | const | -1.202 |
2 | loc_ic_mou_action_phase | -1.126 |
3 | loc_og_mou_action_phase | -0.910 |
4 | onnet_mou_good_phase | 0.608 |
5 | spl_ic_mou_action_phase | -0.602 |
6 | offnet_mou_good_phase | 0.566 |
7 | monthly_3g_action_phase | -0.501 |
8 | std_ic_mou_action_phase | -0.420 |
9 | isd_ic_mou_action_phase | -0.395 |
10 | loc_ic_mou_good_phase | 0.338 |
11 | isd_ic_mou_good_phase | 0.245 |
12 | std_ic_mou_good_phase | 0.142 |
Accuracy, Sensitivity, Specificity, Precision, Recall¶
# Let's create columns with different probability cutoffs
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
y_train_pred_final[i]= x: 1 if x > i else 0)
# calculate accuracy sensitivity and specificity for various probability cutoffs.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci', 'preci', 'recall'])
for i in numbers:
cm1 = confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i] )
accuracy = (cm1[0,0]+cm1[1,1])/total1
speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
preci = cm1[1,1]/(cm1[0,1]+cm1[1,1])
recall = cm1[1,1]/(cm1[1,0]+cm1[1,1])
cutoff_df.loc[i] = [i, accuracy, sensi, speci, preci, recall]
ROC Curve Precision Recall curve¶
fig, axs = plt.subplots(1,3, figsize=(25,6))
# plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'], ax=axs[0])
axs[0].set_title('Accuracy Sensitivity Specificity')
axs[0].vlines( ymin=0, ymax=0.9,x=0.537, color="r")
# Plot ROC curve and check the area under curve
RocCurveDisplay.from_predictions(y_train_pred_final.Converted, y_train_pred_final.Conv_Prob, drop_intermediate=False, ax=axs[1])
axs[1].set_title('ROC curve')
# plot Precision and Recall curve and find the optimal cutoff
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Conv_Prob)
axs[2].plot(thresholds, p[:-1], "b")
axs[2].plot(thresholds, r[:-1], "r")
axs[2].set_title('Precision Recall Curve')
axs[2].vlines( ymin=0, ymax=0.9,x=0.54, color="r");
# the cutoff value from accuracy sensitivity and specificity curve is 0.537
# Apply the cutoff value to see the prediction on train dataset.
y_train_pred_final['final_predicted'] = lambda x: 1 if x > 0.537 else 0)
# Let's check the overall Metrics.
cf_matrix, train_accuracy, sensitivity, specificity, precision, recall = logreg_metrics_fn(y_train_pred_final)
print(f'Train Accuracy - {round(train_accuracy,3)}\nSensitivity - {round(sensitivity,3)}\nspecificity - {round(specificity,3)}\nPrecision - {round(precision,3)}\nRecall - {round(recall,3)}')
# the cutoff value from Precision - Recall curve is 0.54
# Apply the cutoff value to see the prediction on train dataset.
y_train_pred_final['final_predicted'] = lambda x: 1 if x > 0.54 else 0)
# Let's check the overall metrics.
cf_matrix, train_accuracy,sensitivity, specificity, precision, recall = logreg_metrics_fn(y_train_pred_final)
print(f'Train Accuracy - {round(train_accuracy,3)}\nSensitivity - {round(sensitivity,3)}\nspecificity - {round(specificity,3)}\nPrecision - {round(precision,3)}\nRecall - {round(recall,3)}')
Model Testing¶
# Apply the trained Model on the test set
logregp, y_test_pred, y_test_pred_final = model_training(X_test[cols], y_test, 0.54, True, logreg4)
# check the overall metrics for test set
cf_matrix, test_accuracy, sensitivity, specificity, precision, recall = logreg_metrics_fn(y_test_pred_final)
roc_score = roc_auc_score( y_test_pred_final.Converted, y_test_pred_final.Conv_Prob )
f1_sc = f1_score(y_test, y_test_pred_final.predicted)
class_report = classification_report(y_test, y_test_pred_final.predicted)
print(f'Test Accuracy - {round(test_accuracy,3)}\nROC Score - {round(roc_score,3)}\nSensitivity - {round(sensitivity,3)}\nSpecificity - {round(specificity,3)}\nPrecision - {round(precision,3)}\nRecall - {round(recall,3)}')
overall_summary_df = generate_summary_report()
overall_summary_df = generate_summary_report(df=overall_summary_df, model_name="LogisticRegression",class_imb='oversampling', train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score,
precision=precision, recall=recall, f1_score=f1_sc, classification_rep=class_report, conf_matrix=cf_matrix, feature_imp=feature_imp, step='add')
Test Accuracy - 0.819 ROC Score - 0.872 Sensitivity - 0.821 Specificity - 0.819 Precision - 0.305 Recall - 0.821
Logistic Regression - PCA¶
# LogisticRegression after PCA and Oversampling
# Check the overall metrics after performing LogisticRegressionCV
def logistic_regression(df_train_pca, df_test_pca, y_train, y_test, overall_summary_df):
lr_pca = LogisticRegressionCV(cv=10, random_state=100), y_train)
lr_pca_feature_importances = lr_pca.coef_.reshape(-1)
lr_pca_feature_imp_df = pd.DataFrame({'principal_component': range(len(lr_pca_feature_importances)), 'imp_score': lr_pca_feature_importances})
lr_pca_ytrain_prob = lr_pca.predict_proba(df_train_pca)[:,1]
lr_pca_ytrain_pred = lr_pca.predict(df_train_pca)
lr_pca_ytest_prob = lr_pca.predict_proba(df_test_pca)[:,1]
lr_pca_ytest_pred = lr_pca.predict(df_test_pca)
_, train_accuracy, _, _, _, _, _, _ = generate_metrics(y_train, yt_pred=lr_pca_ytrain_pred, yt_prob=lr_pca_ytrain_prob)
metrics_df, test_accuracy, roc_score, precision,recall, f1_sc, class_report, conf_matrix = generate_metrics(y_test, yt_pred=lr_pca_ytest_pred, yt_prob=lr_pca_ytest_prob)
overall_summary_df = generate_summary_report(df=overall_summary_df, model_name="LogisticRegressionPCA",class_imb='oversampling', train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score,
precision=precision, recall=recall, f1_score=f1_sc, classification_rep=class_report, conf_matrix=conf_matrix, feature_imp=lr_pca_feature_imp_df, step='add')
X_train, y_train = X_train_ovr, y_train_ovr
logistic_regression(df_train_ovr_pca, df_test_ovr_pca, y_train, y_test, overall_summary_df = overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.827 0.892 0.318 0.838 0.461 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[6782, 1428], [129, 665]]
Custom Function¶
# Reusable function to perform Decision Tree classification and check the overall performance of the model
def decision_tree_classifier(X_train, y_train, X_test, y_test, technique, overall_summary_df):
dtc = DecisionTreeClassifier(random_state=100, max_depth=6)
dtc =, y_train)
# Get feature importance
dtc_feature_importances = dtc.feature_importances_
if X_train.shape[1] == len(X.columns):
dtc_feature_imp_df = pd.DataFrame({'columns':X.columns, 'imp_score': dtc_feature_importances})
plot_tree(dtc, feature_names=list(X_train.columns), class_names=['No Churn', 'Churn'], filled=True, fontsize=6);
dtc_feature_imp_df = pd.DataFrame({'principal_component': range(len(dtc_feature_importances)), 'imp_score': dtc_feature_importances})
dtc_feature_imp_df = dtc_feature_imp_df.sort_values(by='imp_score', ascending=False).head(10)
# Predict on train set
y_train_prob_dtc = dtc.predict_proba(X_train)[:, 1]
y_train_pred_dtc = dtc.predict(X_train)
# Predict on test set
y_test_prob_dtc = dtc.predict_proba(X_test)[:, 1]
y_test_pred_dtc = dtc.predict(X_test)
# Check overall performance of the model using various metrics
_, train_accuracy, _, _, _, _, _, _ = generate_metrics(y_train, yt_pred=y_train_pred_dtc, yt_prob=y_train_prob_dtc)
metrics_df, test_accuracy, roc_score, precision,recall, f1_sc, class_report, conf_matrix = generate_metrics(y_test, yt_pred=y_test_pred_dtc, yt_prob=y_test_prob_dtc)
overall_summary_df = generate_summary_report(df=overall_summary_df, model_name=model_name, class_imb=technique, train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score,
precision=precision, recall=recall, f1_score=f1_sc, classification_rep =class_report, conf_matrix=conf_matrix, feature_imp=dtc_feature_imp_df, step='add')
DecisionTree - Oversampling¶
# Decision Tree after performing oversampling.
X_train, y_train = X_train_ovr, y_train_ovr
decision_tree_classifier(X_train, y_train, X_test, y_test, technique='oversampling', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.854 0.902 0.359 0.831 0.501 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7029, 1181], [134, 660]]
Decision Tree - SMOTE¶
# Decision Tree after performing SMOTE.
X_train, y_train = X_train_smote, y_train_smote
decision_tree_classifier(X_train, y_train, X_test, y_test, technique='smote', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.895 0.911 0.443 0.761 0.560 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7452, 758], [190, 604]]
# Decision Tree after performing ADASYN.
X_train, y_train = X_train_adasyn, y_train_adasyn
decision_tree_classifier(X_train, y_train, X_test, y_test, technique='adasyn', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.871 0.898 0.388 0.798 0.522 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7208, 1002], [160, 634]]
Decision Tree - PCA¶
# Decision Tree after performing PCA and Oversampling.
X_train, y_train = X_train_ovr, y_train_ovr
decision_tree_classifier(df_train_ovr_pca, y_train, df_test_ovr_pca, y_test, technique='oversampling', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.820 0.846 0.300 0.781 0.433 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[6760, 1450], [174, 620]]
Custom Function¶
# Reusable function to perform RandomForestClassifier and check the overall performance of the model
def random_forest_classifier(X_train, y_train, X_test, y_test, technique, overall_summary_df):
rf = RandomForestClassifier(random_state=100, oob_score=True, max_depth=6), y_train)
# Get feature importance
rf_feature_importances = rf.feature_importances_
if X_train.shape[1] == len(X.columns):
rf_feature_imp_df = pd.DataFrame({'columns':X.columns, 'imp_score': rf_feature_importances})
rf_feature_imp_df = pd.DataFrame({'principal_component': range(len(rf_feature_importances)), 'imp_score': rf_feature_importances})
rf_feature_imp_df = rf_feature_imp_df.sort_values(by='imp_score', ascending=False).head(10)
# Predict on Train set
rf_ytrain_prob = rf.predict_proba(X_train)[:, 1]
rf_ytrain_pred = rf.predict(X_train)
# PRedict on Test set
rf_ytest_prob = rf.predict_proba(X_test)[:, 1]
rf_ytest_pred = rf.predict(X_test)
# Check overall performance of the model using various metrics.
_, train_accuracy, _, _, _, _, _, _ = generate_metrics(y_train, yt_pred=rf_ytrain_pred, yt_prob=rf_ytrain_prob)
metrics_df, test_accuracy, roc_score, precision,recall, f1_sc, class_report, conf_matrix = generate_metrics(y_test, yt_pred=rf_ytest_pred, yt_prob=rf_ytest_prob)
overall_summary_df = generate_summary_report(df=overall_summary_df, model_name=model_name,class_imb=technique, train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score,
precision=precision, recall=recall, f1_score=f1_sc, classification_rep=class_report, conf_matrix=conf_matrix, feature_imp=rf_feature_imp_df, step='add')
Random Forest - Oversampling¶
# Perform random forest after oversampling.
X_train, y_train = X_train_ovr, y_train_ovr
random_forest_classifier(X_train, y_train, X_test, y_test, technique='oversampling', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.907 0.934 0.482 0.792 0.600 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7535, 675], [165, 629]]
Random Forest - SMOTE¶
# Perform random forest after. SMOTE
X_train, y_train = X_train_smote, y_train_smote
random_forest_classifier(X_train, y_train, X_test, y_test, technique='smote', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.903 0.931 0.471 0.802 0.594 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7495, 715], [157, 637]]
Random Forest - ADASYN¶
# Perform random forest after. ADASYN
X_train, y_train = X_train_adasyn, y_train_adasyn
random_forest_classifier(X_train, y_train, X_test, y_test, technique='adasyn', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.888 0.932 0.429 0.817 0.563 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7347, 863], [145, 649]]
Random Forest - PCA¶
# Perform random forest after PCA and Oversampling
X_train, y_train = X_train_ovr, y_train_ovr
random_forest_classifier(df_train_ovr_pca, y_train, df_test_ovr_pca, y_test, technique='oversampling', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.874 0.893 0.391 0.768 0.518 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7260, 950], [184, 610]]
Custom Function¶
# Reusable function to perform GradientBoostingClassifier and check the overall performance of the model
def gradient_boosting_classifier(X_train, y_train, X_test, y_test, technique, overall_summary_df):
gbc = GradientBoostingClassifier(random_state=100, max_depth=6), y_train)
# Get feature importance
gbc_feature_importances = gbc.feature_importances_
if X_train.shape[1] == len(X.columns):
gbc_feature_imp_df = pd.DataFrame({'columns':X.columns, 'imp_score': gbc_feature_importances})
gbc_feature_imp_df = pd.DataFrame({'principal_component': range(len(gbc_feature_importances)), 'imp_score': gbc_feature_importances})
gbc_feature_imp_df = gbc_feature_imp_df.sort_values(by='imp_score', ascending=False).head(10)
# Predict on train set.
gbc_ytrain_prob = gbc.predict_proba(X_train)[:, 1]
gbc_ytrain_pred = gbc.predict(X_train)
# Reddit on test set.
gbc_ytest_prob = gbc.predict_proba(X_test)[:, 1]
gbc_ytest_pred = gbc.predict(X_test)
cvs = cross_val_score(gbc, X_test, y_test, cv=3, scoring='roc_auc')
print(f'cross_val_score is : {cvs}')
# Check overall performance of the model using various metrics.
_, train_accuracy, _, _, _, _, _, _ = generate_metrics(y_train, yt_pred=gbc_ytrain_pred, yt_prob=gbc_ytrain_prob)
metrics_df, test_accuracy, roc_score, precision,recall, f1_sc, class_report, conf_matrix = generate_metrics(y_test, yt_pred=gbc_ytest_pred, yt_prob=gbc_ytest_prob)
overall_summary_df = generate_summary_report(df=overall_summary_df, model_name=model_name, class_imb=technique, train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score,
precision=precision, recall=recall, f1_score=f1_sc, classification_rep=class_report, conf_matrix=conf_matrix, feature_imp=gbc_feature_imp_df, step='add')
Gradient Boosting - Oversampling¶
# Gradient boosting after oversampling
X_train, y_train = X_train_ovr, y_train_ovr
gradient_boosting_classifier(X_train, y_train, X_test, y_test, technique='oversampling', overall_summary_df=overall_summary_df)
cross_val_score is : [0.93851828 0.94362607 0.94864421] accuracy roc_score precision recall f1_score \ 0 0.928 0.945 0.567 0.788 0.660 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7732, 478], [168, 626]]
Gradient Boosting - SMOTE¶
# Gradient boosting after SMOTE
X_train, y_train = X_train_smote, y_train_smote
gradient_boosting_classifier(X_train, y_train, X_test, y_test, technique='smote', overall_summary_df=overall_summary_df)
cross_val_score is : [0.93851828 0.94362607 0.94864421] accuracy roc_score precision recall f1_score \ 0 0.936 0.942 0.623 0.685 0.653 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7881, 329], [250, 544]]
Gradient Boosting - ADASYN¶
# Gradient boosting after ADAsyn
X_train, y_train = X_train_adasyn, y_train_adasyn
gradient_boosting_classifier(X_train, y_train, X_test, y_test, technique='adasyn', overall_summary_df=overall_summary_df)
cross_val_score is : [0.93851828 0.94362607 0.94864421] accuracy roc_score precision recall f1_score \ 0 0.934 0.940 0.614 0.689 0.649 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7866, 344], [247, 547]]
Gradient Boosting - PCA¶
# Gradient boosting after PCA and oversampling
X_train, y_train = X_train_ovr, y_train_ovr
gradient_boosting_classifier(df_train_ovr_pca, y_train, df_test_ovr_pca, y_test, technique='oversampling', overall_summary_df=overall_summary_df)
cross_val_score is : [0.89185653 0.88849354 0.89216733] accuracy roc_score precision recall f1_score \ 0 0.903 0.906 0.467 0.686 0.556 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7589, 621], [249, 545]]
Custom Function¶
# Reusable function to perform XGBClassifier and check the overall performance of the model
def xgb_classifier(X_train, y_train, X_test, y_test, technique, overall_summary_df):
xgb_cfl = xgb.XGBClassifier(scale_pos_weight= 1, objective = 'binary:logistic', random_state= 100, max_depth=6), y_train)
# Get feature importance
xgb_feature_importances = xgb_cfl.feature_importances_
if X_train.shape[1] == len(X.columns):
xgb_feature_imp_df = pd.DataFrame({'columns':X.columns, 'imp_score': xgb_feature_importances})
xgb_feature_imp_df = pd.DataFrame({'principal_component': range(len(xgb_feature_importances)), 'imp_score': xgb_feature_importances})
xgb_feature_imp_df = xgb_feature_imp_df.sort_values(by='imp_score', ascending=False).head(10)
# Predict on trains set.
xgb_ytrain_prob = xgb_cfl.predict_proba(X_train)[:, 1]
xgb_ytrain_pred = xgb_cfl.predict(X_train)
# Predict on test set.
xgb_ytest_prob = xgb_cfl.predict_proba(X_test)[:, 1]
xgb_ytest_pred = xgb_cfl.predict(X_test)
# Check the overall performance of the model using various metrics.
_, train_accuracy, _, _, _, _, _, _ = generate_metrics(y_train, yt_pred=xgb_ytrain_pred, yt_prob=xgb_ytrain_prob)
metrics_df, test_accuracy, roc_score, precision,recall, f1_sc, class_report, conf_matrix = generate_metrics(y_test, yt_pred=xgb_ytest_pred, yt_prob=xgb_ytest_prob)
overall_summary_df = generate_summary_report(df=overall_summary_df, model_name=model_name, class_imb=technique, train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score,
precision=precision, recall=recall, f1_score=f1_sc, classification_rep=class_report, conf_matrix=conf_matrix, feature_imp=xgb_feature_imp_df, step='add')
XGBoost - Oversampling¶
# Perform XGBOOST after oversampling
X_train, y_train = X_train_ovr, y_train_ovr
xgb_classifier(X_train, y_train, X_test, y_test, technique='oversampling', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.938 0.942 0.636 0.688 0.661 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7898, 312], [248, 546]]
XGBoost - SMOTE¶
# Perform XGBOOST after SMOTE
X_train, y_train = X_train_smote, y_train_smote
xgb_classifier(X_train, y_train, X_test, y_test, technique='smote', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.937 0.939 0.651 0.605 0.627 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7953, 257], [314, 480]]
# Perform XGBOOST after ADASYN
X_train, y_train = X_train_adasyn, y_train_adasyn
xgb_classifier(X_train, y_train, X_test, y_test, technique='adasyn', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.936 0.936 0.650 0.602 0.625 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7953, 257], [316, 478]]
XGBoost - PCA¶
# Perform XGBOOST after PCA and oversampling
X_train, y_train = X_train_ovr, y_train_ovr
xgb_classifier(df_train_ovr_pca, y_train, df_test_ovr_pca, y_test, technique='oversampling', overall_summary_df=overall_summary_df)
accuracy roc_score precision recall f1_score \ 0 0.920 0.898 0.547 0.553 0.550 classification_report \ 0 precision recall f1-score ... confusion_matrix 0 [[7847, 363], [355, 439]]
Overall Summary¶
# Evaluate the performance metrics of various Model.
overall_summary_df.sort_values(by=['roc_score', 'test_accuracy', 'train_accuracy'], ascending=[False, True, True])
model_name | class_imb | train_accuracy | test_accuracy | roc_score | precision | recall | f1_score | classification_report | confusion_matrix | feature_imp | |
10 | GradientBoostingClassifier | oversampling | 0.979 | 0.928 | 0.945 | 0.567 | 0.788 | 0.660 | precision recall f1-score ... | [[7732, 478], [168, 626]] | columns imp_score ... |
14 | XGBClassifier | oversampling | 0.996 | 0.938 | 0.942 | 0.636 | 0.688 | 0.661 | precision recall f1-score ... | [[7898, 312], [248, 546]] | columns imp_score ... |
11 | GradientBoostingClassifier | smote | 0.979 | 0.936 | 0.942 | 0.623 | 0.685 | 0.653 | precision recall f1-score ... | [[7881, 329], [250, 544]] | columns imp_score ... |
12 | GradientBoostingClassifier | adasyn | 0.979 | 0.934 | 0.940 | 0.614 | 0.689 | 0.649 | precision recall f1-score ... | [[7866, 344], [247, 547]] | columns imp_score ... |
15 | XGBClassifier | smote | 0.997 | 0.937 | 0.939 | 0.651 | 0.605 | 0.627 | precision recall f1-score ... | [[7953, 257], [314, 480]] | columns imp_score ... |
16 | XGBClassifier | adasyn | 0.997 | 0.936 | 0.936 | 0.650 | 0.602 | 0.625 | precision recall f1-score ... | [[7953, 257], [316, 478]] | columns imp_score ... |
6 | RandomForestClassifier | oversampling | 0.882 | 0.907 | 0.934 | 0.482 | 0.792 | 0.600 | precision recall f1-score ... | [[7535, 675], [165, 629]] | columns imp_score 3... |
8 | RandomForestClassifier | adasyn | 0.894 | 0.888 | 0.932 | 0.429 | 0.817 | 0.563 | precision recall f1-score ... | [[7347, 863], [145, 649]] | columns imp_score ... |
7 | RandomForestClassifier | smote | 0.905 | 0.903 | 0.931 | 0.471 | 0.802 | 0.594 | precision recall f1-score ... | [[7495, 715], [157, 637]] | columns imp_score ... |
3 | DecisionTreeClassifier | smote | 0.908 | 0.895 | 0.911 | 0.443 | 0.761 | 0.560 | precision recall f1-score ... | [[7452, 758], [190, 604]] | columns imp_score ... |
13 | GradientBoostingClassifierPCA | oversampling | 0.965 | 0.903 | 0.906 | 0.467 | 0.686 | 0.556 | precision recall f1-score ... | [[7589, 621], [249, 545]] | principal_component imp_score 8 ... |
2 | DecisionTreeClassifier | oversampling | 0.880 | 0.854 | 0.902 | 0.359 | 0.831 | 0.501 | precision recall f1-score ... | [[7029, 1181], [134, 660]] | columns imp_score ... |
17 | XGBClassifierPCA | oversampling | 0.997 | 0.920 | 0.898 | 0.547 | 0.553 | 0.550 | precision recall f1-score ... | [[7847, 363], [355, 439]] | principal_component imp_score 8 ... |
4 | DecisionTreeClassifier | adasyn | 0.890 | 0.871 | 0.898 | 0.388 | 0.798 | 0.522 | precision recall f1-score ... | [[7208, 1002], [160, 634]] | columns imp_score ... |
9 | RandomForestClassifierPCA | oversampling | 0.855 | 0.874 | 0.893 | 0.391 | 0.768 | 0.518 | precision recall f1-score ... | [[7260, 950], [184, 610]] | principal_component imp_score 8 ... |
1 | LogisticRegressionPCA | oversampling | 0.829 | 0.827 | 0.892 | 0.318 | 0.838 | 0.461 | precision recall f1-score ... | [[6782, 1428], [129, 665]] | principal_component imp_score 0 ... |
0 | LogisticRegression | oversampling | 0.808 | 0.819 | 0.872 | 0.305 | 0.821 | 0.444 | precision recall f1-score ... | [[6722, 1488], [142, 652]] | columns imp_score 0 t... |
5 | DecisionTreeClassifierPCA | oversampling | 0.832 | 0.820 | 0.846 | 0.300 | 0.781 | 0.433 | precision recall f1-score ... | [[6760, 1450], [174, 620]] | principal_component imp_score 8 ... |
overall_feature_imp_df = pd.DataFrame(columns=['model','class_imb','feature', 'imp_score'])
for items in zip(overall_summary_df.model_name, overall_summary_df.feature_imp, overall_summary_df.class_imb):
for val in items[1].values:
overall_feature_imp_df.loc[len(overall_feature_imp_df)] = items[0],items[2],val[0],val[1]
# Imp features of Top two models
overall_feature_imp_df[(overall_feature_imp_df['model'].eq('GradientBoostingClassifier')) & (overall_feature_imp_df['class_imb'].eq('smote')) | (overall_feature_imp_df['model'].eq('XGBClassifier')) & (overall_feature_imp_df['class_imb'].eq('oversampling'))]
model | class_imb | feature | imp_score | |
148 | GradientBoostingClassifier | smote | total_ic_mou_action_phase | 0.409 |
149 | GradientBoostingClassifier | smote | date_of_last_rech_action_phase | 0.143 |
150 | GradientBoostingClassifier | smote | roam_og_mou_action_phase | 0.131 |
151 | GradientBoostingClassifier | smote | last_day_rch_amt_action_phase | 0.053 |
152 | GradientBoostingClassifier | smote | roam_ic_mou_action_phase | 0.026 |
153 | GradientBoostingClassifier | smote | max_rech_amt_action_phase | 0.026 |
154 | GradientBoostingClassifier | smote | total_rech_amt_action_phase | 0.017 |
155 | GradientBoostingClassifier | smote | total_rech_num_good_phase | 0.012 |
156 | GradientBoostingClassifier | smote | aon | 0.008 |
157 | GradientBoostingClassifier | smote | loc_ic_mou_action_phase | 0.008 |
178 | XGBClassifier | oversampling | total_ic_mou_action_phase | 0.229 |
179 | XGBClassifier | oversampling | roam_og_mou_action_phase | 0.047 |
180 | XGBClassifier | oversampling | total_rech_amt_action_phase | 0.035 |
181 | XGBClassifier | oversampling | spl_ic_mou_action_phase | 0.034 |
182 | XGBClassifier | oversampling | last_day_rch_amt_action_phase | 0.034 |
183 | XGBClassifier | oversampling | date_of_last_rech_action_phase | 0.033 |
184 | XGBClassifier | oversampling | vol_3g_mb_action_phase | 0.024 |
185 | XGBClassifier | oversampling | total_rech_num_good_phase | 0.018 |
186 | XGBClassifier | oversampling | loc_og_mou_action_phase | 0.017 |
187 | XGBClassifier | oversampling | vol_2g_mb_action_phase | 0.016 |
# Plot the Barplot and evaluate the performance of various models.
plt.hlines(xmin=-0.5,xmax=30,y=0.5, color='r',linestyles='--')
plt.title('Models Performance Summary');
Based on the analysis, we have chosen the GradientBoostClassifier with SMOTE to address class imbalance. Here’s why this model was selected:
- Strong Performance Metrics: The model achieved an impressive ROC score of 0.942 with good precision and recall, ensuring reliable classification.
- Balanced Accuracy: The training accuracy does not indicate overfitting, and the gap between train and test accuracy remains within an acceptable range.
- Processing Time: While the model takes around 2 minute, this is deemed acceptable given the size and complexity of the current dataset.
Below are the top factors that impact the churn.
- Total incoming minutes of usage action phase:
- Date of last recharge action phase :
- Roaming outgoing calls minutes of usage action phase :
- Last day recharge amount action phase :
- Roaming incoming minutes of usage action phase:
end_time = ct.time()
total_time = end_time - start_time
print(f'Time taken to complete :{(total_time/60)} mins')
Time taken to complete :16.31866637070974 mins
