import time as ct
start_time = ct.time() 

import numpy as np
import pandas as pd
import seaborn as sns; sns.set_theme(color_codes=True)
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
%matplotlib inline

# Set custom display properties in pandas
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 900) 
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# %pip install fast_ml    ## Required for constant feature identification package
# %conda install xgboost -y
# %conda install -c conda-forge imbalanced-learn -y

from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTETomek

import xgboost as xgb
import statsmodels.api as sm
from sklearn.tree import plot_tree
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE, RFECV
from sklearn.decomposition import PCA, IncrementalPCA 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score, RocCurveDisplay, precision_recall_curve, f1_score, classification_report, accuracy_score

telecom_df = pd.read_csv("./telecom_churn_data.csv")
telecom_df.head(2)

# check columns names and datatype 
telecom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Columns: 226 entries, mobile_number to sep_vbc_3g
dtypes: float64(179), int64(35), object(12)
memory usage: 172.4+ MB

def combine_features(df, cols, pat1='_6' , pat2='_7' ,to_append='_good_phase'):
    drop_lst = []
    new_cols = []
    month_col = cols[cols.str.contains(pat='jun_|jul_')].str.replace(pat="(jun_|jul_)", repl="", regex=True).unique()
    cols = cols.str.replace(pat="(_\d$|jun_|jul_)", repl="", regex=True).unique()
    for col in cols:
        if col in month_col:
            new_col = col + to_append
            col1 = 'jun_' + col
            col2 = 'jul_' + col
        else:
            new_col = col + to_append
            col1 = col + pat1
            col2 = col + pat2
            
        df[new_col] = df[[col1, col2]].mean(axis=1) 
        drop_lst.extend([col1,col2])
        new_cols.extend([new_col])
    return drop_lst, new_cols   

def find_outliers(df):
    temp_df = pd.DataFrame(columns= ['col', 'lower_threshold', 'iqr_q1', 'iqr', 'iqr_q3', 'upper_threshold', 'outliers_cnt'])
    num_cols = df.select_dtypes(include=np.number).columns
    for col in num_cols:
        arr = df[col][df[col].notna()]
        iqr_q3 = np.quantile(df[col], 0.75)
        iqr_q1 = np.quantile(df[col], 0.25)
        
        iqr = iqr_q3 - iqr_q1
        iqr_upper_threshold = iqr_q3 + (1.5 * iqr)
        iqr_lower_threshold = iqr_q1 - (1.5 * iqr)
        
        outliers = arr[(arr > iqr_upper_threshold) | (arr < iqr_lower_threshold)]
        to_add = pd.Series({'col': col, 'lower_threshold': iqr_lower_threshold, 'iqr_q1': iqr_q1, 'iqr': iqr, 'iqr_q3': iqr_q3, 'upper_threshold': iqr_upper_threshold, 'outliers_cnt': len(outliers)})
        temp_df = pd.concat([temp_df,to_add.to_frame().T])
    return temp_df

def check_col_null_pct(df, thresh=0, incl_all=1):
    col_null_pct = (df.isna().sum()/len(df))*100
    cols_na_abv_thresh = col_null_pct[col_null_pct > thresh]
    cols_na_bel_thresh = col_null_pct[col_null_pct < thresh]
    return cols_na_abv_thresh.sort_values(ascending=False) , cols_na_bel_thresh.sort_values(ascending=False)

telecom_df.size
telecom_df.shape
telecom_df.columns

22599774

(99999, 226)

Index(['mobile_number', 'circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou',
       'loc_ic_t2o_mou', 'last_date_of_month_6', 'last_date_of_month_7',
       'last_date_of_month_8', 'last_date_of_month_9', 'arpu_6',
       ...
       'sachet_3g_9', 'fb_user_6', 'fb_user_7', 'fb_user_8', 'fb_user_9',
       'aon', 'aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g', 'sep_vbc_3g'],
      dtype='object', length=226)

telecom_df[telecom_df.duplicated()]
telecom_df['mobile_number'].is_unique

True

cols_with_gt40_na , cols_with_le40_na = check_col_null_pct(telecom_df, 40)
len(cols_with_gt40_na)

40

telecom_df = telecom_df[cols_with_le40_na.index.sort_values(ascending=True)]
cols_with_le40_na[cols_with_le40_na > 0]

loc_og_t2m_mou_9       7.745
loc_ic_t2m_mou_9       7.745
offnet_mou_9           7.745
std_ic_t2f_mou_9       7.745
roam_ic_mou_9          7.745
std_og_t2t_mou_9       7.745
roam_og_mou_9          7.745
std_ic_t2m_mou_9       7.745
loc_og_t2t_mou_9       7.745
std_ic_t2o_mou_9       7.745
loc_og_mou_9           7.745
std_ic_t2t_mou_9       7.745
isd_og_mou_9           7.745
loc_ic_t2f_mou_9       7.745
loc_og_t2c_mou_9       7.745
loc_og_t2f_mou_9       7.745
loc_ic_t2t_mou_9       7.745
std_og_t2m_mou_9       7.745
spl_ic_mou_9           7.745
std_ic_mou_9           7.745
ic_others_9            7.745
std_og_mou_9           7.745
isd_ic_mou_9           7.745
spl_og_mou_9           7.745
onnet_mou_9            7.745
std_og_t2c_mou_9       7.745
loc_ic_mou_9           7.745
og_others_9            7.745
std_og_t2f_mou_9       7.745
std_og_t2t_mou_8       5.378
std_og_t2c_mou_8       5.378
loc_ic_t2m_mou_8       5.378
loc_ic_t2f_mou_8       5.378
og_others_8            5.378
std_og_t2m_mou_8       5.378
loc_og_mou_8           5.378
std_og_mou_8           5.378
loc_ic_mou_8           5.378
std_og_t2f_mou_8       5.378
loc_og_t2c_mou_8       5.378
spl_og_mou_8           5.378
loc_ic_t2t_mou_8       5.378
isd_og_mou_8           5.378
loc_og_t2f_mou_8       5.378
std_ic_t2f_mou_8       5.378
ic_others_8            5.378
isd_ic_mou_8           5.378
spl_ic_mou_8           5.378
std_ic_mou_8           5.378
onnet_mou_8            5.378
std_ic_t2o_mou_8       5.378
roam_ic_mou_8          5.378
offnet_mou_8           5.378
loc_og_t2m_mou_8       5.378
roam_og_mou_8          5.378
std_ic_t2m_mou_8       5.378
loc_og_t2t_mou_8       5.378
std_ic_t2t_mou_8       5.378
date_of_last_rech_9    4.760
std_ic_t2f_mou_6       3.937
spl_og_mou_6           3.937
loc_ic_mou_6           3.937
ic_others_6            3.937
isd_ic_mou_6           3.937
std_ic_t2m_mou_6       3.937
spl_ic_mou_6           3.937
loc_ic_t2m_mou_6       3.937
loc_ic_t2t_mou_6       3.937
std_ic_mou_6           3.937
std_ic_t2t_mou_6       3.937
std_ic_t2o_mou_6       3.937
loc_ic_t2f_mou_6       3.937
og_others_6            3.937
roam_ic_mou_6          3.937
offnet_mou_6           3.937
roam_og_mou_6          3.937
std_og_t2f_mou_6       3.937
loc_og_t2f_mou_6       3.937
loc_og_t2t_mou_6       3.937
std_og_t2c_mou_6       3.937
std_og_t2t_mou_6       3.937
std_og_t2m_mou_6       3.937
loc_og_mou_6           3.937
onnet_mou_6            3.937
std_og_mou_6           3.937
loc_og_t2m_mou_6       3.937
loc_og_t2c_mou_6       3.937
isd_og_mou_6           3.937
std_ic_t2f_mou_7       3.859
std_ic_t2m_mou_7       3.859
roam_og_mou_7          3.859
loc_og_t2t_mou_7       3.859
isd_og_mou_7           3.859
roam_ic_mou_7          3.859
spl_og_mou_7           3.859
offnet_mou_7           3.859
onnet_mou_7            3.859
std_ic_mou_7           3.859
spl_ic_mou_7           3.859
isd_ic_mou_7           3.859
ic_others_7            3.859
loc_og_t2m_mou_7       3.859
std_ic_t2o_mou_7       3.859
std_ic_t2t_mou_7       3.859
std_og_t2c_mou_7       3.859
loc_og_t2c_mou_7       3.859
loc_ic_mou_7           3.859
loc_og_mou_7           3.859
loc_ic_t2f_mou_7       3.859
std_og_t2t_mou_7       3.859
loc_ic_t2m_mou_7       3.859
std_og_t2m_mou_7       3.859
std_og_t2f_mou_7       3.859
loc_ic_t2t_mou_7       3.859
std_og_mou_7           3.859
loc_og_t2f_mou_7       3.859
og_others_7            3.859
date_of_last_rech_8    3.622
date_of_last_rech_7    1.767
last_date_of_month_9   1.659
date_of_last_rech_6    1.607
last_date_of_month_8   1.100
loc_og_t2o_mou         1.018
std_og_t2o_mou         1.018
loc_ic_t2o_mou         1.018
last_date_of_month_7   0.601
dtype: float64

telecom_df.head(2)

from fast_ml import feature_selection as fs

const_features_df = fs.get_constant_features(telecom_df, threshold=100, dropna=True)
const_features_df

telecom_df = telecom_df.drop(const_features_df['Var'].to_list(), axis=1)
telecom_df.shape

(99999, 170)

_,cols_with_le10_na = check_col_null_pct(telecom_df, 10)
plt.figure(figsize=(30,2))
cols_with_le10_na[(cols_with_le10_na > 0)].plot.bar()
plt.show();

telecom_df[['date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8','date_of_last_rech_9']] = telecom_df[['date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8','date_of_last_rech_9']].apply(lambda x: pd.to_datetime(x).dt.day)

find_outliers(telecom_df).sort_values(by=['outliers_cnt', 'lower_threshold', 'iqr_q1', 'iqr', 'iqr_q3', 'upper_threshold'], ascending=[False, True, False, False, False, False]).head(20)

telecom_df_bk = telecom_df.copy()
# telecom_df = telecom_df_bk

to_impute_df = telecom_df.select_dtypes(include = np.number)

si = SimpleImputer(strategy='median')
imputed_arr = si.fit_transform(to_impute_df)
df_imputed = pd.DataFrame(imputed_arr, columns = to_impute_df.columns)

telecom_df = telecom_df[telecom_df.columns.difference(to_impute_df.columns)]
telecom_df = pd.concat([telecom_df, df_imputed], axis=1)
telecom_df

# telecom_df['total_rech_amt_good_phase'] = telecom_df[['total_rech_amt_6','total_rech_amt_7']].sum(axis=1)
# telecom_df = telecom_df.drop(['total_rech_amt_6','total_rech_amt_7'], axis=1)
# telecom_df = telecom_df[(telecom_df['total_rech_amt_good_phase'] >= telecom_df['total_rech_amt_good_phase'].quantile(0.7))]

telecom_df['total_rech_amt_good_phase']  = telecom_df[['total_rech_amt_6','total_rech_amt_7']].mean(axis=1)
telecom_df = telecom_df.drop(['total_rech_amt_6','total_rech_amt_7'], axis=1)
telecom_df = telecom_df[(telecom_df['total_rech_amt_good_phase'] >= telecom_df['total_rech_amt_good_phase'].quantile(0.7))]

telecom_df['churn'] = telecom_df.apply(lambda x: 1 if ((x['total_ic_mou_9'] < 1) & (x['total_og_mou_9'] < 1) & (x['vol_2g_mb_9'] < 1 ) & (x['vol_3g_mb_9'] < 1) )  else  0, axis=1)

telecom_df.columns = telecom_df.columns.str.replace(pat='_8',repl='_action_phase')
telecom_df = telecom_df.rename(columns={'aug_vbc_3g': 'vbc_3g_action_phase'})
telecom_df.filter(like='_8').columns
telecom_df.shape

Index([], dtype='object')

(30011, 170)

sep_cols_to_drop = telecom_df.filter(like='_9').columns.to_list()
sep_cols_to_drop.append('sep_vbc_3g')
sep_cols_to_drop

['arpu_9',
 'date_of_last_rech_9',
 'ic_others_9',
 'isd_ic_mou_9',
 'isd_og_mou_9',
 'last_day_rch_amt_9',
 'loc_ic_mou_9',
 'loc_ic_t2f_mou_9',
 'loc_ic_t2m_mou_9',
 'loc_ic_t2t_mou_9',
 'loc_og_mou_9',
 'loc_og_t2c_mou_9',
 'loc_og_t2f_mou_9',
 'loc_og_t2m_mou_9',
 'loc_og_t2t_mou_9',
 'max_rech_amt_9',
 'monthly_2g_9',
 'monthly_3g_9',
 'offnet_mou_9',
 'og_others_9',
 'onnet_mou_9',
 'roam_ic_mou_9',
 'roam_og_mou_9',
 'sachet_2g_9',
 'sachet_3g_9',
 'spl_ic_mou_9',
 'spl_og_mou_9',
 'std_ic_mou_9',
 'std_ic_t2f_mou_9',
 'std_ic_t2m_mou_9',
 'std_ic_t2t_mou_9',
 'std_og_mou_9',
 'std_og_t2f_mou_9',
 'std_og_t2m_mou_9',
 'std_og_t2t_mou_9',
 'total_ic_mou_9',
 'total_og_mou_9',
 'total_rech_amt_9',
 'total_rech_num_9',
 'vol_2g_mb_9',
 'vol_3g_mb_9',
 'sep_vbc_3g']

telecom_df = telecom_df.drop(sep_cols_to_drop, axis=1)
telecom_df.shape

(30011, 128)

cols_to_combine =  telecom_df.filter(regex='.*(jun_|jul_|_6|_7).*',axis=1).columns
telecom_df[cols_to_combine].info()

<class 'pandas.core.frame.DataFrame'>
Index: 30011 entries, 7 to 99997
Data columns (total 82 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   arpu_6               30011 non-null  float64
 1   arpu_7               30011 non-null  float64
 2   date_of_last_rech_6  30011 non-null  float64
 3   date_of_last_rech_7  30011 non-null  float64
 4   ic_others_6          30011 non-null  float64
 5   ic_others_7          30011 non-null  float64
 6   isd_ic_mou_6         30011 non-null  float64
 7   isd_ic_mou_7         30011 non-null  float64
 8   isd_og_mou_6         30011 non-null  float64
 9   isd_og_mou_7         30011 non-null  float64
 10  jul_vbc_3g           30011 non-null  float64
 11  jun_vbc_3g           30011 non-null  float64
 12  last_day_rch_amt_6   30011 non-null  float64
 13  last_day_rch_amt_7   30011 non-null  float64
 14  loc_ic_mou_6         30011 non-null  float64
 15  loc_ic_mou_7         30011 non-null  float64
 16  loc_ic_t2f_mou_6     30011 non-null  float64
 17  loc_ic_t2f_mou_7     30011 non-null  float64
 18  loc_ic_t2m_mou_6     30011 non-null  float64
 19  loc_ic_t2m_mou_7     30011 non-null  float64
 20  loc_ic_t2t_mou_6     30011 non-null  float64
 21  loc_ic_t2t_mou_7     30011 non-null  float64
 22  loc_og_mou_6         30011 non-null  float64
 23  loc_og_mou_7         30011 non-null  float64
 24  loc_og_t2c_mou_6     30011 non-null  float64
 25  loc_og_t2c_mou_7     30011 non-null  float64
 26  loc_og_t2f_mou_6     30011 non-null  float64
 27  loc_og_t2f_mou_7     30011 non-null  float64
 28  loc_og_t2m_mou_6     30011 non-null  float64
 29  loc_og_t2m_mou_7     30011 non-null  float64
 30  loc_og_t2t_mou_6     30011 non-null  float64
 31  loc_og_t2t_mou_7     30011 non-null  float64
 32  max_rech_amt_6       30011 non-null  float64
 33  max_rech_amt_7       30011 non-null  float64
 34  monthly_2g_6         30011 non-null  float64
 35  monthly_2g_7         30011 non-null  float64
 36  monthly_3g_6         30011 non-null  float64
 37  monthly_3g_7         30011 non-null  float64
 38  offnet_mou_6         30011 non-null  float64
 39  offnet_mou_7         30011 non-null  float64
 40  og_others_6          30011 non-null  float64
 41  og_others_7          30011 non-null  float64
 42  onnet_mou_6          30011 non-null  float64
 43  onnet_mou_7          30011 non-null  float64
 44  roam_ic_mou_6        30011 non-null  float64
 45  roam_ic_mou_7        30011 non-null  float64
 46  roam_og_mou_6        30011 non-null  float64
 47  roam_og_mou_7        30011 non-null  float64
 48  sachet_2g_6          30011 non-null  float64
 49  sachet_2g_7          30011 non-null  float64
 50  sachet_3g_6          30011 non-null  float64
 51  sachet_3g_7          30011 non-null  float64
 52  spl_ic_mou_6         30011 non-null  float64
 53  spl_ic_mou_7         30011 non-null  float64
 54  spl_og_mou_6         30011 non-null  float64
 55  spl_og_mou_7         30011 non-null  float64
 56  std_ic_mou_6         30011 non-null  float64
 57  std_ic_mou_7         30011 non-null  float64
 58  std_ic_t2f_mou_6     30011 non-null  float64
 59  std_ic_t2f_mou_7     30011 non-null  float64
 60  std_ic_t2m_mou_6     30011 non-null  float64
 61  std_ic_t2m_mou_7     30011 non-null  float64
 62  std_ic_t2t_mou_6     30011 non-null  float64
 63  std_ic_t2t_mou_7     30011 non-null  float64
 64  std_og_mou_6         30011 non-null  float64
 65  std_og_mou_7         30011 non-null  float64
 66  std_og_t2f_mou_6     30011 non-null  float64
 67  std_og_t2f_mou_7     30011 non-null  float64
 68  std_og_t2m_mou_6     30011 non-null  float64
 69  std_og_t2m_mou_7     30011 non-null  float64
 70  std_og_t2t_mou_6     30011 non-null  float64
 71  std_og_t2t_mou_7     30011 non-null  float64
 72  total_ic_mou_6       30011 non-null  float64
 73  total_ic_mou_7       30011 non-null  float64
 74  total_og_mou_6       30011 non-null  float64
 75  total_og_mou_7       30011 non-null  float64
 76  total_rech_num_6     30011 non-null  float64
 77  total_rech_num_7     30011 non-null  float64
 78  vol_2g_mb_6          30011 non-null  float64
 79  vol_2g_mb_7          30011 non-null  float64
 80  vol_3g_mb_6          30011 non-null  float64
 81  vol_3g_mb_7          30011 non-null  float64
dtypes: float64(82)
memory usage: 19.0 MB

drop_lst, new_cols = combine_features(df=telecom_df, cols=cols_to_combine, pat1='_6', pat2='_7',to_append='_good_phase')
telecom_df = telecom_df.drop(drop_lst, axis=1)
telecom_df.shape
telecom_df.columns

(30011, 87)

Index(['aon', 'arpu_action_phase', 'vbc_3g_action_phase',
       'date_of_last_rech_action_phase', 'ic_others_action_phase',
       'isd_ic_mou_action_phase', 'isd_og_mou_action_phase',
       'last_day_rch_amt_action_phase', 'loc_ic_mou_action_phase',
       'loc_ic_t2f_mou_action_phase', 'loc_ic_t2m_mou_action_phase',
       'loc_ic_t2t_mou_action_phase', 'loc_og_mou_action_phase',
       'loc_og_t2c_mou_action_phase', 'loc_og_t2f_mou_action_phase',
       'loc_og_t2m_mou_action_phase', 'loc_og_t2t_mou_action_phase',
       'max_rech_amt_action_phase', 'mobile_number', 'monthly_2g_action_phase',
       'monthly_3g_action_phase', 'offnet_mou_action_phase',
       'og_others_action_phase', 'onnet_mou_action_phase',
       'roam_ic_mou_action_phase', 'roam_og_mou_action_phase',
       'sachet_2g_action_phase', 'sachet_3g_action_phase',
       'spl_ic_mou_action_phase', 'spl_og_mou_action_phase',
       'std_ic_mou_action_phase', 'std_ic_t2f_mou_action_phase',
       'std_ic_t2m_mou_action_phase', 'std_ic_t2t_mou_action_phase',
       'std_og_mou_action_phase', 'std_og_t2f_mou_action_phase',
       'std_og_t2m_mou_action_phase', 'std_og_t2t_mou_action_phase',
       'total_ic_mou_action_phase', 'total_og_mou_action_phase',
       'total_rech_amt_action_phase', 'total_rech_num_action_phase',
       'vol_2g_mb_action_phase', 'vol_3g_mb_action_phase',
       'total_rech_amt_good_phase', 'churn', 'arpu_good_phase',
       'date_of_last_rech_good_phase', 'ic_others_good_phase',
       'isd_ic_mou_good_phase', 'isd_og_mou_good_phase', 'vbc_3g_good_phase',
       'last_day_rch_amt_good_phase', 'loc_ic_mou_good_phase',
       'loc_ic_t2f_mou_good_phase', 'loc_ic_t2m_mou_good_phase',
       'loc_ic_t2t_mou_good_phase', 'loc_og_mou_good_phase',
       'loc_og_t2c_mou_good_phase', 'loc_og_t2f_mou_good_phase',
       'loc_og_t2m_mou_good_phase', 'loc_og_t2t_mou_good_phase',
       'max_rech_amt_good_phase', 'monthly_2g_good_phase',
       'monthly_3g_good_phase', 'offnet_mou_good_phase',
       'og_others_good_phase', 'onnet_mou_good_phase',
       'roam_ic_mou_good_phase', 'roam_og_mou_good_phase',
       'sachet_2g_good_phase', 'sachet_3g_good_phase', 'spl_ic_mou_good_phase',
       'spl_og_mou_good_phase', 'std_ic_mou_good_phase',
       'std_ic_t2f_mou_good_phase', 'std_ic_t2m_mou_good_phase',
       'std_ic_t2t_mou_good_phase', 'std_og_mou_good_phase',
       'std_og_t2f_mou_good_phase', 'std_og_t2m_mou_good_phase',
       'std_og_t2t_mou_good_phase', 'total_ic_mou_good_phase',
       'total_og_mou_good_phase', 'total_rech_num_good_phase',
       'vol_2g_mb_good_phase', 'vol_3g_mb_good_phase'],
      dtype='object')

cols_with_null,_ = check_col_null_pct(telecom_df)
cols_with_null

Series([], dtype: float64)

telecom_df = telecom_df.drop('mobile_number', axis=1)

# Histplot analysis for all the numeric columns
cols = telecom_df.select_dtypes(include=np.number).columns
fig, axs = plt.subplots(int(np.ceil(len(cols)/10)),10, figsize=(30, int(np.ceil(len(cols)/10))*2))

for idx, col in enumerate(cols):
    t1 = axs.flatten()[idx]
    t1.hist(telecom_df[col])
    t1.set_title(col)

plt.tight_layout()
plt.show();

# Boxplot analysis for all the numeric columns
cols = telecom_df.select_dtypes(include=np.number).columns
fig, axs = plt.subplots(int(np.ceil(len(cols)/8)),8, figsize=(30, int(np.ceil(len(cols)/8))*3))

for idx, col in enumerate(cols):
    t1 = axs.flatten()[idx]
    sns.boxplot(telecom_df[col], orient='vert', palette='viridis', ax=t1)
    t1.set_title(col)

plt.tight_layout()
plt.show();

# Remove Outlier
per = telecom_df['aon'].quantile([0.05,0.95]).values
telecom_df['aon'][telecom_df['aon'] >= per[1]] = per[1]

cls = ['total_ic_mou_action_phase','date_of_last_rech_action_phase','last_day_rch_amt_action_phase','roam_ic_mou_action_phase', 'roam_og_mou_action_phase',
       'total_rech_amt_action_phase','spl_ic_mou_action_phase','vol_3g_mb_action_phase','total_rech_num_good_phase','loc_og_mou_action_phase','vol_2g_mb_action_phase']
sns.pairplot(telecom_df[cls], palette='viridis')
plt.show();

# kdeplot analysis for all the numeric columns
cols = telecom_df.select_dtypes(include=np.number).columns
fig, axs = plt.subplots(int(np.ceil(len(cols)/6)),6, figsize=(30, int(np.ceil(len(cols)/6))*3))

for idx, col in enumerate(cols):
    t1 = axs.flatten()[idx]
    sns.kdeplot(telecom_df, x=col, hue='churn', palette='viridis', ax=t1)
    t1.set_title(col)

plt.tight_layout()
plt.show();

# Check the Heatmap for numerical columns
corr_df = telecom_df[telecom_df.select_dtypes(include=np.number).columns].corr()
mask = np.zeros_like(corr_df)
mask[np.triu_indices_from(mask)] = True
fig, axs = plt.subplots(figsize=(20,10))
with sns.axes_style('white'):
    axs = sns.heatmap(corr_df, mask=mask, square=True)
plt.plot();

# Find top 20 correlations
corr_df = telecom_df.corr(numeric_only=True).abs()
corr_df = corr_df.unstack()
correlation = corr_df.sort_values()
correlation = corr_df.dropna()

correlation = correlation [correlation  != 1.0]
correlation = correlation .reset_index()
correlation.sort_values(by=0, ascending=False).head(20)

# evaluate the stats for all numeric columns
telecom_df.describe()

X = telecom_df.drop('churn', axis=1)
y = telecom_df['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, stratify=y, random_state=100)

scaler = StandardScaler()
X_train[X_train.columns] = scaler.fit_transform(X_train, y_train)
X_train.head()

X_test[X_test.columns] = scaler.transform(X_test)

# telecom_df.to_csv('test.csv')
telecom_df['churn'].sum()/len(telecom_df['churn'])*100

8.823431408483556

X_train_ori, y_train_ori = X_train.copy(), y_train.copy()

# Helper function to call all the Imbalance techniques. 
def handle_imbalance(X_tr, y_tr, technique='oversampling', random_state=100):
    if technique == 'undersampling':
        under_sample = RandomUnderSampler(random_state=random_state, sampling_strategy='majority')
        # print(under_sample.get_params())
        # print(under_sample._sampling_strategy_docstring)
        X_train_udr, y_train_udr = under_sample.fit_resample(X_tr, y_tr)
        return X_train_udr, y_train_udr
    elif technique == 'tomek_links':
        tomek_sample = TomekLinks()
        # print(tomek_sample.get_params())
        # print(tomek_sample._sampling_strategy_docstring)
        X_train_tomek, y_train_tomek = tomek_sample.fit_resample(X_tr, y_tr)
        return X_train_tomek, y_train_tomek
    elif technique == 'oversampling':
        over_sample = RandomOverSampler(random_state=random_state)
        # print(over_sample.get_params())
        # print(over_sample._sampling_strategy_docstring)
        X_train_ovr, y_train_ovr = over_sample.fit_resample(X_tr, y_tr)
        return X_train_ovr, y_train_ovr
    elif technique == 'smote':
        smote_sample = SMOTE(random_state=random_state, k_neighbors=5)
        # print(smote_sample.get_params())
        # print(smote_sample._sampling_strategy_docstring)
        X_train_smote, y_train_smote = smote_sample.fit_resample(X_tr, y_tr)
        return X_train_smote, y_train_smote
    elif technique == 'adasyn':
        adasyn_sample = ADASYN(random_state=random_state, n_neighbors=5)
        # print(adasyn_sample.get_params())
        # print(adasyn_sample._sampling_strategy_docstring)
        X_train_adasyn, y_train_adasyn = adasyn_sample.fit_resample(X_tr, y_tr)
        return X_train_adasyn, y_train_adasyn
    elif technique == 'smote_tomek':
        smote_tomek_sample = SMOTETomek(random_state=random_state)
        # print(smote_tomek_sample.get_params())
        X_train_smote_tomek, y_train_smote_tomek = smote_tomek_sample.fit_resample(X_tr, y_tr)
        return X_train_smote_tomek, y_train_smote_tomek

X_train_ovr, y_train_ovr = handle_imbalance(X_train_ori, y_train_ori, technique='oversampling', random_state=100)
sum(y_train_ovr)/len(y_train_ovr)*100
sorted(Counter(y_train_ovr).items())

50.0

[(0, 19153), (1, 19153)]

X_train_smote, y_train_smote = handle_imbalance(X_train_ori, y_train_ori, technique='smote', random_state=100)
sum(y_train_smote)/len(y_train_smote)*100
sorted(Counter(y_train_smote).items())

50.0

[(0, 19153), (1, 19153)]

X_train_adasyn, y_train_adasyn = handle_imbalance(X_train_ori, y_train_ori, technique='adasyn', random_state=100)
sum(y_train_adasyn)/len(y_train_adasyn)*100
sorted(Counter(y_train_adasyn).items())

49.94773428108504

[(0, 19153), (1, 19113)]

def principal_component_analysis(X_tr, random_state=100):
    pca = PCA(random_state=random_state)
    pca.fit(X_tr)

    var_ratio_df = pd.DataFrame({'feature':X_tr.columns, 'var_ratio': pca.explained_variance_ratio_})
    components = pd.DataFrame(pca.components_)
    var_ratio_df = pd.concat([var_ratio_df,components],axis=1)
    var_ratio_df.head()
    
    fig, axs = plt.subplots(1,3, figsize=(25,5))
    axs[0].bar(range(1,len(pca.explained_variance_ratio_)+1), pca.explained_variance_ratio_)
    axs[0].set_xlabel('Component number')
    axs[0].set_ylabel('Explained variance ratio')
    axs[0].set_title('Bar plot')

    axs[1].plot(pca.explained_variance_ratio_)
    axs[1].set_xlabel('Component number')
    axs[1].set_ylabel('Explained variance ratio')
    axs[1].set_title('Scree plot')

    var_cumu = np.cumsum(pca.explained_variance_ratio_) 
    axs[2].vlines(x=47, ymax=1, ymin=0, colors="r", linestyles="--")
    axs[2].hlines(y=0.95, xmax=100, xmin=0, colors="g", linestyles="--")
    axs[2].plot(var_cumu)
    axs[2].set_ylabel("Cumulative variance explained")
    axs[2].set_xlabel('Component number')
    plt.show();
    
def incremental_pca(X_tr, X_te, n_components=45):
    pca_incremental = IncrementalPCA(n_components, )
    pca_incremental.fit(X_tr)
    
    principal_components = pca_incremental.components_
    feature_names = X_tr.columns

    # Calculate the overall importance of each feature
    feature_importance = {}
    for i, component in enumerate(principal_components):
        for j, weight in enumerate(component):
            if feature_names[j] not in feature_importance:
                feature_importance[feature_names[j]] = 0.0
            feature_importance[feature_names[j]] += abs(weight)

    # Sort features by their overall importance
    sorted_feature_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

    feature_lst = []
    for feature, importance in sorted_feature_importance[:10]:
        feature_lst.append(f"{feature}: {importance}")

    df_tr_pca = pca_incremental.fit_transform(X_tr)
    df_te_pca = pca_incremental.transform(X_te)
    
    return df_tr_pca, df_te_pca, feature_names, feature_lst

print("PCA using Random Oversampling")
X_train , y_train = X_train_ovr, y_train_ovr 
principal_component_analysis(X_train, random_state=100)
df_train_ovr_pca, df_test_ovr_pca, feature_names, feature_lst = incremental_pca(X_train, X_test, n_components=45)

print("PCA using SMOTE")
X_train , y_train = X_train_smote, y_train_smote 
principal_component_analysis(X_train, random_state=100)
df_train_smote_pca, df_test_smote_pca, feature_names, feature_lst = incremental_pca(X_train, X_test, n_components=45)

print("PCA using ADASYN")
X_train , y_train = X_train_adasyn, y_train_adasyn 
principal_component_analysis(X_train, random_state=100)
df_train_adasyn_pca, df_test_adasyn_pca, feature_names, feature_lst = incremental_pca(X_train, X_test, n_components=45)

PCA using Random Oversampling

PCA using SMOTE

PCA using ADASYN

# resusable function to train using logistic regression Model
def model_training(fXt, fyt, fcutoff, ftest=False, fres=None):
    fXt_sm = sm.add_constant(fXt)
    
    if ftest == False:
        lrm = sm.GLM(fyt, fXt_sm, family=sm.families.Binomial())
        lrm = lrm.fit()
        fyt_pred = lrm.predict(fXt_sm)
    else:
        lrm = fres
        fyt_pred = lrm.predict(fXt_sm)
        
    fyt_pred = fyt_pred.values.reshape(-1)
    fyt_pred_final = pd.DataFrame({'Converted': fyt.values, 'Conv_Prob': fyt_pred})
    fyt_pred_final['ID'] = fyt.index
    fyt_pred_final['predicted'] = fyt_pred_final.Conv_Prob.map(lambda x: 1 if x > fcutoff else 0)
    return lrm, fyt_pred, fyt_pred_final

# Metrics for logistic regression 
def logreg_metrics_fn(fyt_pred_final):
    fconfusion = confusion_matrix(fyt_pred_final.Converted, fyt_pred_final.predicted )
    faccuracy = accuracy_score(fyt_pred_final.Converted, fyt_pred_final.predicted)
    
    TP = fconfusion[1,1] # true positive 
    TN = fconfusion[0,0] # true negatives
    FP = fconfusion[0,1] # false positives
    FN = fconfusion[1,0] # false negatives
    
    fSensi  = TP/(TP+FN) # Calculate the sensitivity
    fSpeci  = TN/(TN+FP) # Calculate the specificity
    fPreci  = TP/(TP+FP) # Calculate Precision
    fRecal  = TP/(TP+FN) # Calculate Recall
       
    return fconfusion, faccuracy, fSensi, fSpeci, fPreci, fRecal

# Function to generate VIF of scores. 
def get_vif_score(fXt, cl):
    vif = pd.DataFrame()
    vif['Features'] = fXt[cl].columns
    vif['VIF'] = [variance_inflation_factor(fXt[cl].values, i) for i in range(fXt[cl].shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    vif = vif.sort_values(by = "VIF", ascending = False)
    return vif

# A common function to generate most imp metrics for all the classification algorithms. 
def generate_metrics(yt, yt_pred, yt_prob ):
    f1_sc = f1_score(yt, yt_pred,  )
    recall = recall_score(yt, yt_pred,  )
    roc_score = roc_auc_score(yt, yt_prob)
    conf_mat = confusion_matrix(yt, yt_pred)
    accuracy = accuracy_score(yt, yt_pred,  )
    precision = precision_score(yt, yt_pred,  )
    cl_rep = classification_report(yt, yt_pred)
    df = pd.DataFrame({"accuracy":[accuracy],"roc_score":[roc_score],"precision":[precision],"recall":[recall],"f1_score":[f1_sc], 
                       "classification_report":[cl_rep], "confusion_matrix": [conf_mat]})
    return df, accuracy,roc_score, precision,recall, f1_sc, cl_rep, conf_mat

# Function to generate final summary report comprising of all the models and its metrics scores
def generate_summary_report(df=None, model_name="", class_imb='', train_accuracy="", test_accuracy="", roc_score="", precision="", recall="", f1_score="", classification_rep="", conf_matrix="", feature_imp="", step='create'):
    if step == 'create':
        df = pd.DataFrame(columns=["model_name","class_imb","train_accuracy","test_accuracy","roc_score","precision","recall","f1_score", "classification_report", "confusion_matrix", "feature_imp"])
        return df
    elif step == 'add': 
        df.loc[len(df)] = pd.Series({"model_name": model_name,"class_imb":class_imb,"train_accuracy": train_accuracy, "test_accuracy": test_accuracy,"roc_score": roc_score,"precision": precision,"recall": recall,"f1_score":f1_score, "classification_report": classification_rep, "confusion_matrix": conf_matrix, "feature_imp":feature_imp})
        return df

X_train, y_train = X_train_ovr, y_train_ovr

logreg = LogisticRegression()
logreg = logreg.fit(X_train,y_train)

# Find the optimum no of top features and Its contribution to the overall score using the RFECV method. 
rfecv = RFECV(estimator=logreg, cv=5)
rfecv.fit(X_train, y_train)

RFECV(cv=5, estimator=LogisticRegression())

RFECV(cv=5, estimator=LogisticRegression())

LogisticRegression()

LogisticRegression()

rfecv.cv_results_['mean_test_score']
plt.figure(figsize=[30, 5])
plt.plot(range(1, len(X_train.columns)+1), rfecv.cv_results_['mean_test_score'])
plt.title('RFECV')
plt.ylabel('mean_test_score')
plt.xlabel('columns')
plt.show();

rfe = RFE(estimator=logreg, n_features_to_select= 15) 
rfe = rfe.fit(X_train, y_train)

cols = rfe.get_feature_names_out()
rfe_df = pd.DataFrame({'feature':X_train.columns, 'rank': rfe.ranking_ , 'support': rfe.support_})
rfe_df.sort_values(by='rank', ascending=True).head(15)

logreg = sm.GLM(y_train, sm.add_constant(X_train), family=sm.families.Binomial())
logreg = logreg.fit()
# logreg.summary()

logreg1, y_train_pred, y_train_pred_final = model_training(X_train[cols], y_train, 0.5)
logreg1.pvalues.sort_values(ascending=False).head()

vif_scores = get_vif_score(X_train, cols)
vif_scores[vif_scores['VIF'] > 5].head()

total_ic_mou_good_phase   0.000
std_ic_mou_good_phase     0.000
isd_ic_mou_good_phase     0.000
loc_ic_mou_good_phase     0.000
total_og_mou_good_phase   0.000
dtype: float64

# 'total_ic_mou_good_phase' feature has very high VIF score. Hence we drop it. 
cols = list(cols)
cols.remove('total_ic_mou_good_phase')

logreg2, y_train_pred, y_train_pred_final = model_training(X_train[cols], y_train, 0.5)
logreg2.pvalues.sort_values(ascending=False).head()

vif_scores = get_vif_score(X_train,cols)
vif_scores[vif_scores['VIF'] > 5].head()

std_ic_mou_good_phase     0.000
isd_ic_mou_good_phase     0.000
total_og_mou_good_phase   0.000
loc_ic_mou_good_phase     0.000
spl_ic_mou_action_phase   0.000
dtype: float64

# 'total_ic_mou_action_phase' feature has very high VIF score. Hence we drop it. 
cols.remove('total_ic_mou_action_phase')

logreg3, y_train_pred, y_train_pred_final = model_training(X_train[cols], y_train, 0.5)
logreg3.pvalues.sort_values(ascending=False).head()

vif_scores = get_vif_score(X_train,cols)
vif_scores[vif_scores['VIF'] > 5].head()

std_ic_mou_good_phase     0.000
isd_ic_mou_good_phase     0.000
isd_ic_mou_action_phase   0.000
std_ic_mou_action_phase   0.000
loc_ic_mou_good_phase     0.000
dtype: float64

# 'total_og_mou_good_phase' feature has very high VIF score. Hence we drop it. 
cols.remove('total_og_mou_good_phase')

logreg4, y_train_pred, y_train_pred_final = model_training(X_train[cols], y_train, 0.5)
logreg4.pvalues.sort_values(ascending=False).head()

vif_scores = get_vif_score(X_train,cols)
vif_scores[vif_scores['VIF'] > 5].head()

std_ic_mou_good_phase     0.000
isd_ic_mou_good_phase     0.000
isd_ic_mou_action_phase   0.000
std_ic_mou_action_phase   0.000
loc_ic_mou_good_phase     0.000
dtype: float64

# Important features contributing to the model. 
feature_imp = (logreg4.params.sort_values(ascending=False, key=abs)).reset_index()
feature_imp = feature_imp.rename(columns={'index':'columns', 0:'imp_score'})
feature_imp

# Let's create columns with different probability cutoffs 
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    y_train_pred_final[i]= y_train_pred_final.Conv_Prob.map(lambda x: 1 if x > i else 0)
y_train_pred_final.head(2)

# calculate accuracy sensitivity and specificity for various probability cutoffs.
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci', 'preci', 'recall'])
for i in numbers:
    cm1 = confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i] )
    total1=sum(sum(cm1))
    accuracy = (cm1[0,0]+cm1[1,1])/total1
    
    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    preci = cm1[1,1]/(cm1[0,1]+cm1[1,1])
    recall = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    cutoff_df.loc[i] = [i, accuracy, sensi, speci, preci, recall]
    
cutoff_df

fig, axs = plt.subplots(1,3, figsize=(25,6))

# plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'], ax=axs[0])
axs[0].set_title('Accuracy Sensitivity Specificity')
axs[0].vlines( ymin=0, ymax=0.9,x=0.537, color="r")

# Plot ROC curve and check the area under curve 
RocCurveDisplay.from_predictions(y_train_pred_final.Converted, y_train_pred_final.Conv_Prob, drop_intermediate=False, ax=axs[1])
axs[1].set_title('ROC curve')

# plot Precision and Recall curve and find the optimal cutoff
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Conv_Prob)
axs[2].plot(thresholds, p[:-1], "b")
axs[2].plot(thresholds, r[:-1], "r")
axs[2].set_title('Precision Recall Curve')
axs[2].vlines( ymin=0, ymax=0.9,x=0.54, color="r")
plt.show();

# the cutoff value from accuracy sensitivity and specificity curve is 0.537
# Apply the cutoff value to see the prediction on train dataset.
y_train_pred_final['final_predicted'] = y_train_pred_final.Conv_Prob.map( lambda x: 1 if x > 0.537 else 0)
y_train_pred_final.head()

# Let's check the overall Metrics.
cf_matrix, train_accuracy, sensitivity, specificity, precision, recall = logreg_metrics_fn(y_train_pred_final)
print(f'Train Accuracy - {round(train_accuracy,3)}\nSensitivity - {round(sensitivity,3)}\nspecificity - {round(specificity,3)}\nPrecision - {round(precision,3)}\nRecall - {round(recall,3)}')

Train Accuracy - 0.808
Sensitivity - 0.843
specificity - 0.773
Precision - 0.788
Recall - 0.843

# the cutoff value from Precision - Recall curve is 0.54
# Apply the cutoff value to see the prediction on train dataset.
y_train_pred_final['final_predicted'] = y_train_pred_final.Conv_Prob.map( lambda x: 1 if x > 0.54 else 0)
y_train_pred_final.head()

# Let's check the overall metrics. 
cf_matrix, train_accuracy,sensitivity, specificity, precision, recall = logreg_metrics_fn(y_train_pred_final)
print(f'Train Accuracy - {round(train_accuracy,3)}\nSensitivity - {round(sensitivity,3)}\nspecificity - {round(specificity,3)}\nPrecision - {round(precision,3)}\nRecall - {round(recall,3)}')

Train Accuracy - 0.808
Sensitivity - 0.843
specificity - 0.773
Precision - 0.788
Recall - 0.843

# Apply the trained Model on the test set
logregp, y_test_pred, y_test_pred_final = model_training(X_test[cols], y_test, 0.54, True, logreg4)

# check the overall metrics for test set
cf_matrix, test_accuracy, sensitivity, specificity, precision, recall = logreg_metrics_fn(y_test_pred_final)
roc_score = roc_auc_score( y_test_pred_final.Converted, y_test_pred_final.Conv_Prob )
f1_sc = f1_score(y_test, y_test_pred_final.predicted)
class_report = classification_report(y_test, y_test_pred_final.predicted)

print(f'Test Accuracy - {round(test_accuracy,3)}\nROC Score - {round(roc_score,3)}\nSensitivity - {round(sensitivity,3)}\nSpecificity - {round(specificity,3)}\nPrecision - {round(precision,3)}\nRecall - {round(recall,3)}')

overall_summary_df = generate_summary_report()
overall_summary_df = generate_summary_report(df=overall_summary_df, model_name="LogisticRegression",class_imb='oversampling', train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score, 
                                             precision=precision, recall=recall, f1_score=f1_sc, classification_rep=class_report, conf_matrix=cf_matrix, feature_imp=feature_imp, step='add')

Test Accuracy - 0.819
ROC Score - 0.872
Sensitivity - 0.821
Specificity - 0.819
Precision - 0.305
Recall - 0.821

# LogisticRegression after PCA and Oversampling
# Check the overall metrics after performing LogisticRegressionCV
def logistic_regression(df_train_pca, df_test_pca, y_train, y_test, overall_summary_df):
    lr_pca = LogisticRegressionCV(cv=10, random_state=100)
    lr_pca.fit(df_train_pca, y_train)
    
    lr_pca_feature_importances = lr_pca.coef_.reshape(-1)
    lr_pca_feature_imp_df = pd.DataFrame({'principal_component': range(len(lr_pca_feature_importances)), 'imp_score': lr_pca_feature_importances})
    
    lr_pca_ytrain_prob = lr_pca.predict_proba(df_train_pca)[:,1]
    lr_pca_ytrain_pred = lr_pca.predict(df_train_pca)

    lr_pca_ytest_prob = lr_pca.predict_proba(df_test_pca)[:,1]
    lr_pca_ytest_pred = lr_pca.predict(df_test_pca)

    _, train_accuracy, _, _, _, _, _, _ = generate_metrics(y_train, yt_pred=lr_pca_ytrain_pred, yt_prob=lr_pca_ytrain_prob)
    metrics_df, test_accuracy, roc_score, precision,recall, f1_sc, class_report, conf_matrix = generate_metrics(y_test, yt_pred=lr_pca_ytest_pred, yt_prob=lr_pca_ytest_prob)
    print(metrics_df)

    overall_summary_df = generate_summary_report(df=overall_summary_df, model_name="LogisticRegressionPCA",class_imb='oversampling', train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score, 
                                                precision=precision, recall=recall, f1_score=f1_sc, classification_rep=class_report, conf_matrix=conf_matrix, feature_imp=lr_pca_feature_imp_df, step='add')

X_train, y_train = X_train_ovr, y_train_ovr 
logistic_regression(df_train_ovr_pca, df_test_ovr_pca, y_train, y_test, overall_summary_df = overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.827      0.892      0.318   0.838     0.461   

                               classification_report  \
0                precision    recall  f1-score   ...   

             confusion_matrix  
0  [[6782, 1428], [129, 665]]

# Reusable function to perform Decision Tree classification and check the overall performance of the model
def decision_tree_classifier(X_train, y_train, X_test, y_test, technique, overall_summary_df):
    dtc = DecisionTreeClassifier(random_state=100, max_depth=6)
    dtc = dtc.fit(X_train, y_train)

    # Get feature importance
    dtc_feature_importances = dtc.feature_importances_
    if X_train.shape[1] == len(X.columns):
        model_name="DecisionTreeClassifier"
        dtc_feature_imp_df = pd.DataFrame({'columns':X.columns, 'imp_score': dtc_feature_importances})
        plt.figure(figsize=(60,8))
        plot_tree(dtc, feature_names=list(X_train.columns), class_names=['No Churn', 'Churn'], filled=True, fontsize=6)
        plt.show();
    else:
        model_name="DecisionTreeClassifierPCA"
        dtc_feature_imp_df = pd.DataFrame({'principal_component': range(len(dtc_feature_importances)), 'imp_score': dtc_feature_importances})
    dtc_feature_imp_df = dtc_feature_imp_df.sort_values(by='imp_score', ascending=False).head(10)

    # Predict on train set 
    y_train_prob_dtc = dtc.predict_proba(X_train)[:, 1]
    y_train_pred_dtc = dtc.predict(X_train)

    # Predict on test set 
    y_test_prob_dtc = dtc.predict_proba(X_test)[:, 1]
    y_test_pred_dtc = dtc.predict(X_test)

    # Check overall performance of the model using various metrics
    _, train_accuracy, _, _, _, _, _, _ = generate_metrics(y_train, yt_pred=y_train_pred_dtc, yt_prob=y_train_prob_dtc)
    metrics_df, test_accuracy, roc_score, precision,recall, f1_sc, class_report, conf_matrix = generate_metrics(y_test, yt_pred=y_test_pred_dtc, yt_prob=y_test_prob_dtc)
    print(metrics_df)
    
    overall_summary_df = generate_summary_report(df=overall_summary_df, model_name=model_name, class_imb=technique, train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score, 
                                                precision=precision, recall=recall, f1_score=f1_sc, classification_rep =class_report, conf_matrix=conf_matrix, feature_imp=dtc_feature_imp_df, step='add')

# Decision Tree after performing oversampling.
X_train, y_train = X_train_ovr, y_train_ovr
decision_tree_classifier(X_train, y_train, X_test, y_test,  technique='oversampling', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.854      0.902      0.359   0.831     0.501   

                               classification_report  \
0                precision    recall  f1-score   ...   

             confusion_matrix  
0  [[7029, 1181], [134, 660]]

# Decision Tree after performing SMOTE. 
X_train, y_train = X_train_smote, y_train_smote
decision_tree_classifier(X_train, y_train, X_test, y_test, technique='smote', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.895      0.911      0.443   0.761     0.560   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7452, 758], [190, 604]]

# Decision Tree after performing ADASYN. 
X_train, y_train = X_train_adasyn, y_train_adasyn
decision_tree_classifier(X_train, y_train, X_test, y_test, technique='adasyn', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.871      0.898      0.388   0.798     0.522   

                               classification_report  \
0                precision    recall  f1-score   ...   

             confusion_matrix  
0  [[7208, 1002], [160, 634]]

# Decision Tree after performing PCA and Oversampling. 
X_train, y_train = X_train_ovr, y_train_ovr 
decision_tree_classifier(df_train_ovr_pca, y_train, df_test_ovr_pca, y_test, technique='oversampling', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.820      0.846      0.300   0.781     0.433   

                               classification_report  \
0                precision    recall  f1-score   ...   

             confusion_matrix  
0  [[6760, 1450], [174, 620]]

# Reusable function to perform RandomForestClassifier and check the overall performance of the model
def random_forest_classifier(X_train, y_train, X_test, y_test, technique, overall_summary_df):
    rf = RandomForestClassifier(random_state=100, oob_score=True, max_depth=6)
    rf.fit(X_train, y_train)

    # Get feature importance
    rf_feature_importances = rf.feature_importances_
    if X_train.shape[1] == len(X.columns):
        model_name="RandomForestClassifier"
        rf_feature_imp_df = pd.DataFrame({'columns':X.columns, 'imp_score': rf_feature_importances})
    else:
        model_name="RandomForestClassifierPCA"
        rf_feature_imp_df = pd.DataFrame({'principal_component': range(len(rf_feature_importances)), 'imp_score': rf_feature_importances})
    rf_feature_imp_df = rf_feature_imp_df.sort_values(by='imp_score', ascending=False).head(10)

    # Predict on Train set
    rf_ytrain_prob = rf.predict_proba(X_train)[:, 1]
    rf_ytrain_pred = rf.predict(X_train)

    # PRedict on Test set
    rf_ytest_prob = rf.predict_proba(X_test)[:, 1]
    rf_ytest_pred = rf.predict(X_test)

    # Check overall performance of the model using various metrics. 
    _, train_accuracy, _, _, _, _, _, _ = generate_metrics(y_train, yt_pred=rf_ytrain_pred, yt_prob=rf_ytrain_prob)
    metrics_df, test_accuracy, roc_score, precision,recall, f1_sc, class_report, conf_matrix = generate_metrics(y_test, yt_pred=rf_ytest_pred, yt_prob=rf_ytest_prob)
    print(metrics_df)

    overall_summary_df = generate_summary_report(df=overall_summary_df, model_name=model_name,class_imb=technique, train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score, 
                                                precision=precision, recall=recall, f1_score=f1_sc, classification_rep=class_report, conf_matrix=conf_matrix, feature_imp=rf_feature_imp_df, step='add')

# Perform random forest after oversampling. 
X_train, y_train = X_train_ovr, y_train_ovr
random_forest_classifier(X_train, y_train, X_test, y_test,  technique='oversampling', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.907      0.934      0.482   0.792     0.600   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7535, 675], [165, 629]]

# Perform random forest after. SMOTE
X_train, y_train = X_train_smote, y_train_smote
random_forest_classifier(X_train, y_train, X_test, y_test, technique='smote', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.903      0.931      0.471   0.802     0.594   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7495, 715], [157, 637]]

# Perform random forest after. ADASYN
X_train, y_train = X_train_adasyn, y_train_adasyn
random_forest_classifier(X_train, y_train, X_test, y_test, technique='adasyn', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.888      0.932      0.429   0.817     0.563   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7347, 863], [145, 649]]

# Perform random forest after PCA and Oversampling
X_train, y_train = X_train_ovr, y_train_ovr 
random_forest_classifier(df_train_ovr_pca, y_train, df_test_ovr_pca, y_test, technique='oversampling', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.874      0.893      0.391   0.768     0.518   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7260, 950], [184, 610]]

# Reusable function to perform GradientBoostingClassifier and check the overall performance of the model
def gradient_boosting_classifier(X_train, y_train, X_test, y_test, technique, overall_summary_df):
    gbc = GradientBoostingClassifier(random_state=100, max_depth=6)
    gbc.fit(X_train, y_train)

    # Get feature importance
    gbc_feature_importances = gbc.feature_importances_
    if X_train.shape[1] == len(X.columns):
        model_name="GradientBoostingClassifier"
        gbc_feature_imp_df = pd.DataFrame({'columns':X.columns, 'imp_score': gbc_feature_importances})
    else:
        model_name="GradientBoostingClassifierPCA"
        gbc_feature_imp_df = pd.DataFrame({'principal_component': range(len(gbc_feature_importances)), 'imp_score': gbc_feature_importances})    
    gbc_feature_imp_df = gbc_feature_imp_df.sort_values(by='imp_score', ascending=False).head(10)

    # Predict on train set. 
    gbc_ytrain_prob = gbc.predict_proba(X_train)[:, 1]
    gbc_ytrain_pred = gbc.predict(X_train)

    # Reddit on test set. 
    gbc_ytest_prob = gbc.predict_proba(X_test)[:, 1]
    gbc_ytest_pred = gbc.predict(X_test)
    
    cvs = cross_val_score(gbc, X_test, y_test, cv=3, scoring='roc_auc')
    print(f'cross_val_score is : {cvs}')

    # Check overall performance of the model using various metrics. 
    _, train_accuracy, _, _, _, _, _, _ = generate_metrics(y_train, yt_pred=gbc_ytrain_pred, yt_prob=gbc_ytrain_prob)
    metrics_df, test_accuracy, roc_score, precision,recall, f1_sc, class_report, conf_matrix = generate_metrics(y_test, yt_pred=gbc_ytest_pred, yt_prob=gbc_ytest_prob)
    print(metrics_df)

    overall_summary_df = generate_summary_report(df=overall_summary_df, model_name=model_name, class_imb=technique, train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score, 
                                                precision=precision, recall=recall, f1_score=f1_sc, classification_rep=class_report, conf_matrix=conf_matrix, feature_imp=gbc_feature_imp_df, step='add')

# Gradient boosting after oversampling
X_train, y_train = X_train_ovr, y_train_ovr
gradient_boosting_classifier(X_train, y_train, X_test, y_test, technique='oversampling', overall_summary_df=overall_summary_df)

cross_val_score is : [0.93851828 0.94362607 0.94864421]
   accuracy  roc_score  precision  recall  f1_score  \
0     0.928      0.945      0.567   0.788     0.660   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7732, 478], [168, 626]]

# Gradient boosting after SMOTE
X_train, y_train = X_train_smote, y_train_smote
gradient_boosting_classifier(X_train, y_train, X_test, y_test, technique='smote', overall_summary_df=overall_summary_df)

cross_val_score is : [0.93851828 0.94362607 0.94864421]
   accuracy  roc_score  precision  recall  f1_score  \
0     0.936      0.942      0.623   0.685     0.653   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7881, 329], [250, 544]]

# Gradient boosting after ADAsyn
X_train, y_train = X_train_adasyn, y_train_adasyn
gradient_boosting_classifier(X_train, y_train, X_test, y_test, technique='adasyn', overall_summary_df=overall_summary_df)

cross_val_score is : [0.93851828 0.94362607 0.94864421]
   accuracy  roc_score  precision  recall  f1_score  \
0     0.934      0.940      0.614   0.689     0.649   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7866, 344], [247, 547]]

# Gradient boosting after PCA and oversampling
X_train, y_train = X_train_ovr, y_train_ovr 
gradient_boosting_classifier(df_train_ovr_pca, y_train, df_test_ovr_pca, y_test, technique='oversampling', overall_summary_df=overall_summary_df)

cross_val_score is : [0.89185653 0.88849354 0.89216733]
   accuracy  roc_score  precision  recall  f1_score  \
0     0.903      0.906      0.467   0.686     0.556   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7589, 621], [249, 545]]

# Reusable function to perform XGBClassifier and check the overall performance of the model
def xgb_classifier(X_train, y_train, X_test, y_test, technique, overall_summary_df):
    xgb_cfl =  xgb.XGBClassifier(scale_pos_weight= 1, objective = 'binary:logistic', random_state= 100, max_depth=6)
    xgb_cfl.fit(X_train, y_train)

    # Get feature importance
    xgb_feature_importances = xgb_cfl.feature_importances_
    if X_train.shape[1] == len(X.columns):
        model_name="XGBClassifier"
        xgb_feature_imp_df = pd.DataFrame({'columns':X.columns, 'imp_score': xgb_feature_importances})
    else:
        model_name="XGBClassifierPCA"
        xgb_feature_imp_df = pd.DataFrame({'principal_component': range(len(xgb_feature_importances)), 'imp_score': xgb_feature_importances}) 
    xgb_feature_imp_df = xgb_feature_imp_df.sort_values(by='imp_score', ascending=False).head(10)

    # Predict on trains set. 
    xgb_ytrain_prob = xgb_cfl.predict_proba(X_train)[:, 1]
    xgb_ytrain_pred = xgb_cfl.predict(X_train)

    # Predict on test set. 
    xgb_ytest_prob = xgb_cfl.predict_proba(X_test)[:, 1]
    xgb_ytest_pred = xgb_cfl.predict(X_test)

    # Check the overall performance of the model using various metrics. 
    _, train_accuracy, _, _, _, _, _, _ = generate_metrics(y_train, yt_pred=xgb_ytrain_pred, yt_prob=xgb_ytrain_prob)
    metrics_df, test_accuracy, roc_score, precision,recall, f1_sc, class_report, conf_matrix = generate_metrics(y_test, yt_pred=xgb_ytest_pred, yt_prob=xgb_ytest_prob)
    print(metrics_df)

    overall_summary_df = generate_summary_report(df=overall_summary_df, model_name=model_name, class_imb=technique, train_accuracy=train_accuracy, test_accuracy=test_accuracy, roc_score=roc_score, 
                                                precision=precision, recall=recall, f1_score=f1_sc, classification_rep=class_report, conf_matrix=conf_matrix, feature_imp=xgb_feature_imp_df, step='add')

# Perform XGBOOST after oversampling
X_train, y_train = X_train_ovr, y_train_ovr
xgb_classifier(X_train, y_train, X_test, y_test,  technique='oversampling', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.938      0.942      0.636   0.688     0.661   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7898, 312], [248, 546]]

# Perform XGBOOST after SMOTE
X_train, y_train = X_train_smote, y_train_smote
xgb_classifier(X_train, y_train, X_test, y_test, technique='smote', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.937      0.939      0.651   0.605     0.627   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7953, 257], [314, 480]]

# Perform XGBOOST after ADASYN
X_train, y_train = X_train_adasyn, y_train_adasyn
xgb_classifier(X_train, y_train, X_test, y_test, technique='adasyn', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.936      0.936      0.650   0.602     0.625   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7953, 257], [316, 478]]

# Perform XGBOOST after PCA and oversampling
X_train, y_train = X_train_ovr, y_train_ovr 
xgb_classifier(df_train_ovr_pca, y_train, df_test_ovr_pca, y_test, technique='oversampling', overall_summary_df=overall_summary_df)

   accuracy  roc_score  precision  recall  f1_score  \
0     0.920      0.898      0.547   0.553     0.550   

                               classification_report  \
0                precision    recall  f1-score   ...   

            confusion_matrix  
0  [[7847, 363], [355, 439]]

# Evaluate the performance metrics of various Model. 
overall_summary_df.sort_values(by=['roc_score', 'test_accuracy', 'train_accuracy'], ascending=[False, True, True])

overall_feature_imp_df = pd.DataFrame(columns=['model','class_imb','feature', 'imp_score'])
for items in zip(overall_summary_df.model_name, overall_summary_df.feature_imp, overall_summary_df.class_imb):
    for val in items[1].values:
        overall_feature_imp_df.loc[len(overall_feature_imp_df)] = items[0],items[2],val[0],val[1]

# Imp features of Top two models
overall_feature_imp_df[(overall_feature_imp_df['model'].eq('GradientBoostingClassifier')) & (overall_feature_imp_df['class_imb'].eq('smote')) | (overall_feature_imp_df['model'].eq('XGBClassifier')) & (overall_feature_imp_df['class_imb'].eq('oversampling'))]

# Plot the Barplot and evaluate the performance of various models.  
overall_summary_df.set_index('model_name').plot.bar(figsize=(30,5))
plt.hlines(xmin=-0.5,xmax=30,y=0.5, color='r',linestyles='--')
plt.xticks(rotation=45)
plt.title('Models Performance Summary')
plt.show();

end_time = ct.time()
total_time = end_time - start_time
print(f'Time taken to complete :{(total_time/60)} mins')

Time taken to complete :16.31866637070974 mins

## Due to computational challenges the optimization was performed with a limited set of parameter combinations, please uncomment and run
# %conda install cupy -y
# import cupy as cp
# from lightgbm import LGBMClassifier

# X_train, y_train =  X_train_ovr.copy(), y_train_ovr.copy()
# 
# # param = {
#     'max_depth' : [4, 5, 6, 7, 8],
#     'learning_rate' : [0.01, 0.1, 1],
#     'n_estimators' : [100, 200, 500],
#     'random_state' :[100]
# }

# xgbc = xgb.XGBClassifier(random_state=100, objective='binary:logistic', eval_metric='logloss', device='cuda') 
# random_search = RandomizedSearchCV(estimator=xgbc, param_distributions=param, n_iter=20, scoring='roc_auc', random_state=100, cv=3, n_jobs=-1)
# random_search.fit(X_train, y_train)

# random_search.best_score_
# random_search.best_params_

# # feature importance
# random_search_feature_importances = random_search.best_estimator_.feature_importances_
# random_search_feature_imp_df = pd.DataFrame({'columns':X.columns, 'imp_score': random_search_feature_importances})
# random_search_feature_imp_df.sort_values(by='imp_score', ascending=False).head(5)

# # Evaluate on test set
# best_model = random_search.best_estimator_
# test_score = best_model.score(X_test, y_test)
# print("Test Set score:", test_score)

# X_train, y_train =  X_train_smote.copy(), y_train_smote.copy()

# param_grid = {
#     'max_depth': [6, 8, 10],   
#     'reg_alpha': [0, 0.1, 0.2],
#     # 'reg_lambda': [0, 0.1, 0.2],    
#     'n_estimators': [100, 200, 300], 
#     'learning_rate': [0.01, 0.1, 0.2]
# }

# lgbm = LGBMClassifier(random_state=100)
# random_search = RandomizedSearchCV(estimator=lgbm, param_distributions=param_grid, n_iter=20, scoring='roc_auc', random_state=100, cv=3, n_jobs=-1)
# random_search.fit(X_train, y_train)

# # Best parameters and best score
# random_search.best_params_
# random_search.best_score_

# # feature importance
# random_search_feature_importances = random_search.best_estimator_.feature_importances_
# random_search_feature_imp_df = pd.DataFrame({'columns':X.columns, 'imp_score': random_search_feature_importances})
# random_search_feature_imp_df.sort_values(by='imp_score', ascending=False).head(5)

# # Evaluate on test set
# best_model = random_search.best_estimator_
# best_model.score(X_test, y_test)
#cross_val_score(best_model, X_test, y_test, cv=5, scoring='roc_auc')

	mobile_number	circle_id	loc_og_t2o_mou	std_og_t2o_mou	loc_ic_t2o_mou	last_date_of_month_6	last_date_of_month_7	last_date_of_month_8	last_date_of_month_9	arpu_6	arpu_7	arpu_8	arpu_9	onnet_mou_6	onnet_mou_7	onnet_mou_8	onnet_mou_9	offnet_mou_6	offnet_mou_7	offnet_mou_8	offnet_mou_9	roam_ic_mou_6	roam_ic_mou_7	roam_ic_mou_8	roam_ic_mou_9	roam_og_mou_6	roam_og_mou_7	roam_og_mou_8	roam_og_mou_9	loc_og_t2t_mou_6	loc_og_t2t_mou_7	loc_og_t2t_mou_8	loc_og_t2t_mou_9	loc_og_t2m_mou_6	loc_og_t2m_mou_7	loc_og_t2m_mou_8	loc_og_t2m_mou_9	loc_og_t2f_mou_6	loc_og_t2f_mou_7	loc_og_t2f_mou_8	loc_og_t2f_mou_9	loc_og_t2c_mou_6	loc_og_t2c_mou_7	loc_og_t2c_mou_8	loc_og_t2c_mou_9	loc_og_mou_6	loc_og_mou_7	loc_og_mou_8	loc_og_mou_9	std_og_t2t_mou_6	std_og_t2t_mou_7	std_og_t2t_mou_8	std_og_t2t_mou_9	std_og_t2m_mou_6	std_og_t2m_mou_7	std_og_t2m_mou_8	std_og_t2m_mou_9	std_og_t2f_mou_6	std_og_t2f_mou_7	std_og_t2f_mou_8	std_og_t2f_mou_9	std_og_t2c_mou_6	std_og_t2c_mou_7	std_og_t2c_mou_8	std_og_t2c_mou_9	std_og_mou_6	std_og_mou_7	std_og_mou_8	std_og_mou_9	isd_og_mou_6	isd_og_mou_7	isd_og_mou_8	isd_og_mou_9	spl_og_mou_6	spl_og_mou_7	spl_og_mou_8	spl_og_mou_9	og_others_6	og_others_7	og_others_8	og_others_9	total_og_mou_6	total_og_mou_7	total_og_mou_8	total_og_mou_9	loc_ic_t2t_mou_6	loc_ic_t2t_mou_7	loc_ic_t2t_mou_8	loc_ic_t2t_mou_9	loc_ic_t2m_mou_6	loc_ic_t2m_mou_7	loc_ic_t2m_mou_8	loc_ic_t2m_mou_9	loc_ic_t2f_mou_6	loc_ic_t2f_mou_7	loc_ic_t2f_mou_8	loc_ic_t2f_mou_9	loc_ic_mou_6	loc_ic_mou_7	loc_ic_mou_8	loc_ic_mou_9	std_ic_t2t_mou_6	std_ic_t2t_mou_7	std_ic_t2t_mou_8	std_ic_t2t_mou_9	std_ic_t2m_mou_6	std_ic_t2m_mou_7	std_ic_t2m_mou_8	std_ic_t2m_mou_9	std_ic_t2f_mou_6	std_ic_t2f_mou_7	std_ic_t2f_mou_8	std_ic_t2f_mou_9	std_ic_t2o_mou_6	std_ic_t2o_mou_7	std_ic_t2o_mou_8	std_ic_t2o_mou_9	std_ic_mou_6	std_ic_mou_7	std_ic_mou_8	std_ic_mou_9	total_ic_mou_6	total_ic_mou_7	total_ic_mou_8	total_ic_mou_9	spl_ic_mou_6	spl_ic_mou_7	spl_ic_mou_8	spl_ic_mou_9	isd_ic_mou_6	isd_ic_mou_7	isd_ic_mou_8	isd_ic_mou_9	ic_others_6	ic_others_7	ic_others_8	ic_others_9	total_rech_num_6	total_rech_num_7	total_rech_num_8	total_rech_num_9	total_rech_amt_6	total_rech_amt_7	total_rech_amt_8	total_rech_amt_9	max_rech_amt_6	max_rech_amt_7	max_rech_amt_8	max_rech_amt_9	date_of_last_rech_6	date_of_last_rech_7	date_of_last_rech_8	date_of_last_rech_9	last_day_rch_amt_6	last_day_rch_amt_7	last_day_rch_amt_8	last_day_rch_amt_9	date_of_last_rech_data_6	date_of_last_rech_data_7	date_of_last_rech_data_8	date_of_last_rech_data_9	total_rech_data_6	total_rech_data_7	total_rech_data_8	total_rech_data_9	max_rech_data_6	max_rech_data_7	max_rech_data_8	max_rech_data_9	count_rech_2g_6	count_rech_2g_7	count_rech_2g_8	count_rech_2g_9	count_rech_3g_6	count_rech_3g_7	count_rech_3g_8	count_rech_3g_9	av_rech_amt_data_6	av_rech_amt_data_7	av_rech_amt_data_8	av_rech_amt_data_9	vol_2g_mb_6	vol_2g_mb_7	vol_2g_mb_8	vol_2g_mb_9	vol_3g_mb_6	vol_3g_mb_7	vol_3g_mb_8	vol_3g_mb_9	arpu_3g_6	arpu_3g_7	arpu_3g_8	arpu_3g_9	arpu_2g_6	arpu_2g_7	arpu_2g_8	arpu_2g_9	night_pck_user_6	night_pck_user_7	night_pck_user_8	night_pck_user_9	monthly_2g_6	monthly_2g_7	monthly_2g_8	monthly_2g_9	sachet_2g_6	sachet_2g_7	sachet_2g_8	sachet_2g_9	monthly_3g_6	monthly_3g_7	monthly_3g_8	monthly_3g_9	sachet_3g_6	sachet_3g_7	sachet_3g_8	sachet_3g_9	fb_user_6	fb_user_7	fb_user_8	fb_user_9	aon	aug_vbc_3g	jul_vbc_3g	jun_vbc_3g	sep_vbc_3g
0	7000842753	109	0.000	0.000	0.000	6/30/2014	7/31/2014	8/31/2014	9/30/2014	197.385	214.816	213.803	21.100	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	0.000	0.000	0.000	0.000	NaN	NaN	0.160	NaN	NaN	NaN	4.130	NaN	NaN	NaN	1.150	NaN	NaN	NaN	5.440	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	0.000	0.000	5.440	0.000	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	NaN	NaN	0.000	NaN	4	3	2	6	362	252	252	0	252	252	252	0	6/21/2014	7/16/2014	8/8/2014	9/28/2014	252	252	252	0	6/21/2014	7/16/2014	8/8/2014	NaN	1.000	1.000	1.000	NaN	252.000	252.000	252.000	NaN	0.000	0.000	0.000	NaN	1.000	1.000	1.000	NaN	252.000	252.000	252.000	NaN	30.130	1.320	5.750	0.000	83.570	150.760	109.610	0.000	212.170	212.170	212.170	NaN	212.170	212.170	212.170	NaN	0.000	0.000	0.000	NaN	0	0	0	0	0	0	0	0	1	1	1	0	0	0	0	0	1.000	1.000	1.000	NaN	968	30.400	0.000	101.200	3.580
1	7001865778	109	0.000	0.000	0.000	6/30/2014	7/31/2014	8/31/2014	9/30/2014	34.047	355.074	268.321	86.285	24.110	78.680	7.680	18.340	15.740	99.840	304.760	53.760	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	23.880	74.560	7.680	18.340	11.510	75.940	291.860	53.760	0.000	0.000	0.000	0.000	0.000	2.910	0.000	0.000	35.390	150.510	299.540	72.110	0.230	4.110	0.000	0.000	0.000	0.460	0.130	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.230	4.580	0.130	0.000	0.000	0.000	0.000	0.000	4.680	23.430	12.760	0.000	0.000	0.000	0.000	0.000	40.310	178.530	312.440	72.110	1.610	29.910	29.230	116.090	17.480	65.380	375.580	56.930	0.000	8.930	3.610	0.000	19.090	104.230	408.430	173.030	0.000	0.000	2.350	0.000	5.900	0.000	12.490	15.010	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	5.900	0.000	14.840	15.010	26.830	104.230	423.280	188.040	0.000	0.000	0.000	0.000	1.830	0.000	0.000	0.000	0.000	0.000	0.000	0.000	4	9	11	5	74	384	283	121	44	154	65	50	6/29/2014	7/31/2014	8/28/2014	9/30/2014	44	23	30	0	NaN	7/25/2014	8/10/2014	NaN	NaN	1.000	2.000	NaN	NaN	154.000	25.000	NaN	NaN	1.000	2.000	NaN	NaN	0.000	0.000	NaN	NaN	154.000	50.000	NaN	0.000	108.070	365.470	0.000	0.000	0.000	0.000	0.000	NaN	0.000	0.000	NaN	NaN	28.610	7.600	NaN	NaN	0.000	0.000	NaN	0	1	0	0	0	0	2	0	0	0	0	0	0	0	0	0	NaN	1.000	1.000	NaN	1006	0.000	0.000	0.000	0.000

	Desc	Var	Value	Perc
0	Constant	circle_id	109	100.000
1	Constant	last_date_of_month_6	6/30/2014	100.000
2	Constant	last_date_of_month_7	7/31/2014	100.000
3	Constant	last_date_of_month_8	8/31/2014	100.000
4	Constant	last_date_of_month_9	9/30/2014	100.000
5	Constant	loc_ic_t2o_mou	0.000	100.000
6	Constant	loc_og_t2o_mou	0.000	100.000
7	Constant	std_ic_t2o_mou_6	0.000	100.000
8	Constant	std_ic_t2o_mou_7	0.000	100.000
9	Constant	std_ic_t2o_mou_8	0.000	100.000
10	Constant	std_ic_t2o_mou_9	0.000	100.000
11	Constant	std_og_t2c_mou_6	0.000	100.000
12	Constant	std_og_t2c_mou_7	0.000	100.000
13	Constant	std_og_t2c_mou_8	0.000	100.000
14	Constant	std_og_t2c_mou_9	0.000	100.000
15	Constant	std_og_t2o_mou	0.000	100.000

	aon	arpu_action_phase	vbc_3g_action_phase	date_of_last_rech_action_phase	ic_others_action_phase	isd_ic_mou_action_phase	isd_og_mou_action_phase	last_day_rch_amt_action_phase	loc_ic_mou_action_phase	loc_ic_t2f_mou_action_phase	loc_ic_t2m_mou_action_phase	loc_ic_t2t_mou_action_phase	loc_og_mou_action_phase	loc_og_t2c_mou_action_phase	loc_og_t2f_mou_action_phase	loc_og_t2m_mou_action_phase	loc_og_t2t_mou_action_phase	max_rech_amt_action_phase	monthly_2g_action_phase	monthly_3g_action_phase	offnet_mou_action_phase	og_others_action_phase	onnet_mou_action_phase	roam_ic_mou_action_phase	roam_og_mou_action_phase	sachet_2g_action_phase	sachet_3g_action_phase	spl_ic_mou_action_phase	spl_og_mou_action_phase	std_ic_mou_action_phase	std_ic_t2f_mou_action_phase	std_ic_t2m_mou_action_phase	std_ic_t2t_mou_action_phase	std_og_mou_action_phase	std_og_t2f_mou_action_phase	std_og_t2m_mou_action_phase	std_og_t2t_mou_action_phase	total_ic_mou_action_phase	total_og_mou_action_phase	total_rech_amt_action_phase	total_rech_num_action_phase	vol_2g_mb_action_phase	vol_3g_mb_action_phase	total_rech_amt_good_phase	churn	arpu_good_phase	date_of_last_rech_good_phase	ic_others_good_phase	isd_ic_mou_good_phase	isd_og_mou_good_phase	vbc_3g_good_phase	last_day_rch_amt_good_phase	loc_ic_mou_good_phase	loc_ic_t2f_mou_good_phase	loc_ic_t2m_mou_good_phase	loc_ic_t2t_mou_good_phase	loc_og_mou_good_phase	loc_og_t2c_mou_good_phase	loc_og_t2f_mou_good_phase	loc_og_t2m_mou_good_phase	loc_og_t2t_mou_good_phase	max_rech_amt_good_phase	monthly_2g_good_phase	monthly_3g_good_phase	offnet_mou_good_phase	og_others_good_phase	onnet_mou_good_phase	roam_ic_mou_good_phase	roam_og_mou_good_phase	sachet_2g_good_phase	sachet_3g_good_phase	spl_ic_mou_good_phase	spl_og_mou_good_phase	std_ic_mou_good_phase	std_ic_t2f_mou_good_phase	std_ic_t2m_mou_good_phase	std_ic_t2t_mou_good_phase	std_og_mou_good_phase	std_og_t2f_mou_good_phase	std_og_t2m_mou_good_phase	std_og_t2t_mou_good_phase	total_ic_mou_good_phase	total_og_mou_good_phase	total_rech_num_good_phase	vol_2g_mb_good_phase	vol_3g_mb_good_phase
count	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000	30011.000
mean	1248.437	534.857	129.440	26.911	1.238	11.701	2.029	95.653	237.214	14.736	155.449	66.438	263.226	1.713	6.494	168.853	87.507	162.869	0.114	0.173	377.902	0.059	268.612	13.256	21.469	0.608	0.140	0.028	6.885	48.355	2.681	30.498	15.052	353.884	1.790	179.672	172.196	295.427	623.775	613.639	10.225	69.209	269.864	696.664	0.088	588.210	27.164	1.333	11.758	2.236	128.244	104.886	246.759	16.062	160.799	69.692	284.192	1.715	7.038	181.696	95.327	173.538	0.128	0.180	421.899	0.370	300.533	15.467	25.679	0.544	0.144	0.042	6.671	52.307	2.934	32.767	16.562	405.214	2.023	208.294	194.815	311.194	697.911	12.017	78.515	268.243
std	940.230	492.260	390.479	5.038	12.890	74.929	44.795	145.260	305.235	43.705	215.972	155.515	365.285	7.398	20.220	234.066	234.595	172.606	0.357	0.583	475.492	3.320	466.014	74.552	106.245	1.844	0.975	0.117	22.893	140.906	20.270	105.291	72.433	616.036	11.700	416.709	410.034	360.343	685.983	601.822	9.479	268.494	859.299	488.782	0.284	409.006	3.374	13.397	67.200	44.785	366.111	115.078	298.251	44.592	208.405	153.873	357.301	6.703	21.500	230.603	229.182	153.504	0.337	0.540	440.113	1.845	436.837	67.369	94.737	1.668	0.852	0.152	18.351	134.978	18.804	95.183	76.238	574.891	11.885	395.221	388.715	345.411	610.373	8.730	254.201	794.962
min	180.000	-945.808	0.000	1.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	368.500	0.000	-749.783	1.500	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	9.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.500	0.000	0.000
25%	480.000	289.610	0.000	25.000	0.000	0.000	0.000	0.000	60.030	0.000	35.890	8.570	48.005	0.000	0.000	28.480	7.760	67.000	0.000	0.000	95.695	0.000	32.360	0.000	0.000	0.000	0.000	0.000	0.000	2.280	0.000	0.310	0.000	3.730	0.000	0.660	0.000	78.930	188.790	309.000	4.000	0.000	0.000	450.000	0.000	381.272	26.000	0.000	0.000	0.000	0.000	40.000	68.078	0.255	41.500	11.550	63.700	0.000	0.000	37.975	11.760	110.000	0.000	0.000	155.050	0.000	48.625	0.000	0.000	0.000	0.000	0.000	0.000	5.000	0.000	1.748	0.000	10.227	0.000	3.792	0.000	103.058	312.782	6.500	0.000	0.000
50%	914.000	452.091	0.000	28.000	0.000	0.000	0.000	50.000	142.840	1.850	89.810	26.840	148.280	0.000	0.160	89.810	28.640	130.000	0.000	0.000	240.940	0.000	99.440	0.000	0.000	0.000	0.000	0.000	0.490	12.560	0.000	5.710	0.560	72.890	0.000	23.660	5.930	193.440	435.330	520.000	8.000	0.000	0.000	568.500	0.000	485.603	28.000	0.000	0.000	0.000	0.000	80.000	156.585	2.820	98.535	31.725	175.655	0.000	0.855	107.310	35.435	124.000	0.000	0.000	299.535	0.000	136.895	0.000	0.000	0.000	0.000	0.000	1.730	18.295	0.000	9.205	2.340	165.200	0.000	46.985	18.895	211.185	547.010	9.500	0.000	0.000
75%	1924.000	671.150	1.600	30.000	0.060	0.000	0.000	130.000	302.110	11.605	196.975	70.330	348.310	0.050	4.680	223.590	84.670	198.000	0.000	0.000	482.610	0.000	297.735	0.000	0.000	0.000	0.000	0.000	6.380	43.410	0.130	25.330	8.860	481.030	0.000	164.725	132.820	380.410	833.100	790.000	13.000	9.620	0.000	795.500	0.000	674.492	29.500	0.250	0.802	0.000	25.562	124.000	315.910	13.460	202.500	74.570	375.308	1.040	5.625	242.270	92.980	200.000	0.000	0.000	532.495	0.000	380.250	3.055	7.270	0.000	0.000	0.000	7.033	50.077	0.740	29.870	11.810	620.480	0.000	237.115	211.520	395.233	896.840	15.000	29.330	99.697
max	3179.000	33543.624	12916.220	31.000	1209.860	4100.380	5681.540	4449.000	5748.810	1588.530	5738.460	4003.210	11039.910	351.830	928.490	4961.330	10752.560	4449.000	5.000	16.000	14007.340	394.930	10752.560	4169.810	5337.040	44.000	41.000	6.230	1390.880	5957.140	1394.890	5645.860	4309.290	13980.060	516.910	13950.040	8014.430	5990.710	14043.060	45320.000	196.000	11117.610	30036.060	37762.500	1.000	31438.461	30.500	1420.040	3811.385	5695.470	8190.585	3100.000	5939.095	1358.435	4432.685	5716.160	6582.420	420.575	1331.230	4150.165	5941.565	3299.000	4.500	11.500	8314.795	185.065	7331.060	3060.600	2410.835	38.000	27.000	16.610	1144.500	5630.410	1185.045	3122.585	5630.245	8856.235	382.920	7857.410	7308.050	6266.145	9347.210	155.500	7939.075	36667.845

	aon	arpu_action_phase	vbc_3g_action_phase	date_of_last_rech_action_phase	ic_others_action_phase	isd_ic_mou_action_phase	isd_og_mou_action_phase	last_day_rch_amt_action_phase	loc_ic_mou_action_phase	loc_ic_t2f_mou_action_phase	loc_ic_t2m_mou_action_phase	loc_ic_t2t_mou_action_phase	loc_og_mou_action_phase	loc_og_t2c_mou_action_phase	loc_og_t2f_mou_action_phase	loc_og_t2m_mou_action_phase	loc_og_t2t_mou_action_phase	max_rech_amt_action_phase	monthly_2g_action_phase	monthly_3g_action_phase	offnet_mou_action_phase	og_others_action_phase	onnet_mou_action_phase	roam_ic_mou_action_phase	roam_og_mou_action_phase	sachet_2g_action_phase	sachet_3g_action_phase	spl_ic_mou_action_phase	spl_og_mou_action_phase	std_ic_mou_action_phase	std_ic_t2f_mou_action_phase	std_ic_t2m_mou_action_phase	std_ic_t2t_mou_action_phase	std_og_mou_action_phase	std_og_t2f_mou_action_phase	std_og_t2m_mou_action_phase	std_og_t2t_mou_action_phase	total_ic_mou_action_phase	total_og_mou_action_phase	total_rech_amt_action_phase	total_rech_num_action_phase	vol_2g_mb_action_phase	vol_3g_mb_action_phase	total_rech_amt_good_phase	arpu_good_phase	date_of_last_rech_good_phase	ic_others_good_phase	isd_ic_mou_good_phase	isd_og_mou_good_phase	vbc_3g_good_phase	last_day_rch_amt_good_phase	loc_ic_mou_good_phase	loc_ic_t2f_mou_good_phase	loc_ic_t2m_mou_good_phase	loc_ic_t2t_mou_good_phase	loc_og_mou_good_phase	loc_og_t2c_mou_good_phase	loc_og_t2f_mou_good_phase	loc_og_t2m_mou_good_phase	loc_og_t2t_mou_good_phase	max_rech_amt_good_phase	monthly_2g_good_phase	monthly_3g_good_phase	offnet_mou_good_phase	og_others_good_phase	onnet_mou_good_phase	roam_ic_mou_good_phase	roam_og_mou_good_phase	sachet_2g_good_phase	sachet_3g_good_phase	spl_ic_mou_good_phase	spl_og_mou_good_phase	std_ic_mou_good_phase	std_ic_t2f_mou_good_phase	std_ic_t2m_mou_good_phase	std_ic_t2t_mou_good_phase	std_og_mou_good_phase	std_og_t2f_mou_good_phase	std_og_t2m_mou_good_phase	std_og_t2t_mou_good_phase	total_ic_mou_good_phase	total_og_mou_good_phase	total_rech_num_good_phase	vol_2g_mb_good_phase	vol_3g_mb_good_phase
39561	-0.436	0.145	-0.325	0.807	-0.074	-0.159	-0.044	-0.321	-0.282	-0.333	-0.049	-0.387	-0.573	-0.229	-0.308	-0.598	-0.276	-0.193	-0.318	-0.298	1.182	0.880	0.156	-0.183	-0.202	-0.327	-0.146	-0.247	-0.276	1.954	-0.141	2.210	0.612	1.432	-0.151	1.741	0.372	0.505	0.961	0.231	0.817	-0.254	-0.310	-0.232	-0.250	0.842	-0.005	-0.173	-0.050	-0.345	-0.384	-0.626	-0.360	-0.540	-0.383	-0.658	-0.243	-0.312	-0.646	-0.349	-0.606	-0.380	-0.333	0.431	1.407	0.266	-0.235	-0.269	-0.321	-0.178	-0.252	-0.343	0.563	-0.168	0.729	0.131	1.003	-0.173	0.935	0.541	-0.343	0.544	0.279	-0.312	-0.334
70763	-0.647	0.260	-0.074	-1.959	-0.092	1.523	-0.027	3.223	4.986	5.083	6.201	-0.214	-0.059	-0.229	-0.269	0.279	-0.327	2.277	-0.318	-0.298	-0.294	-0.018	-0.559	-0.183	-0.202	-0.327	-0.146	-0.247	-0.291	-0.344	-0.141	-0.290	-0.208	-0.570	-0.151	-0.422	-0.425	4.409	-0.549	0.127	-0.863	-0.254	-0.310	1.018	0.990	-0.343	-0.095	-0.144	-0.036	0.008	3.423	9.640	9.116	12.141	-0.087	1.598	-0.244	-0.242	2.644	-0.110	2.139	-0.380	-0.333	0.882	-0.184	-0.531	-0.235	-0.269	-0.321	0.481	-0.252	-0.298	0.661	-0.168	1.150	-0.207	-0.674	-0.173	-0.481	-0.507	8.505	0.299	-0.456	-0.310	-0.252
63600	-0.939	0.244	-0.325	-0.181	0.129	-0.159	-0.044	0.246	-0.715	-0.333	-0.648	-0.412	-0.684	-0.214	-0.308	-0.700	-0.348	-0.193	-0.318	-0.298	-0.749	-0.018	0.307	-0.183	-0.202	-0.327	-0.146	-0.247	-0.287	0.252	-0.141	-0.081	0.652	0.114	-0.151	-0.393	0.589	-0.526	-0.279	0.220	-0.338	-0.254	-0.310	0.199	0.212	0.102	-0.095	-0.173	-0.050	-0.345	0.125	-0.692	-0.359	-0.618	-0.407	-0.772	-0.203	-0.312	-0.775	-0.399	-0.350	-0.380	-0.333	-0.849	-0.184	0.166	0.063	-0.103	-0.321	-0.178	-0.252	-0.315	-0.003	-0.168	-0.095	0.146	-0.011	-0.173	-0.423	0.426	-0.631	-0.479	-0.117	-0.312	-0.334
5725	-0.460	-1.121	-0.325	-3.935	-0.092	-0.159	-0.044	-0.513	-0.786	-0.333	-0.734	-0.433	-0.703	-0.229	-0.308	-0.721	-0.357	-0.823	-0.318	-0.298	-0.780	-0.018	-0.570	-0.145	-0.136	-0.327	-0.146	-0.247	-0.291	-0.344	-0.141	-0.290	-0.208	-0.574	-0.151	-0.428	-0.425	-0.830	-0.903	-1.089	-0.968	-0.254	-0.310	1.458	1.488	0.102	-0.088	-0.173	-0.050	-0.213	-0.427	-0.561	-0.327	-0.490	-0.334	-0.451	-0.244	-0.312	-0.365	-0.307	0.146	-0.380	-0.333	0.334	-0.184	3.832	-0.235	-0.079	-0.321	0.481	-0.252	0.051	0.173	-0.168	-0.056	0.403	3.455	-0.173	0.642	4.511	-0.447	2.972	1.581	-0.273	-0.208
67497	-0.444	2.176	-0.325	0.807	-0.092	-0.159	-0.044	0.246	3.287	21.541	0.383	-0.265	2.203	0.506	43.720	-0.121	-0.265	-0.076	-0.318	-0.298	1.448	-0.018	-0.527	-0.183	-0.202	-0.327	-0.146	-0.247	0.011	-0.339	-0.141	-0.285	-0.204	-0.574	-0.151	-0.428	-0.425	2.625	0.682	2.365	2.391	-0.254	-0.310	2.316	2.364	0.398	-0.095	-0.173	-0.050	-0.345	-0.541	4.036	25.938	0.418	-0.253	1.431	-0.244	31.199	-0.414	-0.394	1.437	-0.380	-0.333	0.848	-0.184	-0.683	-0.235	-0.269	-0.321	-0.178	0.709	-0.312	-0.374	-0.168	-0.340	-0.201	-0.709	-0.173	-0.531	-0.507	3.277	0.166	1.638	-0.312	-0.334

	Converted	Conv_Prob	ID	predicted	0.000	0.100	0.200	0.300	0.400	0.500	0.600	0.700	0.800	0.900
0	0	0.146	0	0	1	1	0	0	0	0	0	0	0	0
1	0	0.060	1	0	1	0	0	0	0	0	0	0	0	0

	aon	arpu_6	arpu_7	arpu_8	arpu_9	aug_vbc_3g	date_of_last_rech_6	date_of_last_rech_7	date_of_last_rech_8	date_of_last_rech_9	ic_others_6	ic_others_7	ic_others_8	ic_others_9	isd_ic_mou_6	isd_ic_mou_7	isd_ic_mou_8	isd_ic_mou_9	isd_og_mou_6	isd_og_mou_7	isd_og_mou_8	isd_og_mou_9	jul_vbc_3g	jun_vbc_3g	last_day_rch_amt_6	last_day_rch_amt_7	last_day_rch_amt_8	last_day_rch_amt_9	loc_ic_mou_6	loc_ic_mou_7	loc_ic_mou_8	loc_ic_mou_9	loc_ic_t2f_mou_6	loc_ic_t2f_mou_7	loc_ic_t2f_mou_8	loc_ic_t2f_mou_9	loc_ic_t2m_mou_6	loc_ic_t2m_mou_7	loc_ic_t2m_mou_8	loc_ic_t2m_mou_9	loc_ic_t2t_mou_6	loc_ic_t2t_mou_7	loc_ic_t2t_mou_8	loc_ic_t2t_mou_9	loc_og_mou_6	loc_og_mou_7	loc_og_mou_8	loc_og_mou_9	loc_og_t2c_mou_6	loc_og_t2c_mou_7	loc_og_t2c_mou_8	loc_og_t2c_mou_9	loc_og_t2f_mou_6	loc_og_t2f_mou_7	loc_og_t2f_mou_8	loc_og_t2f_mou_9	loc_og_t2m_mou_6	loc_og_t2m_mou_7	loc_og_t2m_mou_8	loc_og_t2m_mou_9	loc_og_t2t_mou_6	loc_og_t2t_mou_7	loc_og_t2t_mou_8	loc_og_t2t_mou_9	max_rech_amt_6	max_rech_amt_7	max_rech_amt_8	max_rech_amt_9	mobile_number	monthly_2g_6	monthly_2g_7	monthly_2g_8	monthly_2g_9	monthly_3g_6	monthly_3g_7	monthly_3g_8	monthly_3g_9	offnet_mou_6	offnet_mou_7	offnet_mou_8	offnet_mou_9	og_others_6	og_others_7	og_others_8	og_others_9	onnet_mou_6	onnet_mou_7	onnet_mou_8	onnet_mou_9	roam_ic_mou_6	roam_ic_mou_7	roam_ic_mou_8	roam_ic_mou_9	roam_og_mou_6	roam_og_mou_7	roam_og_mou_8	roam_og_mou_9	sachet_2g_6	sachet_2g_7	sachet_2g_8	sachet_2g_9	sachet_3g_6	sachet_3g_7	sachet_3g_8	sachet_3g_9	sep_vbc_3g	spl_ic_mou_6	spl_ic_mou_7	spl_ic_mou_8	spl_ic_mou_9	spl_og_mou_6	spl_og_mou_7	spl_og_mou_8	spl_og_mou_9	std_ic_mou_6	std_ic_mou_7	std_ic_mou_8	std_ic_mou_9	std_ic_t2f_mou_6	std_ic_t2f_mou_7	std_ic_t2f_mou_8	std_ic_t2f_mou_9	std_ic_t2m_mou_6	std_ic_t2m_mou_7	std_ic_t2m_mou_8	std_ic_t2m_mou_9	std_ic_t2t_mou_6	std_ic_t2t_mou_7	std_ic_t2t_mou_8	std_ic_t2t_mou_9	std_og_mou_6	std_og_mou_7	std_og_mou_8	std_og_mou_9	std_og_t2f_mou_6	std_og_t2f_mou_7	std_og_t2f_mou_8	std_og_t2f_mou_9	std_og_t2m_mou_6	std_og_t2m_mou_7	std_og_t2m_mou_8	std_og_t2m_mou_9	std_og_t2t_mou_6	std_og_t2t_mou_7	std_og_t2t_mou_8	std_og_t2t_mou_9	total_ic_mou_6	total_ic_mou_7	total_ic_mou_8	total_ic_mou_9	total_og_mou_6	total_og_mou_7	total_og_mou_8	total_og_mou_9	total_rech_amt_6	total_rech_amt_7	total_rech_amt_8	total_rech_amt_9	total_rech_num_6	total_rech_num_7	total_rech_num_8	total_rech_num_9	vol_2g_mb_6	vol_2g_mb_7	vol_2g_mb_8	vol_2g_mb_9	vol_3g_mb_6	vol_3g_mb_7	vol_3g_mb_8	vol_3g_mb_9
0	968.000	197.385	214.816	213.803	21.100	30.400	21.000	16.000	8.000	28.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	101.200	252.000	252.000	252.000	0.000	92.160	92.550	5.440	91.640	0.880	0.930	1.150	0.960	56.490	57.080	4.130	56.610	15.690	15.740	0.160	15.660	65.110	63.685	0.000	61.840	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	41.030	40.430	0.000	39.120	11.910	11.610	0.000	11.260	252.000	252.000	252.000	0.000	7000842753.000	0.000	0.000	0.000	0.000	1.000	1.000	1.000	0.000	96.310	91.735	0.000	87.290	0.000	0.000	0.000	0.000	34.310	32.330	0.000	29.840	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	3.580	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	5.890	5.960	0.000	5.380	0.000	0.000	0.000	0.000	2.030	2.040	0.000	1.740	0.000	0.000	0.000	0.000	11.640	11.090	0.000	8.410	0.000	0.000	0.000	0.000	3.950	3.635	0.000	2.500	0.000	0.000	0.000	0.000	0.000	0.000	5.440	0.000	0.000	0.000	0.000	0.000	362.000	252.000	252.000	0.000	4.000	3.000	2.000	6.000	30.130	1.320	5.750	0.000	83.570	150.760	109.610	0.000
1	1006.000	34.047	355.074	268.321	86.285	0.000	29.000	31.000	28.000	30.000	0.000	0.000	0.000	0.000	1.830	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	44.000	23.000	30.000	0.000	19.090	104.230	408.430	173.030	0.000	8.930	3.610	0.000	17.480	65.380	375.580	56.930	1.610	29.910	29.230	116.090	35.390	150.510	299.540	72.110	0.000	2.910	0.000	0.000	0.000	0.000	0.000	0.000	11.510	75.940	291.860	53.760	23.880	74.560	7.680	18.340	44.000	154.000	65.000	50.000	7001865778.000	0.000	1.000	0.000	0.000	0.000	0.000	0.000	0.000	15.740	99.840	304.760	53.760	0.000	0.000	0.000	0.000	24.110	78.680	7.680	18.340	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	2.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	4.680	23.430	12.760	0.000	5.900	0.000	14.840	15.010	0.000	0.000	0.000	0.000	5.900	0.000	12.490	15.010	0.000	0.000	2.350	0.000	0.230	4.580	0.130	0.000	0.000	0.000	0.000	0.000	0.000	0.460	0.130	0.000	0.230	4.110	0.000	0.000	26.830	104.230	423.280	188.040	40.310	178.530	312.440	72.110	74.000	384.000	283.000	121.000	4.000	9.000	11.000	5.000	0.000	108.070	365.470	0.000	0.000	0.000	0.000	0.000
2	1103.000	167.690	189.058	210.226	290.714	0.000	17.000	24.000	14.000	29.000	0.930	3.140	0.000	0.360	0.000	0.850	0.000	0.010	0.000	0.000	0.000	0.000	0.000	4.170	0.000	200.000	86.000	0.000	229.560	208.860	155.990	345.410	99.480	122.290	49.630	158.190	14.380	15.440	38.890	38.980	115.690	71.110	67.460	148.230	60.660	67.410	67.660	64.810	0.000	135.540	45.760	0.480	24.110	21.790	15.610	22.240	29.340	16.860	38.460	28.160	7.190	28.740	13.580	14.390	86.000	200.000	86.000	100.000	7001625959.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	143.330	220.590	208.360	118.910	0.450	0.000	0.000	0.000	11.540	55.240	37.260	74.810	0.000	0.000	0.000	38.490	0.000	0.000	0.000	70.940	0.000	0.000	0.000	1.000	0.000	0.000	0.000	0.000	0.000	0.210	0.000	0.000	0.450	46.560	236.840	96.840	42.080	139.330	306.660	239.030	171.490	21.730	58.340	43.230	3.860	45.180	177.010	167.090	118.180	72.410	71.290	28.690	49.440	47.640	108.680	120.940	18.040	1.480	14.760	22.830	0.000	41.810	67.410	75.530	9.280	4.340	26.490	22.580	8.760	370.040	519.530	395.030	517.740	155.330	412.940	285.460	124.940	168.000	315.000	116.000	358.000	5.000	4.000	2.000	7.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	8.420
3	2491.000	221.338	251.102	508.054	389.500	0.000	28.000	31.000	31.000	30.000	0.000	0.000	0.000	0.800	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	30.000	50.000	50.000	30.000	233.480	111.590	48.180	160.190	57.430	27.090	19.840	65.590	113.960	64.510	20.280	52.860	62.080	19.980	8.040	41.730	183.030	118.680	37.990	83.030	0.000	0.000	0.000	0.000	1.910	0.650	4.910	2.060	107.430	83.210	22.460	65.460	73.680	34.810	10.610	15.490	60.000	50.000	50.000	50.000	7001204172.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	123.310	109.010	71.680	113.540	0.000	0.000	0.000	0.000	99.910	54.390	310.980	241.710	0.000	54.860	44.380	0.000	0.000	28.090	39.040	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.590	0.000	0.000	0.550	10.960	0.000	18.090	43.290	45.990	105.010	4.940	143.830	1.180	0.000	0.000	0.000	1.330	38.560	4.940	13.980	43.480	66.440	0.000	129.840	29.230	16.630	296.110	236.210	0.000	0.000	0.000	0.000	2.990	1.730	6.530	9.990	26.230	14.890	289.580	226.210	280.080	216.610	53.130	305.380	223.230	135.310	352.210	362.540	230.000	310.000	601.000	410.000	10.000	11.000	18.000	14.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000
4	1526.000	261.636	309.876	238.174	163.426	0.000	26.000	28.000	9.000	28.000	0.480	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	50.000	110.000	110.000	50.000	214.030	198.040	337.940	202.810	1.500	0.000	0.000	0.000	106.840	109.540	104.130	48.240	105.680	88.490	233.810	154.560	117.960	241.330	208.160	98.610	0.000	0.000	0.000	0.000	0.000	0.000	0.000	1.930	67.640	91.880	124.260	37.890	50.310	149.440	83.890	58.780	56.000	110.000	110.000	50.000	7000142493.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	76.960	91.880	124.260	45.810	0.000	0.000	0.000	0.000	50.310	149.440	83.890	58.780	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	1.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.180	0.000	0.000	0.000	5.980	1.930	0.250	0.860	2.310	0.000	0.000	0.000	0.000	1.930	0.250	0.000	0.000	0.000	0.000	0.860	2.310	9.310	0.000	0.000	0.000	0.000	0.000	0.000	0.000	9.310	0.000	0.000	0.000	0.000	0.000	0.000	0.000	216.440	198.290	338.810	205.310	127.280	241.330	208.160	104.590	196.000	350.000	287.000	200.000	5.000	6.000	3.000	4.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
99994	3199.000	18.471	69.161	57.530	29.950	0.000	3.000	20.000	26.000	25.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	60.000	0.000	0.000	0.000	93.090	72.880	160.830	154.430	17.450	5.160	43.790	49.460	47.960	46.930	76.080	82.890	27.680	20.780	40.940	22.060	18.180	57.830	55.490	27.030	0.000	0.000	3.210	0.110	0.000	0.000	0.000	2.250	12.780	54.460	49.580	24.780	5.400	3.360	5.910	0.000	60.000	110.000	0.000	50.000	7001548952.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	15.190	54.460	52.790	27.140	0.000	0.000	0.000	0.000	5.400	3.360	5.910	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	2.410	3.010	3.210	0.110	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	93.090	72.880	160.830	154.430	20.590	60.840	58.710	27.140	60.000	110.000	0.000	50.000	2.000	3.000	2.000	2.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000
99995	2052.000	112.201	77.811	79.081	140.835	0.000	17.000	17.000	24.000	26.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	149.000	30.000	51.760	69.880	66.930	107.610	2.160	8.730	4.180	18.440	37.380	57.510	50.980	63.440	12.210	3.630	11.760	25.710	121.990	75.480	79.410	99.490	0.000	0.000	0.000	0.000	1.710	9.810	5.830	3.280	91.340	47.530	57.510	46.710	28.930	18.130	16.060	49.490	110.000	110.000	149.000	30.000	7000607688.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	100.830	69.010	66.390	56.130	0.000	0.000	0.000	0.000	29.260	18.130	16.060	49.490	0.000	0.000	0.000	0.950	0.000	0.000	0.000	4.230	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	3.050	5.010	4.340	11.490	0.880	1.230	0.000	0.000	0.000	0.000	4.340	11.490	0.880	1.230	0.000	0.000	0.000	0.000	8.090	11.660	0.000	1.900	2.930	4.810	0.000	0.000	4.830	6.840	0.000	1.900	0.330	0.000	0.000	0.000	56.110	81.380	67.810	108.840	130.090	87.140	82.460	106.410	110.000	110.000	182.000	30.000	5.000	3.000	2.000	1.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000
99996	185.000	229.187	0.000	0.000	0.000	0.000	13.000	26.000	27.000	2.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	30.000	0.000	0.000	0.000	473.690	92.550	93.830	91.640	0.000	0.930	0.930	0.960	380.680	57.080	58.240	56.610	93.010	15.740	16.030	15.660	14.440	63.685	63.730	61.840	4.710	0.000	0.000	0.000	0.000	0.000	0.000	0.000	13.330	40.430	40.360	39.120	1.110	11.610	11.730	11.260	50.000	0.000	0.000	0.000	7000087541.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	21.040	91.735	92.140	87.290	2.410	0.000	0.000	0.000	1.110	32.330	32.360	29.840	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	2.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	8.280	0.000	0.000	0.000	0.000	5.960	5.880	5.380	0.000	0.000	0.000	0.000	0.000	2.040	2.030	1.740	0.000	0.000	0.000	0.000	0.000	11.090	10.410	8.410	0.000	0.000	0.000	0.000	0.000	3.635	3.310	2.500	0.000	0.000	0.000	0.000	473.690	0.000	0.000	0.000	25.140	0.000	0.000	0.000	119.000	0.000	0.000	0.000	5.000	2.000	0.000	1.000	0.210	0.000	0.000	0.000	275.850	0.000	0.000	0.000
99997	478.000	322.991	303.386	606.817	731.010	1445.740	17.000	19.000	20.000	17.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	1151.030	1173.180	202.000	179.000	179.000	179.000	0.810	11.090	1.710	2.210	0.000	0.000	0.000	0.000	0.810	10.390	1.710	2.210	0.000	0.700	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	202.000	179.000	179.000	252.000	7000498689.000	0.000	0.000	0.000	0.000	1.000	2.000	4.000	4.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	1.000	0.000	0.000	0.000	1.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.580	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.580	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.810	11.090	2.290	2.210	0.000	0.000	0.000	0.000	381.000	358.000	716.000	862.000	3.000	4.000	4.000	6.000	76.510	241.770	136.470	85.670	1453.630	1382.080	2683.300	4177.140
99998	751.000	687.065	0.000	0.000	0.000	0.000	16.000	27.000	27.000	28.000	2.300	0.000	0.000	0.000	382.480	0.000	0.000	0.000	17.360	0.000	0.000	0.000	0.000	0.000	175.000	0.000	0.000	0.000	398.940	92.550	93.830	91.640	80.090	0.930	0.930	0.960	190.610	57.080	58.240	56.610	128.230	15.740	16.030	15.660	250.810	63.685	63.730	61.840	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	166.460	40.430	40.360	39.120	84.340	11.610	11.730	11.260	175.000	0.000	0.000	0.000	7001905007.000	1.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	166.460	91.735	92.140	87.290	0.560	0.000	0.000	0.000	84.340	32.330	32.360	29.840	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	1.180	0.000	0.000	0.000	4.990	5.960	5.880	5.380	0.000	0.000	0.000	0.000	4.990	2.040	2.030	1.740	0.000	0.000	0.000	0.000	0.000	11.090	10.410	8.410	0.000	0.000	0.000	0.000	0.000	3.635	3.310	2.500	0.000	0.000	0.000	0.000	788.730	0.000	0.000	0.000	269.930	0.000	0.000	0.000	710.000	0.000	0.000	0.000	5.000	0.000	0.000	0.000	26.110	0.000	0.000	0.000	152.700	0.000	0.000	0.000

	level_0	level_1	0
3868	arpu_good_phase	total_rech_amt_good_phase	0.974
3699	total_rech_amt_good_phase	arpu_good_phase	0.974
3316	total_rech_amt_action_phase	arpu_action_phase	0.954
123	arpu_action_phase	total_rech_amt_action_phase	0.954
558	isd_og_mou_action_phase	isd_og_mou_good_phase	0.949
4171	isd_og_mou_good_phase	isd_og_mou_action_phase	0.949
6937	total_ic_mou_good_phase	loc_ic_mou_good_phase	0.892
4500	loc_ic_mou_good_phase	total_ic_mou_good_phase	0.892
3153	total_ic_mou_action_phase	loc_ic_mou_action_phase	0.887
716	loc_ic_mou_action_phase	total_ic_mou_action_phase	0.887
1905	onnet_mou_action_phase	std_og_t2t_mou_action_phase	0.857
3082	std_og_t2t_mou_action_phase	onnet_mou_action_phase	0.857
858	loc_ic_t2m_mou_action_phase	loc_ic_mou_action_phase	0.852
689	loc_ic_mou_action_phase	loc_ic_t2m_mou_action_phase	0.852
1734	offnet_mou_action_phase	std_og_t2m_mou_action_phase	0.850
2995	std_og_t2m_mou_action_phase	offnet_mou_action_phase	0.850
5689	onnet_mou_good_phase	std_og_t2t_mou_good_phase	0.847
6866	std_og_t2t_mou_good_phase	onnet_mou_good_phase	0.847
4473	loc_ic_mou_good_phase	loc_ic_t2m_mou_good_phase	0.846
4642	loc_ic_t2m_mou_good_phase	loc_ic_mou_good_phase	0.846

	feature	rank	support
47	isd_ic_mou_good_phase	1	True
63	offnet_mou_good_phase	1	True
37	total_ic_mou_action_phase	1	True
12	loc_og_mou_action_phase	1	True
72	std_ic_mou_good_phase	1	True
38	total_og_mou_action_phase	1	True
27	spl_ic_mou_action_phase	1	True
65	onnet_mou_good_phase	1	True
8	loc_ic_mou_action_phase	1	True
80	total_ic_mou_good_phase	1	True
5	isd_ic_mou_action_phase	1	True
19	monthly_3g_action_phase	1	True
81	total_og_mou_good_phase	1	True
51	loc_ic_mou_good_phase	1	True
29	std_ic_mou_action_phase	1	True

	Features	VIF
13	total_ic_mou_good_phase	442.120
9	loc_ic_mou_good_phase	327.360
6	total_ic_mou_action_phase	97.370
12	std_ic_mou_good_phase	70.810
1	loc_ic_mou_action_phase	68.930

	Features	VIF
6	total_ic_mou_action_phase	81.080
1	loc_ic_mou_action_phase	58.200
13	total_og_mou_good_phase	26.800
10	offnet_mou_good_phase	14.800
11	onnet_mou_good_phase	14.070

	Features	VIF
12	total_og_mou_good_phase	26.790
9	offnet_mou_good_phase	14.770
10	onnet_mou_good_phase	14.050

	columns	imp_score
0	total_og_mou_action_phase	-1.225
1	const	-1.202
2	loc_ic_mou_action_phase	-1.126
3	loc_og_mou_action_phase	-0.910
4	onnet_mou_good_phase	0.608
5	spl_ic_mou_action_phase	-0.602
6	offnet_mou_good_phase	0.566
7	monthly_3g_action_phase	-0.501
8	std_ic_mou_action_phase	-0.420
9	isd_ic_mou_action_phase	-0.395
10	loc_ic_mou_good_phase	0.338
11	isd_ic_mou_good_phase	0.245
12	std_ic_mou_good_phase	0.142

	Conv_Prob	ID	predicted	0.000	0.100	0.200	0.300	0.400	0.500	0.600	0.700	0.800	0.900	final_predicted
0	0.146	0	0	1	1	0	0	0	0	0	0	0	0	0
1	0.060	1	0	1	0	0	0	0	0	0	0	0	0	0
2	0.542	2	1	1	1	1	1	1	1	0	0	0	0	1
3	0.986	3	1	1	1	1	1	1	1	1	1	1	1	1
4	0.003	4	0	1	0	0	0	0	0	0	0	0	0	0

	model_name	class_imb	train_accuracy	test_accuracy	roc_score	precision	recall	f1_score	classification_report	confusion_matrix	feature_imp
10	GradientBoostingClassifier	oversampling	0.979	0.928	0.945	0.567	0.788	0.660	precision recall f1-score ...	[[7732, 478], [168, 626]]	columns imp_score ...
14	XGBClassifier	oversampling	0.996	0.938	0.942	0.636	0.688	0.661	precision recall f1-score ...	[[7898, 312], [248, 546]]	columns imp_score ...
11	GradientBoostingClassifier	smote	0.979	0.936	0.942	0.623	0.685	0.653	precision recall f1-score ...	[[7881, 329], [250, 544]]	columns imp_score ...
12	GradientBoostingClassifier	adasyn	0.979	0.934	0.940	0.614	0.689	0.649	precision recall f1-score ...	[[7866, 344], [247, 547]]	columns imp_score ...
15	XGBClassifier	smote	0.997	0.937	0.939	0.651	0.605	0.627	precision recall f1-score ...	[[7953, 257], [314, 480]]	columns imp_score ...
16	XGBClassifier	adasyn	0.997	0.936	0.936	0.650	0.602	0.625	precision recall f1-score ...	[[7953, 257], [316, 478]]	columns imp_score ...
6	RandomForestClassifier	oversampling	0.882	0.907	0.934	0.482	0.792	0.600	precision recall f1-score ...	[[7535, 675], [165, 629]]	columns imp_score 3...
8	RandomForestClassifier	adasyn	0.894	0.888	0.932	0.429	0.817	0.563	precision recall f1-score ...	[[7347, 863], [145, 649]]	columns imp_score ...
7	RandomForestClassifier	smote	0.905	0.903	0.931	0.471	0.802	0.594	precision recall f1-score ...	[[7495, 715], [157, 637]]	columns imp_score ...
3	DecisionTreeClassifier	smote	0.908	0.895	0.911	0.443	0.761	0.560	precision recall f1-score ...	[[7452, 758], [190, 604]]	columns imp_score ...
13	GradientBoostingClassifierPCA	oversampling	0.965	0.903	0.906	0.467	0.686	0.556	precision recall f1-score ...	[[7589, 621], [249, 545]]	principal_component imp_score 8 ...
2	DecisionTreeClassifier	oversampling	0.880	0.854	0.902	0.359	0.831	0.501	precision recall f1-score ...	[[7029, 1181], [134, 660]]	columns imp_score ...
17	XGBClassifierPCA	oversampling	0.997	0.920	0.898	0.547	0.553	0.550	precision recall f1-score ...	[[7847, 363], [355, 439]]	principal_component imp_score 8 ...
4	DecisionTreeClassifier	adasyn	0.890	0.871	0.898	0.388	0.798	0.522	precision recall f1-score ...	[[7208, 1002], [160, 634]]	columns imp_score ...
9	RandomForestClassifierPCA	oversampling	0.855	0.874	0.893	0.391	0.768	0.518	precision recall f1-score ...	[[7260, 950], [184, 610]]	principal_component imp_score 8 ...
1	LogisticRegressionPCA	oversampling	0.829	0.827	0.892	0.318	0.838	0.461	precision recall f1-score ...	[[6782, 1428], [129, 665]]	principal_component imp_score 0 ...
0	LogisticRegression	oversampling	0.808	0.819	0.872	0.305	0.821	0.444	precision recall f1-score ...	[[6722, 1488], [142, 652]]	columns imp_score 0 t...
5	DecisionTreeClassifierPCA	oversampling	0.832	0.820	0.846	0.300	0.781	0.433	precision recall f1-score ...	[[6760, 1450], [174, 620]]	principal_component imp_score 8 ...

Data Science Portfolio

Telecom-churn-Case-Study

Problem Statement

Business problem overview

Understanding and defining churn

Definitions of churn

Revenue-based churn

Usage-based churn

High-value churn

Understanding the business objective and the data

Understanding customer behaviour during churn

Data dictionary

Data Dictionary - Telecom Churn : Download

Data Preparation

Modelling

Note: The solution may take 10-20mins to complete depending upon the system configurations¶

Process Flow Diagram¶

Initialization¶

1. Import all necessary libraries. LogisticRegression, Decision Tree classifier, Random Forest, Imblearn, RFE, PCA etc

2. Initialize the telecom_churn dataset

Custom Functions¶

Data Preprocessing¶

3. Check the shape and size of the dataset.

4. Check for any duplicate entries in the data set. Also check if there is any duplicates in mobile number column.

5. Check for column null percentage.

6. Check for any constant features or feature that has only one label.

7. Plot the bar chart for columns with less than 10% of NULL values.

8. Convert Date column to day in numbers format.

9. Check for any outliers in the data set.

10. Impute Missing Values: Since there are lot of missing values in the dataset, we therefore decide to impute it. Also most of the columns are skewed, hence we use "median" as a strategy to impute it.

11. Filter the high value customers who have recharged more than 70% of the average recharge value during the good phase.

12. Tag the churned customers (1 or 0) by applying conditions on the following fourth month columns: total_ic_mou_9, total_og_mou_9, vol_2g_mb_9, vol_3g_mb_9

13. Rename columns_8 as action phase based on business requirements.

14. Find all columns related to churn phase or cols with _9 in name. and drop it

15. Filter features that are identified by months _6 & _7, in order to combine those features as "good phase" and also check the datatypes

16. Now that we have identified the features for "Good phase", using an aggr method we can go ahead combine those features. once features are combined as "Good Phase", we drop all those redundant features of months _6&_7.

Exploratory Data Analysis¶

Univariate analysis¶

Bivariate Analysis¶

Multivariate analysis¶

Top 5 Highly correlated features are: ¶

Train and Test Split¶

Scaling¶

Class Imbalance¶

Helper Function for Class Imbalance¶

Class Imbalance using Oversampling¶

Class Imbalance using SMOTE¶

Class Imbalance using ADASYN¶

----------------------------------¶

Principal Component Analysis¶

Apply PCA (Dimensionality Reduction) on dataset generated by multiple class imbalance techniques¶

----------------------------------¶

Machine Learning - Algorithms and Models¶

Custom Functions for Model Building¶

Logistic Regression - RFE¶

Model Building¶

Logreg Model 1¶

Logreg Model 2¶

Logreg Model 3¶

Logreg Model 4¶

Logreg Model 5¶

Accuracy, Sensitivity, Specificity, Precision, Recall¶

ROC Curve Precision Recall curve¶

Model Testing¶

Logistic Regression - PCA¶

DecisionTreeClassifier¶

Custom Function¶

DecisionTree - Oversampling¶

Decision Tree - SMOTE¶

DT - ADASYN¶

Decision Tree - PCA¶

RandomForestClassifier¶

Custom Function¶

Random Forest - Oversampling¶

Random Forest - SMOTE¶

Random Forest - ADASYN¶

Random Forest - PCA¶

GradientBoostingClassifier¶

Custom Function¶

Gradient Boosting - Oversampling¶

10. Impute Missing Values:
Since there are lot of missing values in the dataset, we therefore decide to impute it.
Also most of the columns are skewed, hence we use "median" as a strategy to impute it.

16. Now that we have identified the features for "Good phase", using an aggr method we can go ahead combine those features.
once features are combined as "Good Phase", we drop all those redundant features of months _6&_7.

Top 5 Highly correlated features are:
¶

Apply PCA (Dimensionality Reduction) on dataset generated by multiple class imbalance techniques
¶