Bike Sharing Case Study
13 Sep 2024

A bike-sharing system is a service in which bikes are made available for shared use to individuals on a short-term basis, either for a price or free of charge. Many bike-share systems allow people to borrow a bike from a computer-controlled "dock": the user enters payment information and the system unlocks a bike, which can then be returned to any other dock belonging to the same system.
A US bike-sharing provider, BoomBikes, has recently suffered considerable dips in revenue due to the ongoing COVID-19 pandemic and is finding it difficult to sustain itself in the current market. The company has therefore decided to come up with a mindful business plan to accelerate its revenue as soon as the ongoing lockdown ends and the economy is restored to a healthy state.
To that end, BoomBikes aspires to understand the demand for shared bikes once the ongoing nationwide quarantine ends. This will prepare the company to cater to people's needs when the situation improves, to stand out from other service providers, and to grow its profits.
Business Goal
You are required to model the demand for shared bikes using the available independent variables. Management will use the model to understand how exactly demand varies with different features, so they can adjust the business strategy to meet demand levels and customer expectations. Further, the model will be a good way for management to understand the demand dynamics of a new market.
Multiple Linear Regression¶
Bike Sharing - Demand Prediction Case Study¶
1. Importing and Understanding the Data¶
# Import all required libraries (NumPy, Pandas, Seaborn, Matplotlib) for performing multiple linear regression
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_theme(color_codes=True)
import matplotlib.pyplot as plt
%matplotlib inline
# Import the statistical and machine-learning modules (statsmodels, scikit-learn)
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import warnings # Suppress Warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
# Set custom configurations For pandas.
pd.set_option("display.max_rows", 900)
pd.set_option("display.max_columns", 900)
pd.set_option('display.float_format', lambda x: '%.6f' % x)
Custom Functions¶
# Classify numerical columns as either categorical/discrete or non-categorical and return them as a dict
# (fewer than 20 unique values is treated as categorical)
def classify_feature_dtype(df, cols):
    d_categories = {'int_cat': [], 'float_ts': []}
    for col in cols:
        if len(df[col].unique()) < 20:
            d_categories['int_cat'].append(col)
        else:
            d_categories['float_ts'].append(col)
    return d_categories
# Print all statistical information for a given set of columns
def show_stats(df, cols):
    for col in list(cols):
        print("Total Nulls: {0},\nMode: {1}".format(df[col].isna().sum(), df[col].mode()[0]))
        if len(df[col].unique()) < 50:
            print("\nUnique: {0}\n".format(df[col].unique()))
        if (df[col].dtype == int) or (df[col].dtype == float):
            print("Median : {0}, \nVariance: {1}, \n\nDescribe: {2} \n".format(df[col].median(), df[col].var(), df[col].describe()))
        print("ValueCounts: {0} \n\n\n".format((df[col].value_counts(normalize=True) * 100).head(5)))
        print("------------------------------------------------------------------")
# Return the percentage of null values in each column of a dataframe
def check_cols_null_pct(df):
    df_non_na = df.count() / len(df)     # ratio of non-null values
    df_na_pct = (1 - df_non_na) * 100    # percentage of null values
    return df_na_pct.sort_values(ascending=False)  # sorted in descending order
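A quick usage sketch on a hypothetical toy frame (not part of the dataset) to show what the function returns:
# Column 'b' has 1 null out of 4 rows, so it reports 25.0; 'a' reports 0.0
toy_df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [1.0, None, 3.0, 4.0]})
check_cols_null_pct(toy_df)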
# Generate charts based on the data type of the columns, as part of the univariate analysis.
# Takes a dataframe, the columns, an optional target column, the feature type, and an optional label dict as args.
def univariate_plots(df, cols, target=None, ftype=None, l_dict=None):
    for col in cols:
        # Category type: generates a pie chart, a countplot, and a barplot against the target
        if ftype == "category":
            fig, axs = plt.subplots(1, 3, figsize=(20, 6))
            col_idx = 0
            axs[col_idx].pie(x=df[col].value_counts().head(15), labels=df[col].value_counts().head(15).index, autopct="%1.1f%%",
                             radius=1, textprops={"fontsize": 10, "color": "Black"}, startangle=90, rotatelabels=False)
            axs[col_idx].set_title("PieChart of {0}".format(col), y=1); plt.xticks(rotation=45); plt.ylabel("Percentage")
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            col_idx += 1
            sns.countplot(data=df, y=col, order=df[col].value_counts().index, palette="viridis", ax=axs[col_idx])
            if (l_dict is not None) and (l_dict.get(col) is not None):
                axs[col_idx].legend([f'{k} - {v}' for k, v in l_dict[col].items()])
            axs[col_idx].set_title("Countplot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col); plt.ylabel("Count")
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            col_idx += 1
            ax = sns.barplot(data=df, x=col, y=target, palette="viridis", ax=axs[col_idx], errwidth=0)
            for i in ax.containers:
                ax.bar_label(i,)
            axs[col_idx].set_title('Barplot against target'); plt.xticks(rotation=45); plt.xlabel(col)
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            plt.suptitle("Univariate analysis of {0}".format(col), fontsize=12, y=0.95)
            plt.tight_layout()
            plt.subplots_adjust(top=0.85)
            plt.show()
            plt.clf()
        # Numerical types: generates a boxplot, histogram, KDE plot, and scatterplot
        elif ftype == "non_categorical":
            fig, axs = plt.subplots(1, 4, figsize=(20, 6))
            col_idx = 0
            sns.boxplot(data=df, y=col, palette="viridis", flierprops=dict(marker="o", markersize=6, markerfacecolor="red", markeredgecolor="black"),
                        medianprops=dict(linestyle="-", linewidth=3, color="#FF9900"), whiskerprops=dict(linestyle="-", linewidth=2, color="black"),
                        capprops=dict(linestyle="-", linewidth=2, color="black"), ax=axs[col_idx])
            axs[col_idx].set_title("Boxplot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            col_idx += 1
            axs[col_idx].hist(data=df, x=col, label=col)
            axs[col_idx].set_title("Histogram of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            col_idx += 1
            sns.kdeplot(df[col], shade=True, ax=axs[col_idx])
            axs[col_idx].set_title("KDE plot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            col_idx += 1
            sns.scatterplot(df[col], ax=axs[col_idx])
            axs[col_idx].set_title("Scatterplot of {0}".format(col)); plt.xticks(rotation=45); plt.xlabel(col)
            fig.subplots_adjust(wspace=0.5, hspace=0.3)
            plt.suptitle("Univariate analysis of {0}".format(col), fontsize=12, y=0.95)
            plt.tight_layout()
            plt.subplots_adjust(top=0.85)
            plt.show()
            plt.clf()
# Perform outlier analysis on the given dataframe:
# find the lower threshold, upper threshold, and IQR-based fences and return the result as a dataframe.
# find_outlier=True restricts the output to the rows that are outliers;
# find_outlier=False returns the thresholds for all columns.
def get_extremeval_threshld(df, find_outlier=False):
    outlier_df = pd.DataFrame(columns=[i for i in df.columns if find_outlier == True], data=None)
    for col in df.columns:
        thirdq, firstq = df[col].quantile(0.75), df[col].quantile(0.25)
        iqr = 1.5 * (thirdq - firstq)
        extvalhigh, extvallow = iqr + thirdq, firstq - iqr
        if find_outlier == True:
            dfout = df.loc[(df[col] > extvalhigh) | (df[col] < extvallow)]
            dfout = dfout.assign(name=col, thresh_low=extvallow, thresh_high=extvalhigh)
        else:
            dfout = pd.DataFrame([[col, extvallow, extvalhigh]], columns=['name', 'thresh_low', 'thresh_high'])
        outlier_df = pd.concat([outlier_df, dfout])
    outlier_df = outlier_df.set_index('name', drop=True)
    return outlier_df
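To make the IQR fences concrete, here is a minimal worked sketch on a hypothetical series (values chosen purely for illustration):
# Q1 = 2.25, Q3 = 4.75 -> IQR = 2.5; fences are Q1 - 1.5*IQR = -1.5 and Q3 + 1.5*IQR = 8.5
s = pd.Series([1, 2, 3, 4, 5, 100])
q3, q1 = s.quantile(0.75), s.quantile(0.25)
iqr = q3 - q1
print(q1 - 1.5 * iqr, q3 + 1.5 * iqr)  # 100 lies above the upper fence, so it is flagged as an outlier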
# import and read the dataset
bike_share_df = pd.read_csv('day.csv', sep=",", header=0)
bike_share_df.head()
| | instant | dteday | season | yr | mnth | holiday | weekday | workingday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 01-01-2018 | 1 | 0 | 1 | 0 | 1 | 1 | 2 | 14.110847 | 18.181250 | 80.583300 | 10.749882 | 331 | 654 | 985 |
1 | 2 | 02-01-2018 | 1 | 0 | 1 | 0 | 2 | 1 | 2 | 14.902598 | 17.686950 | 69.608700 | 16.652113 | 131 | 670 | 801 |
2 | 3 | 03-01-2018 | 1 | 0 | 1 | 0 | 3 | 1 | 1 | 8.050924 | 9.470250 | 43.727300 | 16.636703 | 120 | 1229 | 1349 |
3 | 4 | 04-01-2018 | 1 | 0 | 1 | 0 | 4 | 1 | 1 | 8.200000 | 10.606100 | 59.043500 | 10.739832 | 108 | 1454 | 1562 |
4 | 5 | 05-01-2018 | 1 | 0 | 1 | 0 | 5 | 1 | 1 | 9.305237 | 11.463500 | 43.695700 | 12.522300 | 82 | 1518 | 1600 |
2. EDA - Data Cleaning and Data Visualization¶
# Rename inconsistent columns and drop unnecessary ones
bike_share_df = bike_share_df.drop(columns=['instant '], errors='ignore')
bike_share_df = bike_share_df.rename(str.strip, axis='columns')
bike_share_df = bike_share_df.rename({'dteday':'date', 'yr':'year', 'mnth':'month', 'atemp':'actual_temp', 'hum':'humidity', 'casual':'casual_users', 'registered':'registered_users', 'cnt':'total_cnt'}, axis='columns')
bike_share_df.columns # get the list of columns
bike_share_df.info() # shows column information
Index(['date', 'season', 'year', 'month', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'actual_temp', 'humidity', 'windspeed', 'casual_users', 'registered_users', 'total_cnt'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   date              730 non-null    object
 1   season            730 non-null    int64
 2   year              730 non-null    int64
 3   month             730 non-null    int64
 4   holiday           730 non-null    int64
 5   weekday           730 non-null    int64
 6   workingday        730 non-null    int64
 7   weathersit        730 non-null    int64
 8   temp              730 non-null    float64
 9   actual_temp       730 non-null    float64
 10  humidity          730 non-null    float64
 11  windspeed         730 non-null    float64
 12  casual_users      730 non-null    int64
 13  registered_users  730 non-null    int64
 14  total_cnt         730 non-null    int64
dtypes: float64(4), int64(10), object(1)
memory usage: 85.7+ KB
# Return the percentage of null values in each column of the dataframe
check_cols_null_pct(bike_share_df)
# Print all statistical information for a given set of columns
show_stats(bike_share_df, bike_share_df.columns)
date                0.000000
season              0.000000
year                0.000000
month               0.000000
holiday             0.000000
weekday             0.000000
workingday          0.000000
weathersit          0.000000
temp                0.000000
actual_temp         0.000000
humidity            0.000000
windspeed           0.000000
casual_users        0.000000
registered_users    0.000000
total_cnt           0.000000
dtype: float64
Total Nulls: 0, Mode: 01-01-2018 ValueCounts: date 01-01-2018 0.136986 25-04-2019 0.136986 27-04-2019 0.136986 28-04-2019 0.136986 29-04-2019 0.136986 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 3 Unique: [1 2 3 4] Median : 3.0, Variance: 1.232508408967052, Describe: count 730.000000 mean 2.498630 std 1.110184 min 1.000000 25% 2.000000 50% 3.000000 75% 3.000000 max 4.000000 Name: season, dtype: float64 ValueCounts: season 3 25.753425 2 25.205479 1 24.657534 4 24.383562 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 0 Unique: [0 1] Median : 0.5, Variance: 0.2503429355281207, Describe: count 730.000000 mean 0.500000 std 0.500343 min 0.000000 25% 0.000000 50% 0.500000 75% 1.000000 max 1.000000 Name: year, dtype: float64 ValueCounts: year 0 50.000000 1 50.000000 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 1 Unique: [ 1 2 3 4 5 6 7 8 9 10 11 12] Median : 7.0, Variance: 11.903985568521309, Describe: count 730.000000 mean 6.526027 std 3.450215 min 1.000000 25% 4.000000 50% 7.000000 75% 10.000000 max 12.000000 Name: month, dtype: float64 ValueCounts: month 1 8.493151 3 8.493151 5 8.493151 7 8.493151 8 8.493151 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 0 Unique: [0 1] Median : 0.0, Variance: 0.027977901798297095, Describe: count 730.000000 mean 0.028767 std 0.167266 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 1.000000 Name: holiday, dtype: float64 ValueCounts: holiday 0 97.123288 1 2.876712 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 1 Unique: [1 2 3 4 5 6 0] Median : 3.0, Variance: 4.001354830223422, Describe: count 730.000000 mean 2.995890 std 2.000339 min 0.000000 25% 1.000000 50% 3.000000 75% 5.000000 max 6.000000 Name: weekday, dtype: float64 ValueCounts: weekday 1 14.383562 2 14.383562 3 14.246575 4 14.246575 5 14.246575 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 1 Unique: [1 0] Median : 1.0, Variance: 0.21403686791814577, Describe: count 730.000000 mean 0.690411 std 0.462641 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 1.000000 Name: workingday, dtype: float64 ValueCounts: workingday 1 69.041096 0 30.958904 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 1 Unique: [2 1 3] Median : 1.0, Variance: 0.29681492756073846, Describe: count 730.000000 mean 1.394521 std 0.544807 min 1.000000 25% 1.000000 50% 1.000000 75% 2.000000 max 3.000000 Name: weathersit, dtype: float64 ValueCounts: weathersit 1 63.424658 2 33.698630 3 2.876712 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 10.899153 Median : 20.4658265, Variance: 56.35097933285952, Describe: count 730.000000 mean 20.319259 std 7.506729 min 2.424346 25% 13.811885 50% 20.465826 75% 26.880615 max 35.328347 Name: temp, dtype: float64 ValueCounts: temp 26.035000 0.684932 10.899153 0.684932 27.880000 0.547945 28.563347 0.547945 23.130847 0.547945 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 32.7344 Median : 24.368225, Variance: 66.42751661885134, 
Describe: count 730.000000 mean 23.726322 std 8.150308 min 3.953480 25% 16.889713 50% 24.368225 75% 30.445775 max 42.044800 Name: actual_temp, dtype: float64 ValueCounts: actual_temp 32.734400 0.547945 18.781050 0.410959 31.850400 0.410959 28.598750 0.273973 23.326250 0.273973 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 61.3333 Median : 62.625, Variance: 202.70894176890425, Describe: count 730.000000 mean 62.765175 std 14.237589 min 0.000000 25% 52.000000 50% 62.625000 75% 72.989575 max 97.250000 Name: humidity, dtype: float64 ValueCounts: humidity 61.333300 0.547945 63.083300 0.410959 55.208300 0.410959 60.500000 0.410959 56.833300 0.410959 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 7.12545 Median : 12.125325, Variance: 26.996760622179345, Describe: count 730.000000 mean 12.763620 std 5.195841 min 1.500244 25% 9.041650 50% 12.125325 75% 15.625589 max 34.000021 Name: windspeed, dtype: float64 ValueCounts: windspeed 9.041918 0.410959 11.166689 0.410959 11.250104 0.410959 15.333486 0.410959 7.959064 0.410959 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 120 Median : 717.0, Variance: 471254.6181408198, Describe: count 730.000000 mean 849.249315 std 686.479875 min 2.000000 25% 316.250000 50% 717.000000 75% 1096.500000 max 3410.000000 Name: casual_users, dtype: float64 ValueCounts: casual_users 120 0.547945 968 0.547945 639 0.410959 163 0.410959 775 0.410959 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 1707 Median : 3664.5, Variance: 2432847.2895522094, Describe: count 730.000000 mean 3658.757534 std 1559.758728 min 20.000000 25% 2502.250000 50% 3664.500000 75% 4783.250000 max 6946.000000 Name: registered_users, dtype: float64 ValueCounts: registered_users 4841 0.410959 6248 0.410959 1707 0.410959 3461 0.273973 2713 0.273973 Name: proportion, dtype: float64 ------------------------------------------------------------------ Total Nulls: 0, Mode: 1096 Median : 4548.5, Variance: 3748141.098718458, Describe: count 730.000000 mean 4508.006849 std 1936.011647 min 22.000000 25% 3169.750000 50% 4548.500000 75% 5966.000000 max 8714.000000 Name: total_cnt, dtype: float64 ValueCounts: total_cnt 5409 0.273973 2424 0.273973 5698 0.273973 4459 0.273973 5119 0.273973 Name: proportion, dtype: float64 ------------------------------------------------------------------
Replace column values¶
# Replace columns with nominal values into strings
bike_share_df['season'] = bike_share_df['season'].replace(to_replace = [1, 2, 3, 4], value = ['Spring', 'Summer', 'Fall', 'Winter'])
bike_share_df['year'] = bike_share_df['year'].replace(to_replace = [0, 1], value = [2018, 2019])
bike_share_df['month'] = bike_share_df['month'].replace(to_replace = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], value = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
bike_share_df['holiday'] = bike_share_df['holiday'].replace(to_replace = [0, 1], value = ['NotHoliday', 'Holiday'])
bike_share_df['weekday'] = bike_share_df['weekday'].replace(to_replace = [0, 1, 2, 3, 4, 5, 6], value = ['Sun', 'Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat'])
bike_share_df['workingday'] = bike_share_df['workingday'].replace(to_replace = [0, 1], value = ['NotWorkingday', 'Workingday'])
bike_share_df['weathersit'] = bike_share_df['weathersit'].replace(to_replace = [1, 2, 3, 4], value = ['Cloud', 'Mist', 'LightRain', 'HeavyRain'])
# Convert dtype of columns to category for categorical columns
bike_share_df[['season', 'year', 'month', 'holiday', 'weekday', 'workingday', 'weathersit']] = bike_share_df[['season', 'year', 'month', 'holiday', 'weekday', 'workingday', 'weathersit']].astype('category')
# Convert date to datetime format
bike_share_df['date'] = pd.to_datetime(bike_share_df['date'], dayfirst=True, format='mixed')
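# Sanity-check sketch of the day-first parsing with illustrative strings (format='mixed' assumes pandas >= 2.0):
# '02-01-2018' is read as 2 Jan 2018, not 1 Feb 2018, because dayfirst=True
pd.to_datetime(pd.Series(['01-01-2018', '02-01-2018']), dayfirst=True, format='mixed')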
# Distinguish numerical columns either as categorical/discrete or non-categorical/ continuous and return as dict
dtype_dict = classify_feature_dtype(bike_share_df, bike_share_df.columns)
Outlier Analysis¶
mask = bike_share_df.columns.isin(dtype_dict['int_cat']) # retrieve columns of categorical type for masking
# function to retrieve lower threshold and higher threshold for all continuous type columns
col_threshld = get_extremeval_threshld(bike_share_df.loc[:,~mask], find_outlier = False)
col_threshld
# function to retrieve lower threshold and higher threshold for Outlier columns and find the count
extreme_val_df = get_extremeval_threshld(bike_share_df.loc[:,~mask], find_outlier = True)
outlier_cols = extreme_val_df.groupby(by='name')[['thresh_low','thresh_high']].value_counts()
outlier_cols
bike_share_df[['humidity', 'windspeed', 'casual_users']].describe([.05,.10,.75,.95,.99])
| name | thresh_low | thresh_high |
|---|---|---|
date | 2017-01-01 12:00:00 | 2020-12-29 12:00:00 |
temp | -5.791209 | 46.483709 |
actual_temp | -3.444381 | 50.779869 |
humidity | 20.515637 | 104.473938 |
windspeed | -0.834259 | 25.501498 |
casual_users | -854.125000 | 2266.875000 |
registered_users | -919.250000 | 8204.750000 |
total_cnt | -1024.625000 | 10160.375000 |
name          thresh_low   thresh_high
casual_users  -854.125000  2266.875000    44
humidity        20.515637   104.473938     2
windspeed       -0.834259    25.501498    13
Name: count, dtype: int64
| | humidity | windspeed | casual_users |
|---|---|---|---|
count | 730.000000 | 730.000000 | 730.000000 |
mean | 62.765175 | 12.763620 | 849.249315 |
std | 14.237589 | 5.195841 | 686.479875 |
min | 0.000000 | 1.500244 | 2.000000 |
5% | 40.741735 | 5.326052 | 88.450000 |
10% | 45.000000 | 6.704754 | 139.900000 |
50% | 62.625000 | 12.125325 | 717.000000 |
75% | 72.989575 | 15.625589 | 1096.500000 |
95% | 86.868735 | 22.999988 | 2355.000000 |
99% | 92.795857 | 27.380948 | 2931.680000 |
max | 97.250000 | 34.000021 | 3410.000000 |
Univariate and Bivariate Analysis¶
label_dict = {'year':{0: 2018, 1:2019},'holiday':{0:'no',1:'yes'},'workingday':{0:'no',1:'yes'}}
# Plot univariate graphs for both categorical and continuous variables, and perform bivariate analysis against the total_cnt column
print("Univariate Analysis of categorical Variables")
univariate_plots(bike_share_df, dtype_dict['int_cat'], target='total_cnt', ftype = "category", l_dict=label_dict)
print("--------------------------------------------------------------------------")
print("Univariate Analysis of Continuous Variables")
univariate_plots(bike_share_df, dtype_dict['float_ts'], target='total_cnt', ftype = "non_categorical")
print("--------------------------------------------------------------------------")
Univariate Analysis of categorical Variables
[Plots rendered for each categorical variable: pie chart, countplot, and barplot against total_cnt.]
--------------------------------------------------------------------------
Univariate Analysis of Continuous Variables
[Plots rendered for each continuous variable: boxplot, histogram, KDE plot, and scatterplot.]
--------------------------------------------------------------------------
Observations:¶
Categorical Features¶
#### Season
- total_cnt is at its lowest in Spring and at its highest in Fall.
- Seasonal changes have a clear impact on users' decisions.
#### Year
- There is significant growth in total_cnt in 2019 compared to 2018.
- This is a positive sign of a growing demand trajectory.
#### Weekday
- total_cnt stays within a similar range across the days of the week.
- It is highest on Thursday and lowest on Monday.
- The end of the week sees a slight surge in demand.
#### Month
- June saw the highest user count, whereas January saw the lowest.
- Overall, usage drops in the months at the start and end of the year.
#### Holiday
- Non-holiday days far outnumber holidays.
#### Weathersit
- Most days in the year are cloudy.
- total_cnt is significantly higher on cloudy days and significantly lower on light-rain days.
- Users prefer to ride on cloudy days.
#### Numerical Features
- There are outliers in the windspeed, humidity, and casual_users features.
- windspeed and casual_users are positively skewed, while humidity is negatively skewed.
- registered_users and total_cnt are approximately normally distributed.
Fix Outliers¶
# Fix outliers by capping values beyond thresh_low / thresh_high at the respective threshold
lower_cutoff = col_threshld.loc['humidity','thresh_low']
bike_share_df['humidity'] = np.where((bike_share_df['humidity'] < lower_cutoff), lower_cutoff, bike_share_df['humidity'])
upper_cutoff = col_threshld.loc['windspeed','thresh_high']
bike_share_df['windspeed'] = np.where((bike_share_df['windspeed'] > upper_cutoff), upper_cutoff, bike_share_df['windspeed'])
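Equivalently, the same capping can be written with pandas' clip; a minimal sketch of the identical operation using the thresholds computed above (running it after the np.where calls is a harmless no-op):
bike_share_df['humidity'] = bike_share_df['humidity'].clip(lower=lower_cutoff)    # cap low-end humidity outliers
bike_share_df['windspeed'] = bike_share_df['windspeed'].clip(upper=upper_cutoff)  # cap high-end windspeed outliers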
Bivariate Analysis¶
User count grew significantly in 2019 compared to the first year, 2018, though there is a downward trend at the end of 2019.
# line graph of date vs total_cnt
plt.figure(figsize=(22,4));
sns.lineplot(x='date', y='total_cnt', data=bike_share_df);  # palette removed: it has no effect without hue
plt.show();
# Pairplot analysis of all the continuous feature columns in the dataframe.
sns.pairplot(bike_share_df[['temp','actual_temp','humidity','windspeed','casual_users','registered_users', 'total_cnt']])
plt.show();
There are high correlations between:
registered_users vs total_cnt, casual_users vs total_cnt, temp vs total_cnt, and actual_temp vs total_cnt.
# Regression line analysis of all the continuous feature type columns.
axs = 331
plt.figure(figsize=(22,15))
for i in list(set(dtype_dict['float_ts']) - set(['date','total_cnt'])):
plt.subplot(axs)
sns.regplot(x=i, y='total_cnt' ,data=bike_share_df,color= 'green', line_kws={"color": "red"})
axs += 1
plt.show();
(temp, actual_temp, registered_users, casual_users) vs total_cnt are positively correlated, whereas windspeed and humidity do not show a high correlation, though their regression lines show a negative trend.
# Generate Bivariate Boxplots combinations for all the categorical and continuous columns
x_lst = list(dtype_dict['int_cat']) # x_list variable contains all the categorical Columns
y_lst = list(dtype_dict['float_ts'])# y_list contains all Continuous feature type columns
axs = 1
for x_col in x_lst:
plt.figure(figsize=(22,72))
for y_col in y_lst:
plt.subplot(18,4,axs)
sns.boxplot(x=x_col, y=y_col, data=bike_share_df, palette='tab10')
axs += 1
plt.show();
plt.show();
Multivariate Analysis¶
Note: the cell below is deliberately commented out to avoid consuming CPU resources; it can run for a long time (its output was already generated).
# # Multivariate Boxplots -
# z_lst = x_lst = list(dtype_dict['int_cat'])
# y_lst = list(dtype_dict['float_ts'])
# z_lst = x_lst.copy()
# z_lst.remove('month')
# z_lst.remove('weekday')
# axs = 1
# for x_col in x_lst:
# plt.figure(figsize=(25,650))
# for y_col in y_lst:
# for z_col in z_lst:
# plt.subplot(200,4,axs)
# sns.boxplot(x=x_col, y=y_col, hue=z_col, data=bike_share_df, palette='tab10')
# plt.legend()
# axs += 1
# plt.show();
# # axs = 1
# print("--------------------------------------------------------------------------")
# plt.show();
# Find the top 10 correlations amongst the columns using df.corr()
corr_0 = bike_share_df.corr(numeric_only=True).abs()
correlation_0 = corr_0.unstack().dropna()            # flatten the matrix into (feature, feature) pairs
correlation_0 = correlation_0[correlation_0 != 1.0]  # drop self-correlations
correlation_target_zero = correlation_0.reset_index()
correlation_target_zero.sort_values(by=0, ascending=False).head(10)
| | level_0 | level_1 | 0 |
|---|---|---|---|
0 | temp | actual_temp | 0.991696 |
6 | actual_temp | temp | 0.991696 |
35 | registered_users | total_cnt | 0.945411 |
41 | total_cnt | registered_users | 0.945411 |
40 | total_cnt | casual_users | 0.672123 |
29 | casual_users | total_cnt | 0.672123 |
11 | actual_temp | total_cnt | 0.630685 |
37 | total_cnt | actual_temp | 0.630685 |
36 | total_cnt | temp | 0.627044 |
5 | temp | total_cnt | 0.627044 |
# Display the correlation values using the sns heatmap
plt.figure(figsize=(22,8))
sns.heatmap(bike_share_df.select_dtypes(exclude=[object,'category']).corr(), annot=True)
plt.show();
Observation:
- total_cnt is positively correlated with casual_users and registered_users. Since total_cnt is simply the sum of these two, they leak the target, so both can be removed before model building.
- total_cnt is negatively correlated with windspeed, which suggests that demand for shared bikes is somewhat lower on windy days than on normal days.
- temp and actual_temp are highly (positively) correlated, so one of the two can be removed before model building.
3. MLR - Data Preparation (One Hot Encoding)¶
# Validate the dataframe Content and the column datatypes
bike_share_df.head(2)
bike_share_df.dtypes
| | date | season | year | month | holiday | weekday | workingday | weathersit | temp | actual_temp | humidity | windspeed | casual_users | registered_users | total_cnt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01 | Spring | 2018 | Jan | NotHoliday | Mon | Workingday | Mist | 14.110847 | 18.181250 | 80.583300 | 10.749882 | 331 | 654 | 985 |
1 | 2018-01-02 | Spring | 2018 | Jan | NotHoliday | Tue | Workingday | Mist | 14.902598 | 17.686950 | 69.608700 | 16.652113 | 131 | 670 | 801 |
date                datetime64[ns]
season              category
year                category
month               category
holiday             category
weekday             category
workingday          category
weathersit          category
temp                float64
actual_temp         float64
humidity            float64
windspeed           float64
casual_users        int64
registered_users    int64
total_cnt           int64
dtype: object
Dummy Variables Creation¶
# Convert all the categorical columns to numerical dummy variables using get_dummies
bike_share_mod_df = bike_share_df.copy()
dummy_df = pd.get_dummies(bike_share_mod_df, columns=['season', 'year', 'month', 'holiday', 'weekday', 'workingday', 'weathersit'], dtype=int, drop_first=True)
bike_share_mod_df = dummy_df.copy()
bike_share_mod_df.head(2)
| | date | temp | actual_temp | humidity | windspeed | casual_users | registered_users | total_cnt | season_Spring | season_Summer | season_Winter | year_2019 | month_Aug | month_Dec | month_Feb | month_Jan | month_Jul | month_Jun | month_Mar | month_May | month_Nov | month_Oct | month_Sep | holiday_NotHoliday | weekday_Mon | weekday_Sat | weekday_Sun | weekday_Thur | weekday_Tue | weekday_Wed | workingday_Workingday | weathersit_LightRain | weathersit_Mist |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-01-01 | 14.110847 | 18.181250 | 80.583300 | 10.749882 | 331 | 654 | 985 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
1 | 2018-01-02 | 14.902598 | 17.686950 | 69.608700 | 16.652113 | 131 | 670 | 801 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
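To see what drop_first=True does (it drops one redundant level per feature, avoiding the dummy-variable trap), here is a toy sketch with hypothetical values:
# 'Fall' sorts first alphabetically, so it becomes the implicit baseline encoded by an all-zero row
toy = pd.DataFrame({'season': ['Spring', 'Summer', 'Fall']})
pd.get_dummies(toy, columns=['season'], dtype=int, drop_first=True)
#    season_Spring  season_Summer
# 0              1              0
# 1              0              1
# 2              0              0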
bike_share_df_bk = bike_share_mod_df.copy()
#-- bike_share_mod_df = bike_share_df_bk
# Drop all the unwanted features from the dataframe.
bike_share_mod_df = bike_share_mod_df.drop(['date', 'casual_users', 'registered_users','actual_temp'], axis=1)
bike_share_mod_df = bike_share_mod_df.astype(float)
bike_share_mod_df.head(2)
| | temp | humidity | windspeed | total_cnt | season_Spring | season_Summer | season_Winter | year_2019 | month_Aug | month_Dec | month_Feb | month_Jan | month_Jul | month_Jun | month_Mar | month_May | month_Nov | month_Oct | month_Sep | holiday_NotHoliday | weekday_Mon | weekday_Sat | weekday_Sun | weekday_Thur | weekday_Tue | weekday_Wed | workingday_Workingday | weathersit_LightRain | weathersit_Mist |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 14.110847 | 80.583300 | 10.749882 | 985.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 |
1 | 14.902598 | 69.608700 | 16.652113 | 801.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 |
4. MLR - Data Splitting for Train,Test Sets and Rescaling¶
# To perform multiple linear regression, we now split the dataset into train and test subsets at an 80:20 ratio
df_train, df_test = train_test_split(bike_share_mod_df, train_size=0.8,test_size=0.2, random_state=0)
df_train.shape, df_test.shape
((584, 29), (146, 29))
Scaling using MinMaxScaler¶
# To avoid scaling issues during model training, we use MinMaxScaler to scale the numeric variables between zero and one
scaler = MinMaxScaler()
num_vars = ['temp', 'humidity', 'windspeed','total_cnt']
# fit_transform fits the scaler on the training data and returns the transformed training data in one step
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
df_train.head(2)
| | temp | humidity | windspeed | total_cnt | season_Spring | season_Summer | season_Winter | year_2019 | month_Aug | month_Dec | month_Feb | month_Jan | month_Jul | month_Jun | month_Mar | month_May | month_Nov | month_Oct | month_Sep | holiday_NotHoliday | weekday_Mon | weekday_Sat | weekday_Sun | weekday_Thur | weekday_Tue | weekday_Wed | workingday_Workingday | weathersit_LightRain | weathersit_Mist |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
239 | 0.807351 | 0.428211 | 0.775475 | 0.496088 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
97 | 0.344785 | 0.810325 | 0.545905 | 0.166705 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
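For reference, MinMaxScaler applies the standard min-max transform, with the minimum and maximum learned from the training split only:
$x' = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$
Because the scaler is fitted on the training data alone, the same learned bounds are reused on the test set later via transform (not fit_transform), which avoids data leakage.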
Visualizing the correlations¶
# Linear correlation between temp and total_cnt
plt.figure(figsize=(6,6))
sns.scatterplot(x='temp', y='total_cnt', data=df_train)
plt.show();
# Heatmap to check the correlation amongst the variables
plt.figure(figsize=(30,15))
sns.heatmap(df_train.corr(), annot=True)
plt.show();
5. MLR - Model Building & Analysis using SKlearn on Train data¶
# To build the model, we now split the training set into X_train (predictor variables) and Y_train (target variable)
Y_train = df_train.pop('total_cnt')
X_train = df_train
X_train.head(2)
Y_train.head(2)
X_train.shape, Y_train.shape
| | temp | humidity | windspeed | season_Spring | season_Summer | season_Winter | year_2019 | month_Aug | month_Dec | month_Feb | month_Jan | month_Jul | month_Jun | month_Mar | month_May | month_Nov | month_Oct | month_Sep | holiday_NotHoliday | weekday_Mon | weekday_Sat | weekday_Sun | weekday_Thur | weekday_Tue | weekday_Wed | workingday_Workingday | weathersit_LightRain | weathersit_Mist |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
239 | 0.807351 | 0.428211 | 0.775475 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
97 | 0.344785 | 0.810325 | 0.545905 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
239    0.496088
97     0.166705
Name: total_cnt, dtype: float64
((584, 28), (584,))
# Perform Ordinary least squares Linear Regression.
lr_model = LinearRegression() # Create object lr_model
lr_model.fit(X_train, Y_train) # Fit linear model using X_train and Y_train
LinearRegression()
# Get the coefficients of all the trained columns of lr_model using coef_ and return them as a dataframe
coef_df = pd.DataFrame(lr_model.coef_, X_train.columns, columns=["Coefficient"])
coef_df.head()
print('---------------------------------------------------')
print(f'Model_Score: {lr_model.score(X_train, Y_train)}')
print(f'Model_Intercept: {lr_model.intercept_}')
| | Coefficient |
|---|---|
temp | 0.438665 |
humidity | -0.147242 |
windspeed | -0.121544 |
season_Spring | -0.093910 |
season_Summer | 0.006138 |
---------------------------------------------------
Model_Score: 0.8370930015017464
Model_Intercept: 0.30300351026871525
Predicting on Training dataset¶
Y_train_pred = lr_model.predict(X_train)
Y_comp_df = pd.DataFrame({"Actual" : Y_train, "Predicted" : Y_train_pred})
Y_comp_df.head()
| | Actual | Predicted |
|---|---|---|
239 | 0.496088 | 0.499726 |
97 | 0.166705 | 0.286831 |
503 | 0.951680 | 0.855537 |
642 | 0.935803 | 0.861560 |
498 | 0.324551 | 0.633351 |
# Compute adjusted R-squared from the feature frame, actuals, and predictions
def adj_r2_score(xfe, yt, ypred):
    SS_Res = np.sum((yt - ypred) ** 2)          # residual sum of squares
    SS_Total = np.sum((yt - np.mean(yt)) ** 2)  # total sum of squares
    r2 = 1 - SS_Res / SS_Total
    n = len(xfe)           # number of observations
    p = len(xfe.columns)   # number of predictors
    adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
    return adj_r2
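The function above implements the standard adjusted R² formula,
$\text{Adj. } R^2 = 1 - \frac{(1 - R^2)(n - 1)}{n - p - 1}$
where $n$ is the number of observations and $p$ is the number of predictors; unlike plain R², it penalizes features that add little explanatory power.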
print('---------------------------------------------------')
print(f'Model_Score: {lr_model.score(X_train, Y_train)}')
print(f'Model_Intercept: {lr_model.intercept_}')
print(f'Mean Absolute Error: {mean_absolute_error(Y_train, Y_train_pred)}')
print(f'Mean Squared Error: {mean_squared_error(Y_train, Y_train_pred)}')
print(f'Root Mean Squared Error: {np.sqrt(mean_squared_error(Y_train, Y_train_pred))}')
print(f'R2_score: {r2_score(Y_train, Y_train_pred)}')
print(f'Adjusted_R2_score: {adj_r2_score(X_train, Y_train, Y_train_pred)}')
---------------------------------------------------
Model_Score: 0.8370930015017464
Model_Intercept: 0.30300351026871525
Mean Absolute Error: 0.0641989884673339
Mean Squared Error: 0.007699439599146163
Root Mean Squared Error: 0.08774645063560214
R2_score: 0.8370930015017464
Adjusted_R2_score: 0.8288742700459786
Model Analysis using SKlearn on Test data¶
# To test the model, we now split the test set into X_test (predictor variables) and Y_test (target variable), after rescaling with the parameters learned on the training set
num_vars = ['temp', 'humidity', 'windspeed','total_cnt']
df_test[num_vars] = scaler.transform(df_test[num_vars])
X_test = df_test.copy()
Y_test = X_test.pop('total_cnt')
X_test.head(5)
Y_test.head(5)
X_test.shape, Y_test.shape
| | temp | humidity | windspeed | season_Spring | season_Summer | season_Winter | year_2019 | month_Aug | month_Dec | month_Feb | month_Jan | month_Jul | month_Jun | month_Mar | month_May | month_Nov | month_Oct | month_Sep | holiday_NotHoliday | weekday_Mon | weekday_Sat | weekday_Sun | weekday_Thur | weekday_Tue | weekday_Wed | workingday_Workingday | weathersit_LightRain | weathersit_Mist |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
196 | 0.781941 | 0.460557 | 0.490778 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
187 | 0.860857 | 0.552784 | 0.345523 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
14 | 0.217065 | 0.340487 | 0.341867 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 |
31 | 0.165779 | 0.801018 | 0.032244 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 |
390 | 0.352054 | 0.717517 | 0.091901 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
196    0.678900
187    0.525771
14     0.141049
31     0.153935
390    0.466291
Name: total_cnt, dtype: float64
((146, 28), (146,))
# Predict the y_pred values using lr_model.predict()
# Compare the actual values vs predicted values.
Y_pred = lr_model.predict(X_test)
Y_comp_df = pd.DataFrame({"Actual" : Y_test, "Predicted" : Y_pred})
Y_comp_df.head()
| | Actual | Predicted |
|---|---|---|
196 | 0.678900 | 0.463670 |
187 | 0.525771 | 0.538209 |
14 | 0.141049 | 0.126886 |
31 | 0.153935 | 0.116398 |
390 | 0.466291 | 0.418788 |
# The coefficients of lr_model (from lr_model.coef_), indexed by the test columns, returned as a dataframe
coef_df = pd.DataFrame(lr_model.coef_, X_test.columns, columns=["Coefficient"])
coef_df.head()
| | Coefficient |
|---|---|
temp | 0.438665 |
humidity | -0.147242 |
windspeed | -0.121544 |
season_Spring | -0.093910 |
season_Summer | 0.006138 |
print('---------------------------------------------------')
print(f'Model_Score: {lr_model.score(X_test, Y_test)}')
print(f'Model_Intercept: {lr_model.intercept_}')
print(f'Mean Absolute Error: {mean_absolute_error(Y_test, Y_pred)}')
print(f'Mean Squared Error: {mean_squared_error(Y_test, Y_pred)}')
print(f'Root Mean Squared Error: {np.sqrt(mean_squared_error(Y_test, Y_pred))}')
print(f'R2_score: {r2_score(Y_test, Y_pred)}')
print(f'Adjusted_R2_score: {adj_r2_score(X_test, Y_test, Y_pred)}')
---------------------------------------------------
Model_Score: 0.8712402002824886
Model_Intercept: 0.30300351026871525
Mean Absolute Error: 0.06804551577481996
Mean Squared Error: 0.007545283334061818
Root Mean Squared Error: 0.08686359038205718
R2_score: 0.8712402002824886
Adjusted_R2_score: 0.8404258892389816
This model's adjusted R² score is about 84%. We now check whether we can reduce the number of features by excluding those that contribute little to explaining the target variable.
6. MLR - Recursive Feature Elimination (RFE)¶
# The model still uses all 28 features; we now perform recursive feature elimination
# to drop the less useful features from the training set
rfe = RFE(estimator=lr_model, n_features_to_select=15, step=1)  # RFE ranks features and removes one at a time until 15 remain
rfe = rfe.fit(X_train, Y_train) # fit the RFE model on the Training data set
# Combine the column names with their RFE support flags and rankings, sorted by rank;
# the best features are assigned rank 1.
a = list(zip(X_train.columns,rfe.support_,rfe.ranking_))
sorted(a,key=lambda x:x[2])
[('temp', True, 1),
 ('humidity', True, 1),
 ('windspeed', True, 1),
 ('season_Spring', True, 1),
 ('season_Winter', True, 1),
 ('year_2019', True, 1),
 ('month_Dec', True, 1),
 ('month_Feb', True, 1),
 ('month_Jan', True, 1),
 ('month_Jul', True, 1),
 ('month_Nov', True, 1),
 ('month_Sep', True, 1),
 ('holiday_NotHoliday', True, 1),
 ('weathersit_LightRain', True, 1),
 ('weathersit_Mist', True, 1),
 ('workingday_Workingday', False, 2),
 ('weekday_Sat', False, 3),
 ('weekday_Sun', False, 4),
 ('month_May', False, 5),
 ('weekday_Mon', False, 6),
 ('weekday_Tue', False, 7),
 ('month_Aug', False, 8),
 ('month_Jun', False, 9),
 ('weekday_Wed', False, 10),
 ('month_Mar', False, 11),
 ('month_Oct', False, 12),
 ('season_Summer', False, 13),
 ('weekday_Thur', False, 14)]
# Top 15 features from RFE
rfe_top_features = X_train.columns[rfe.support_]
rfe_top_features
Index(['temp', 'humidity', 'windspeed', 'season_Spring', 'season_Winter', 'year_2019', 'month_Dec', 'month_Feb', 'month_Jan', 'month_Jul', 'month_Nov', 'month_Sep', 'holiday_NotHoliday', 'weathersit_LightRain', 'weathersit_Mist'], dtype='object')
# Features eliminated by RFE (ranked below the top 15)
rfe_bottom_ranked = X_train.columns[~rfe.support_]
rfe_bottom_ranked
Index(['season_Summer', 'month_Aug', 'month_Jun', 'month_Mar', 'month_May', 'month_Oct', 'weekday_Mon', 'weekday_Sat', 'weekday_Sun', 'weekday_Thur', 'weekday_Tue', 'weekday_Wed', 'workingday_Workingday'], dtype='object')
7. MLR - Model Building and Analysis on Top Features (RFE)¶
# Perform Ordinary least squares Linear Regression.
X_train_rfe = X_train[rfe_top_features]
lr_model = LinearRegression()
lr_model.fit(X_train_rfe,Y_train)
# Get Coefficients of all the trained columns of lr_model using coef_ return it as dataframe
coef_df = pd.DataFrame(lr_model.coef_, X_train_rfe.columns, columns=["Coefficient"])
coef_df
print('---------------------------------------------------')
print(f'Model_Score: {lr_model.score(X_train_rfe, Y_train)}')
print(f'Model_Intercept: {lr_model.intercept_}')
LinearRegression()
| | Coefficient |
|---|---|
temp | 0.406779 |
humidity | -0.142953 |
windspeed | -0.124305 |
season_Spring | -0.098555 |
season_Winter | 0.069679 |
year_2019 | 0.224385 |
month_Dec | -0.064447 |
month_Feb | -0.049548 |
month_Jan | -0.066233 |
month_Jul | -0.068706 |
month_Nov | -0.072105 |
month_Sep | 0.053923 |
holiday_NotHoliday | 0.051792 |
weathersit_LightRain | -0.201380 |
weathersit_Mist | -0.046398 |
---------------------------------------------------
Model_Score: 0.8293483572406961
Model_Intercept: 0.31058533723271714
8. MLR - Model building and Variance Inflation factor analysis using Statsmodel¶
# Create and fit a new OLS model with statsmodels each time this is called
def train_sm(Xtr, Ytr):
    Xtr_sm = sm.add_constant(Xtr)    # add a constant (a column of ones) for the intercept
    lr = sm.OLS(Ytr, Xtr_sm)         # create the ordinary least squares model
    lrn_model = lr.fit()             # fit the model
    l_param = lrn_model.params       # fitted parameters
    l_summary = lrn_model.summary()  # summary table
    return l_summary, l_param, lrn_model, Xtr_sm
# Measure multicollinearity among features via the variance inflation factor (VIF) and return a score per feature
def find_VIF(Xtr):
    vif = pd.DataFrame()
    vif['Features'] = Xtr.columns
    vif['VIF'] = [variance_inflation_factor(Xtr.values, i) for i in range(Xtr.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    vif = vif.sort_values(by="VIF", ascending=False)
    return vif
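For reference, the VIF for feature $i$ comes from regressing that feature on all the other features:
$\mathrm{VIF}_i = \frac{1}{1 - R_i^2}$
where $R_i^2$ is the R² of that auxiliary regression. As a common rule of thumb, VIF values above roughly 5-10 flag problematic multicollinearity.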
Base Model¶
# train_sm function creates a new model whenever invoked
summary, param, sm_model, X_train_sm = train_sm(X_train_rfe, Y_train)
summary, param # summary of the linear model
# find_VIF function return the VIF score for all the features
find_VIF(X_train_rfe)
(<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results
==============================================================================
Dep. Variable:              total_cnt   R-squared:                       0.829
Model:                            OLS   Adj. R-squared:                  0.825
Method:                 Least Squares   F-statistic:                     184.0
Date:                Mon, 23 Dec 2024   Prob (F-statistic):          1.27e-206
Time:                        22:19:02   Log-Likelihood:                 578.83
No. Observations:                 584   AIC:                            -1126.
Df Residuals:                     568   BIC:                            -1056.
Df Model:                          15
Covariance Type:            nonrobust
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.3106      0.035      8.808      0.000       0.241       0.380
temp                     0.4068      0.032     12.858      0.000       0.345       0.469
humidity                -0.1430      0.028     -5.133      0.000      -0.198      -0.088
windspeed               -0.1243      0.019     -6.697      0.000      -0.161      -0.088
season_Spring           -0.0986      0.017     -5.783      0.000      -0.132      -0.065
season_Winter            0.0697      0.013      5.254      0.000       0.044       0.096
year_2019                0.2244      0.008     29.037      0.000       0.209       0.240
month_Dec               -0.0644      0.017     -3.823      0.000      -0.098      -0.031
month_Feb               -0.0495      0.020     -2.454      0.014      -0.089      -0.010
month_Jan               -0.0662      0.020     -3.265      0.001      -0.106      -0.026
month_Jul               -0.0687      0.016     -4.238      0.000      -0.101      -0.037
month_Nov               -0.0721      0.018     -4.045      0.000      -0.107      -0.037
month_Sep                0.0539      0.016      3.473      0.001       0.023       0.084
holiday_NotHoliday       0.0518      0.023      2.225      0.026       0.006       0.098
weathersit_LightRain    -0.2014      0.029     -7.025      0.000      -0.258      -0.145
weathersit_Mist         -0.0464      0.010     -4.533      0.000      -0.067      -0.026
==============================================================================
Omnibus:                       93.598   Durbin-Watson:                   2.028
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              243.452
Skew:                          -0.812   Prob(JB):                     1.36e-53
Kurtosis:                       5.714   Cond. No.                         22.0
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
""",
const                   0.310585
temp                    0.406779
humidity               -0.142953
windspeed               -0.124305
season_Spring           -0.098555
season_Winter            0.069679
year_2019                0.224385
month_Dec               -0.064447
month_Feb               -0.049548
month_Jan               -0.066233
month_Jul               -0.068706
month_Nov               -0.072105
month_Sep                0.053923
holiday_NotHoliday       0.051792
weathersit_LightRain    -0.201380
weathersit_Mist         -0.046398
dtype: float64)
| | Features | VIF |
|---|---|---|
12 | holiday_NotHoliday | 21.000000 |
0 | temp | 17.040000 |
1 | humidity | 14.990000 |
2 | windspeed | 4.980000 |
3 | season_Spring | 4.380000 |
4 | season_Winter | 2.900000 |
14 | weathersit_Mist | 2.450000 |
8 | month_Jan | 2.270000 |
5 | year_2019 | 2.070000 |
7 | month_Feb | 2.030000 |
10 | month_Nov | 1.820000 |
6 | month_Dec | 1.680000 |
9 | month_Jul | 1.450000 |
13 | weathersit_LightRain | 1.350000 |
11 | month_Sep | 1.190000 |
Model 1¶
# 'holiday_NotHoliday' has very high VIF. Hence dropped
X_train_rfe = X_train_rfe.drop(["holiday_NotHoliday"], axis = 1)
summary, param, sm_model, X_train_sm = train_sm(X_train_rfe, Y_train)
summary, param
find_VIF(X_train_rfe)
(<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results
==============================================================================
Dep. Variable:              total_cnt   R-squared:                       0.828
Model:                            OLS   Adj. R-squared:                  0.824
Method:                 Least Squares   F-statistic:                     195.5
Date:                Mon, 23 Dec 2024   Prob (F-statistic):          1.04e-206
Time:                        22:19:02   Log-Likelihood:                 576.29
No. Observations:                 584   AIC:                            -1123.
Df Residuals:                     569   BIC:                            -1057.
Df Model:                          14
Covariance Type:            nonrobust
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.3623      0.027     13.608      0.000       0.310       0.415
temp                     0.4057      0.032     12.782      0.000       0.343       0.468
humidity                -0.1441      0.028     -5.158      0.000      -0.199      -0.089
windspeed               -0.1238      0.019     -6.648      0.000      -0.160      -0.087
season_Spring           -0.0993      0.017     -5.805      0.000      -0.133      -0.066
season_Winter            0.0692      0.013      5.201      0.000       0.043       0.095
year_2019                0.2248      0.008     28.991      0.000       0.210       0.240
month_Dec               -0.0646      0.017     -3.817      0.000      -0.098      -0.031
month_Feb               -0.0498      0.020     -2.460      0.014      -0.090      -0.010
month_Jan               -0.0675      0.020     -3.318      0.001      -0.107      -0.028
month_Jul               -0.0699      0.016     -4.298      0.000      -0.102      -0.038
month_Nov               -0.0753      0.018     -4.225      0.000      -0.110      -0.040
month_Sep                0.0536      0.016      3.438      0.001       0.023       0.084
weathersit_LightRain    -0.1991      0.029     -6.926      0.000      -0.256      -0.143
weathersit_Mist         -0.0457      0.010     -4.456      0.000      -0.066      -0.026
==============================================================================
Omnibus:                       99.586   Durbin-Watson:                   2.030
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              261.798
Skew:                          -0.858   Prob(JB):                     1.42e-57
Kurtosis:                       5.796   Cond. No.                         16.9
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
""",
const                   0.362280
temp                    0.405727
humidity                -0.144122
windspeed               -0.123822
season_Spring           -0.099266
season_Winter            0.069209
year_2019                0.224756
month_Dec               -0.064581
month_Feb               -0.049825
month_Jan               -0.067528
month_Jul               -0.069878
month_Nov               -0.075318
month_Sep                0.053552
weathersit_LightRain    -0.199087
weathersit_Mist         -0.045743
dtype: float64)
| | Features | VIF |
|---|---|---|
1 | humidity | 13.880000 |
0 | temp | 11.990000 |
2 | windspeed | 4.270000 |
3 | season_Spring | 4.080000 |
4 | season_Winter | 2.770000 |
13 | weathersit_Mist | 2.450000 |
8 | month_Jan | 2.220000 |
5 | year_2019 | 2.060000 |
7 | month_Feb | 2.010000 |
10 | month_Nov | 1.810000 |
6 | month_Dec | 1.630000 |
9 | month_Jul | 1.430000 |
12 | weathersit_LightRain | 1.330000 |
11 | month_Sep | 1.190000 |
Model 2¶
# 'humidity' still shows a very high VIF (13.88), indicating multicollinearity with other variables. Hence dropped.
X_train_rfe = X_train_rfe.drop(["humidity"], axis = 1)
summary, param, sm_model, X_train_sm = train_sm(X_train_rfe, Y_train)
summary, param, # summary of the linear model
find_VIF(X_train_rfe)
(<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results
==============================================================================
Dep. Variable:              total_cnt   R-squared:                       0.820
Model:                            OLS   Adj. R-squared:                  0.816
Method:                 Least Squares   F-statistic:                     199.5
Date:                Mon, 23 Dec 2024   Prob (F-statistic):          3.02e-202
Time:                        22:19:02   Log-Likelihood:                 562.95
No. Observations:                 584   AIC:                            -1098.
Df Residuals:                     570   BIC:                            -1037.
Df Model:                          13
Covariance Type:            nonrobust
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.3057      0.025     12.328      0.000       0.257       0.354
temp                     0.3721      0.032     11.718      0.000       0.310       0.435
windspeed               -0.0961      0.018     -5.271      0.000      -0.132      -0.060
season_Spring           -0.1008      0.017     -5.769      0.000      -0.135      -0.066
season_Winter            0.0614      0.014      4.545      0.000       0.035       0.088
year_2019                0.2309      0.008     29.482      0.000       0.215       0.246
month_Dec               -0.0695      0.017     -4.024      0.000      -0.103      -0.036
month_Feb               -0.0483      0.021     -2.334      0.020      -0.089      -0.008
month_Jan               -0.0706      0.021     -3.396      0.001      -0.111      -0.030
month_Jul               -0.0592      0.016     -3.590      0.000      -0.092      -0.027
month_Nov               -0.0755      0.018     -4.141      0.000      -0.111      -0.040
month_Sep                0.0442      0.016      2.794      0.005       0.013       0.075
weathersit_LightRain    -0.2675      0.026    -10.260      0.000      -0.319      -0.216
weathersit_Mist         -0.0775      0.008     -9.242      0.000      -0.094      -0.061
==============================================================================
Omnibus:                       96.811   Durbin-Watson:                   2.034
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              254.529
Skew:                          -0.835   Prob(JB):                     5.37e-56
Kurtosis:                       5.770   Cond. No.                         15.8
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
""",
const                   0.305669
temp                    0.372128
windspeed               -0.096087
season_Spring           -0.100812
season_Winter            0.061423
year_2019                0.230880
month_Dec               -0.069472
month_Feb               -0.048319
month_Jan               -0.070613
month_Jul               -0.059188
month_Nov               -0.075467
month_Sep                0.044193
weathersit_LightRain    -0.267501
weathersit_Mist         -0.077549
dtype: float64)
| | Features | VIF |
|---|---|---|
0 | temp | 4.760000 |
1 | windspeed | 4.230000 |
2 | season_Spring | 3.880000 |
3 | season_Winter | 2.540000 |
7 | month_Jan | 2.170000 |
4 | year_2019 | 2.030000 |
6 | month_Feb | 2.000000 |
9 | month_Nov | 1.790000 |
5 | month_Dec | 1.570000 |
12 | weathersit_Mist | 1.520000 |
8 | month_Jul | 1.370000 |
10 | month_Sep | 1.170000 |
11 | weathersit_LightRain | 1.090000 |
Model 3¶
# 'windspeed' is dropped next to further reduce multicollinearity and simplify the model.
X_train_rfe = X_train_rfe.drop(["windspeed"], axis = 1)
summary, param, sm_model, X_train_sm = train_sm(X_train_rfe, Y_train)
summary, param, # summary of the linear model
find_VIF(X_train_rfe)
(<class 'statsmodels.iolib.summary.Summary'>
"""
                            OLS Regression Results
==============================================================================
Dep. Variable:              total_cnt   R-squared:                       0.811
Model:                            OLS   Adj. R-squared:                  0.807
Method:                 Least Squares   F-statistic:                     204.2
Date:                Mon, 23 Dec 2024   Prob (F-statistic):          1.52e-197
Time:                        22:19:02   Log-Likelihood:                 549.05
No. Observations:                 584   AIC:                            -1072.
Df Residuals:                     571   BIC:                            -1015.
Df Model:                          12
Covariance Type:            nonrobust
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.2465      0.023     10.897      0.000       0.202       0.291
temp                     0.3962      0.032     12.322      0.000       0.333       0.459
season_Spring           -0.1029      0.018     -5.757      0.000      -0.138      -0.068
season_Winter            0.0722      0.014      5.286      0.000       0.045       0.099
year_2019                0.2308      0.008     28.807      0.000       0.215       0.247
month_Dec               -0.0591      0.018     -3.370      0.001      -0.094      -0.025
month_Feb               -0.0424      0.021     -2.002      0.046      -0.084      -0.001
month_Jan               -0.0604      0.021     -2.849      0.005      -0.102      -0.019
month_Jul               -0.0571      0.017     -3.388      0.001      -0.090      -0.024
month_Nov               -0.0756      0.019     -4.055      0.000      -0.112      -0.039
month_Sep                0.0472      0.016      2.920      0.004       0.015       0.079
weathersit_LightRain    -0.2863      0.026    -10.835      0.000      -0.338      -0.234
weathersit_Mist         -0.0773      0.009     -9.009      0.000      -0.094      -0.060
==============================================================================
Omnibus:                       96.575   Durbin-Watson:                   2.075
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              272.387
Skew:                          -0.809   Prob(JB):                     7.11e-60
Kurtosis:                       5.928   Cond. No.                         14.6
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
""",
const                   0.246488
temp                    0.396206
season_Spring           -0.102911
season_Winter            0.072246
year_2019                0.230825
month_Dec               -0.059143
month_Feb               -0.042356
month_Jan               -0.060353
month_Jul               -0.057134
month_Nov               -0.075606
month_Sep                0.047227
weathersit_LightRain    -0.286307
weathersit_Mist         -0.077342
dtype: float64)
| | Features | VIF |
|---|---|---|
| 1 | season_Spring | 3.57 |
| 0 | temp | 2.80 |
| 2 | season_Winter | 2.54 |
| 6 | month_Jan | 2.16 |
| 3 | year_2019 | 2.02 |
| 5 | month_Feb | 2.00 |
| 8 | month_Nov | 1.75 |
| 4 | month_Dec | 1.57 |
| 11 | weathersit_Mist | 1.50 |
| 7 | month_Jul | 1.34 |
| 9 | month_Sep | 1.17 |
| 10 | weathersit_LightRain | 1.07 |
Model 4 - Alternate Model¶
- This model satisfies all the selection criteria and could serve as the final model.
- Both the VIFs and the p-values are within acceptable ranges.
- We will nevertheless check whether the feature set can be reduced further, to guard against overfitting.
# 'month_Feb' has the highest p-value (0.046) among the predictors. Hence dropped.
X_train_rfe = X_train_rfe.drop(["month_Feb"], axis = 1)
summary, param, sm_model, X_train_sm = train_sm(X_train_rfe, Y_train)
summary, param, # summary of the linear model
find_VIF(X_train_rfe)
sm_model_tmp, X_train_sm_tmp = sm_model, X_train_sm # keep Model 4 as the alternate model for later comparison
OLS Regression Results
==============================================================================
Dep. Variable:              total_cnt   R-squared:                       0.810
Model:                            OLS   Adj. R-squared:                  0.806
Method:                 Least Squares   F-statistic:                     221.3
Date:                Mon, 23 Dec 2024   Prob (F-statistic):          7.29e-198
Time:                        22:19:02   Log-Likelihood:                 547.01
No. Observations:                 584   AIC:                            -1070.
Df Residuals:                     572   BIC:                            -1018.
Df Model:                          11
Covariance Type:            nonrobust
========================================================================================
                         coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                  0.2390      0.022     10.685      0.000       0.195       0.283
temp                   0.4075      0.032     12.836      0.000       0.345       0.470
season_Spring         -0.1205      0.016     -7.712      0.000      -0.151      -0.090
season_Winter          0.0705      0.014      5.157      0.000       0.044       0.097
year_2019              0.2305      0.008     28.693      0.000       0.215       0.246
month_Dec             -0.0487      0.017     -2.897      0.004      -0.082      -0.016
month_Jan             -0.0376      0.018     -2.099      0.036      -0.073      -0.002
month_Jul             -0.0593      0.017     -3.517      0.000      -0.092      -0.026
month_Nov             -0.0706      0.019     -3.810      0.000      -0.107      -0.034
month_Sep              0.0472      0.016      2.909      0.004       0.015       0.079
weathersit_LightRain  -0.2852      0.026    -10.766      0.000      -0.337      -0.233
weathersit_Mist       -0.0770      0.009     -8.945      0.000      -0.094      -0.060
==============================================================================
Omnibus:                       90.429   Durbin-Watson:                   2.061
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              256.637
Skew:                          -0.757   Prob(JB):                     1.87e-56
Kurtosis:                       5.873   Cond. No.                         14.2
==============================================================================
Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
| | Features | VIF |
|---|---|---|
| 0 | temp | 2.79 |
| 2 | season_Winter | 2.49 |
| 3 | year_2019 | 2.02 |
| 1 | season_Spring | 1.91 |
| 7 | month_Nov | 1.74 |
| 5 | month_Jan | 1.58 |
| 10 | weathersit_Mist | 1.50 |
| 4 | month_Dec | 1.46 |
| 6 | month_Jul | 1.34 |
| 8 | month_Sep | 1.17 |
| 9 | weathersit_LightRain | 1.07 |
Model 5¶
# 'month_Jan' now has the highest p-value (0.036). Hence dropped.
X_train_rfe = X_train_rfe.drop(["month_Jan"], axis = 1)
summary, param, sm_model, X_train_sm = train_sm(X_train_rfe, Y_train)
summary, param, # summary of the linear model
find_VIF(X_train_rfe)
OLS Regression Results
==============================================================================
Dep. Variable:              total_cnt   R-squared:                       0.808
Model:                            OLS   Adj. R-squared:                  0.805
Method:                 Least Squares   F-statistic:                     241.5
Date:                Mon, 23 Dec 2024   Prob (F-statistic):          4.06e-198
Time:                        22:19:02   Log-Likelihood:                 544.77
No. Observations:                 584   AIC:                            -1068.
Df Residuals:                     573   BIC:                            -1019.
Df Model:                          10
Covariance Type:            nonrobust
========================================================================================
                         coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                  0.2277      0.022     10.457      0.000       0.185       0.270
temp                   0.4245      0.031     13.792      0.000       0.364       0.485
season_Spring         -0.1279      0.015     -8.387      0.000      -0.158      -0.098
season_Winter          0.0710      0.014      5.174      0.000       0.044       0.098
year_2019              0.2302      0.008     28.583      0.000       0.214       0.246
month_Dec             -0.0408      0.016     -2.485      0.013      -0.073      -0.009
month_Jul             -0.0628      0.017     -3.732      0.000      -0.096      -0.030
month_Nov             -0.0661      0.018     -3.580      0.000      -0.102      -0.030
month_Sep              0.0465      0.016      2.858      0.004       0.015       0.078
weathersit_LightRain  -0.2860      0.027    -10.768      0.000      -0.338      -0.234
weathersit_Mist       -0.0767      0.009     -8.889      0.000      -0.094      -0.060
==============================================================================
Omnibus:                       86.217   Durbin-Watson:                   2.072
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              239.004
Skew:                          -0.730   Prob(JB):                     1.26e-52
Kurtosis:                       5.773   Cond. No.                         13.8
==============================================================================
Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
| | Features | VIF |
|---|---|---|
| 0 | temp | 2.77 |
| 2 | season_Winter | 2.47 |
| 3 | year_2019 | 2.02 |
| 6 | month_Nov | 1.73 |
| 9 | weathersit_Mist | 1.50 |
| 4 | month_Dec | 1.42 |
| 5 | month_Jul | 1.34 |
| 1 | season_Spring | 1.29 |
| 7 | month_Sep | 1.17 |
| 8 | weathersit_LightRain | 1.06 |
Model 6¶
# Drop 'month_Nov' to trim the feature set further and check that model quality holds (see the overfitting note above).
X_train_rfe = X_train_rfe.drop(["month_Nov"], axis = 1)
summary, param, sm_model, X_train_sm = train_sm(X_train_rfe, Y_train)
summary, param, # summary of the linear model
find_VIF(X_train_rfe)
OLS Regression Results
==============================================================================
Dep. Variable:              total_cnt   R-squared:                       0.804
Model:                            OLS   Adj. R-squared:                  0.801
Method:                 Least Squares   F-statistic:                     261.5
Date:                Mon, 23 Dec 2024   Prob (F-statistic):          1.35e-196
Time:                        22:19:02   Log-Likelihood:                 538.31
No. Observations:                 584   AIC:                            -1057.
Df Residuals:                     574   BIC:                            -1013.
Df Model:                           9
Covariance Type:            nonrobust
========================================================================================
                         coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                  0.2086      0.021      9.782      0.000       0.167       0.251
temp                   0.4508      0.030     14.932      0.000       0.392       0.510
season_Spring         -0.1199      0.015     -7.865      0.000      -0.150      -0.090
season_Winter          0.0493      0.012      3.964      0.000       0.025       0.074
year_2019              0.2294      0.008     28.200      0.000       0.213       0.245
month_Dec             -0.0194      0.015     -1.254      0.210      -0.050       0.011
month_Jul             -0.0671      0.017     -3.955      0.000      -0.100      -0.034
month_Sep              0.0500      0.016      3.051      0.002       0.018       0.082
weathersit_LightRain  -0.2827      0.027    -10.540      0.000      -0.335      -0.230
weathersit_Mist       -0.0730      0.009     -8.437      0.000      -0.090      -0.056
==============================================================================
Omnibus:                       80.491   Durbin-Watson:                   2.075
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              203.056
Skew:                          -0.713   Prob(JB):                     8.07e-45
Kurtosis:                       5.513   Cond. No.                         13.4
==============================================================================
Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
| | Features | VIF |
|---|---|---|
| 0 | temp | 2.76 |
| 3 | year_2019 | 2.02 |
| 2 | season_Winter | 1.54 |
| 8 | weathersit_Mist | 1.49 |
| 5 | month_Jul | 1.34 |
| 1 | season_Spring | 1.29 |
| 4 | month_Dec | 1.27 |
| 6 | month_Sep | 1.17 |
| 7 | weathersit_LightRain | 1.06 |
Model 7 - Final Model¶
# 'month_Dec' is insignificant in the presence of the other variables (p = 0.210). Hence dropped.
X_train_rfe = X_train_rfe.drop(["month_Dec"], axis = 1)
summary, param, sm_model, X_train_sm = train_sm(X_train_rfe, Y_train)
summary, param, # summary of the linear model
find_VIF(X_train_rfe)
OLS Regression Results
==============================================================================
Dep. Variable:              total_cnt   R-squared:                       0.803
Model:                            OLS   Adj. R-squared:                  0.801
Method:                 Least Squares   F-statistic:                     293.7
Date:                Mon, 23 Dec 2024   Prob (F-statistic):          1.67e-197
Time:                        22:19:02   Log-Likelihood:                 537.51
No. Observations:                 584   AIC:                            -1057.
Df Residuals:                     575   BIC:                            -1018.
Df Model:                           8
Covariance Type:            nonrobust
========================================================================================
                         coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                  0.2049      0.021      9.697      0.000       0.163       0.246
temp                   0.4567      0.030     15.304      0.000       0.398       0.515
season_Spring         -0.1198      0.015     -7.855      0.000      -0.150      -0.090
season_Winter          0.0461      0.012      3.785      0.000       0.022       0.070
year_2019              0.2292      0.008     28.170      0.000       0.213       0.245
month_Jul             -0.0683      0.017     -4.030      0.000      -0.102      -0.035
month_Sep              0.0506      0.016      3.089      0.002       0.018       0.083
weathersit_LightRain  -0.2841      0.027    -10.598      0.000      -0.337      -0.231
weathersit_Mist       -0.0735      0.009     -8.500      0.000      -0.091      -0.057
==============================================================================
Omnibus:                       77.442   Durbin-Watson:                   2.075
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              189.872
Skew:                          -0.696   Prob(JB):                     5.89e-42
Kurtosis:                       5.422   Cond. No.                         13.2
==============================================================================
Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
| | Features | VIF |
|---|---|---|
| 0 | temp | 2.75 |
| 3 | year_2019 | 2.02 |
| 7 | weathersit_Mist | 1.48 |
| 2 | season_Winter | 1.34 |
| 4 | month_Jul | 1.34 |
| 1 | season_Spring | 1.25 |
| 5 | month_Sep | 1.16 |
| 6 | weathersit_LightRain | 1.06 |
Both the VIFs and the p-values are now within acceptable ranges, so we proceed with this model for our predictions.
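For reference, the VIF of feature $j$ is $VIF_j = \frac{1}{1 - R_j^2}$, where $R_j^2$ comes from regressing feature $j$ on all the other features; values below 5 (ours are all under 3) indicate low multicollinearity. The find_VIF helper is defined earlier in the notebook; a minimal sketch of how such a helper can be built with statsmodels' variance_inflation_factor (an assumption, the actual definition may differ):
# Hypothetical sketch; the notebook's own find_VIF may differ
def find_VIF_sketch(df):
    vif = pd.DataFrame()
    vif['Features'] = df.columns
    # variance_inflation_factor regresses column i on all the other columns
    vif['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif.sort_values(by='VIF', ascending=False).reset_index(drop=True)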
9. MLR - Residual Analysis of the training data¶
y_train_pred = sm_model.predict(X_train_sm)
# Plot the histogram of the error terms
fig = plt.figure()
sns.histplot(Y_train - y_train_pred, bins=20, kde=True)
fig.suptitle('Error Terms', fontsize=20) # plot heading
plt.xlabel('Errors', fontsize=18)
plt.show();
The error terms are approximately normally distributed and centred around zero, validating a key assumption of linear regression.
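Normality of the residuals is only one assumption. A residual-versus-fitted scatter, a minimal sketch reusing y_train_pred from above, also helps confirm homoscedasticity, i.e. that the error variance stays roughly constant across the range of predictions:
# Residuals vs. fitted values: points should scatter randomly around the zero line
fig = plt.figure()
sns.scatterplot(x=y_train_pred, y=(Y_train - y_train_pred))
plt.axhline(0, color='red', linestyle='--') # zero-error reference line
fig.suptitle('Residuals vs Fitted Values', fontsize=20)
plt.xlabel('Fitted values', fontsize=18)
plt.ylabel('Residuals', fontsize=18)
plt.show();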
10. MLR - Predictions on the Test Data Using the Final Model¶
# Dividing into X_test and Y_test
Y_test = df_test.pop('total_cnt')
X_test = df_test
# Use the trained model to make predictions on the test set.
# Build X_test_new by keeping only the features used in the final model
final_features = X_train_rfe.columns
X_test_new = X_test[final_features]
X_test_new = sm.add_constant(X_test_new) # Adding a constant variable
Y_test_pred = sm_model.predict(X_test_new) # Making predictions
Y_comp_df = pd.DataFrame({"Actual" : Y_test, "Predicted" : Y_test_pred})
Y_comp_df.head()
| | Actual | Predicted |
|---|---|---|
| 196 | 0.678900 | 0.493691 |
| 187 | 0.525771 | 0.529732 |
| 14 | 0.141049 | 0.110694 |
| 31 | 0.153935 | 0.087272 |
| 390 | 0.466291 | 0.401564 |
# Find the most influential features of the model (ranked by absolute coefficient)
coefficients = sm_model.params
# Get the feature names
feature_names = final_features
# Create a dataframe to store the top_features
top_features = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients[1:]})
# Sort the dataframe by coefficient absolute value
top_features = top_features.sort_values(by='coefficient', key=abs, ascending=False)
print(top_features.reset_index(drop=True).head(5))
                feature  coefficient
0                  temp     0.456706
1  weathersit_LightRain    -0.284113
2             year_2019     0.229219
3         season_Spring    -0.119780
4       weathersit_Mist    -0.073536
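Note that adj_r2_score below is a custom helper defined earlier in the notebook. For reference, a minimal sketch of the standard adjusted-R² formula it is assumed to implement, where n is the number of samples and p the number of predictors:
def adj_r2_score_sketch(X, y_true, y_pred):
    # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
    n, p = X.shape
    r2 = r2_score(y_true, y_pred)
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)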
print('---------------------------------------------------')
print(f'Mean Absolute Error: {mean_absolute_error(Y_test, Y_test_pred)}')
print(f'Mean Absolute Percentage Error: {mean_absolute_percentage_error(Y_test, Y_test_pred)}')
print(f'Mean Squared Error: {mean_squared_error(Y_test, Y_test_pred)}')
print(f'Root Mean Squared Error: {np.sqrt(mean_squared_error(Y_test, Y_test_pred))}')
print(f'R2_score: {r2_score(Y_test, Y_test_pred)}')
print(f'Adjusted_R2_score: {adj_r2_score(X_test, Y_test, Y_test_pred)}')
print(f'Adjusted_R2_score using sm_model_rsq_adj: {sm_model.rsquared_adj}')
---------------------------------------------------
Mean Absolute Error: 0.07332379465688707
Mean Absolute Percentage Error: 0.21722130847518956
Mean Squared Error: 0.008736601751308767
Root Mean Squared Error: 0.09346979058128228
R2_score: 0.8509104241809033
Adjusted_R2_score: 0.8152308675746238
Adjusted_R2_score using sm_model_rsq_adj: 0.8006724734729809
The test R² and adjusted R² are roughly 0.85 and 0.82 respectively, close to the training values, so we can conclude that the model generalizes well to unseen data. (The MAPE of about 21.7% is inflated partly because the target is MinMax-scaled, so small actual values blow up percentage errors.)
11. MLR - Model Evaluation¶
# Plotting y_test and y_pred to understand the spread.
fig = plt.figure()
sns.scatterplot(x=Y_test,y=Y_test_pred)
fig.suptitle('y_test vs y_pred', fontsize=20)
plt.xlabel('y_test', fontsize=18)
plt.ylabel('y_pred', fontsize=16)
plt.show();
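A perfect model would place every point on the diagonal $y = x$. Overlaying that reference line (a minimal sketch; the target was MinMax-scaled to [0, 1]) makes the spread easier to judge:
# Overlay the ideal-fit line y = x on the actual-vs-predicted scatter
fig = plt.figure()
sns.scatterplot(x=Y_test, y=Y_test_pred)
plt.plot([0, 1], [0, 1], color='red', linestyle='--') # ideal fit: predicted == actual
fig.suptitle('y_test vs y_pred with ideal fit', fontsize=20)
plt.xlabel('y_test', fontsize=18)
plt.ylabel('y_pred', fontsize=16)
plt.show();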
We can see that the equation of our best-fit line is:
$ total\_cnt = 0.204875 + 0.456706 \times temp - 0.119780 \times season\_Spring + 0.046074 \times season\_Winter + 0.229219 \times year\_2019 - 0.068302 \times month\_Jul + 0.050646 \times month\_Sep - 0.284113 \times weathersit\_LightRain - 0.073536 \times weathersit\_Mist $
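As a quick sanity check, we can plug hypothetical inputs into this equation, say a 2019 September day (year_2019 = 1, month_Sep = 1, all season and weather dummies 0) with a scaled temperature of 0.60; these values are illustrative, not taken from the dataset:
$ total\_cnt = 0.204875 + 0.456706 \times 0.60 + 0.229219 + 0.050646 \approx 0.759 $
That is, a predicted demand of roughly 76% of the maximum observed count.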
Bike sharing demand prediction - inference:
- People usually prefer bikes on cloudy days rather than rainy or snowy ones.
- Demand grew strongly in 2019; even though the pandemic has hurt the business, there is a good chance of renewed growth once conditions return to normal.
- People prefer to ride when the temperature is moderate.
- Most people prefer to use bikes in the fall.
- BoomBikes is therefore advised to plan and forecast ahead for these situations so it can meet user demand, which should ultimately bring more users to the platform, positive growth, and the much-needed return on investment.
Note:
When you perform dummy encoding (also known as one-hot encoding) in machine learning, the first category is often implicit, meaning it's not explicitly represented as a column. This is done to avoid multicollinearity, which can lead to unstable estimates of the regression coefficients.
Notice that the `cloudy` category is not explicitly represented as a column. This is because the first category (`cloudy`) is implicit, and the coefficients for the other categories (`LightRain` and `Mist`) are estimated relative to this implicit category.
Now, let's say we want to interpret the final model. We can write the model equation as:
y = β0 + β1 * LightRain + β2 * Mist + ε
where y is the response variable, β0 is the intercept, β1 and β2 are the coefficients for the `LightRain` and `Mist` categories respectively, and ε is the error term.
To infer the effect of the `cloudy` category, we need to consider the implicit category. We can do this by setting the `LightRain` and `Mist` variables to 0, which effectively selects the `cloudy` category. The predicted value for the `cloudy` category is then:
y = β0 + 0 * β1 + 0 * β2 = β0
So, the intercept (β0) represents the expected value of the response variable when the `weathersit` variable is `cloudy`.
To illustrate this with an example, let's say we have the following estimated coefficients:
- β0 = 10, β1 = -2, β2 = -3
The predicted values for each category would then be:
- `cloudy`: y = 10 (β0 alone)
- `LightRain`: y = 10 - 2 = 8
- `Mist`: y = 10 - 3 = 7
In this example, the implicit `cloudy` category has an expected value of 10, while the `LightRain` and `Mist` categories have expected values of 8 and 7 respectively, relative to the `cloudy` baseline.
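A minimal sketch of how such an encoding is typically produced with pandas (a hypothetical toy frame; the notebook's actual dataframe and category labels may differ):
# Hypothetical toy example: dummy-encode a weather column; drop_first=True removes
# the first (alphabetically) category so it becomes the implicit baseline
weather = pd.DataFrame({'weathersit': ['Cloudy', 'LightRain', 'Mist', 'Cloudy']})
dummies = pd.get_dummies(weather['weathersit'], drop_first=True)
print(dummies)
#    LightRain   Mist
# 0      False  False   <- both zero: the implicit 'Cloudy' baseline
# 1       True  False
# 2      False   True
# 3      False  False
# (older pandas versions print 0/1 instead of False/True)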