allData <- read.csv("ChrisData.csv")

# 80/20 train/test split of the full data set
ndata <- nrow(allData)
set.seed(5)
train <- sample(ndata, ndata*0.8)
data_all_train <- allData[train, ]
data_all_test <- allData[-train, ]

# Baseline model: regress Performance on every column of the raw data
lm_all_data <- lm(Performance ~ ., data = data_all_train)
summary(lm_all_data)

# Drop columns 3 and 15, then remove rows with missing values
data <- allData[, c(-3, -15)]
dim(data)
data <- na.omit(data)

library(leaps)     # regsubsets() for best-subset selection
library(ggplot2)

# Re-split and refit the full model on the cleaned data
ndata <- nrow(data)
set.seed(5)
train <- sample(ndata, ndata*0.8)
data_train <- data[train, ]
data_test <- data[-train, ]

lm_all <- lm(Performance ~ ., data = data_train)
summary(lm_all)

pred_test_all <- predict(lm_all, newdata = data_test)
MSE_all <- mean((data_test$Performance - pred_test_all)^2)
MSE_all # mean square error 120.9725

# Find best subset
best_subset <- regsubsets(Performance ~ ., data = data_train, nvmax = 29)
best_subset_summary <- summary(best_subset)
best_adjr2 <- which.max(best_subset_summary$adjr2)
best_rss <- which.min(best_subset_summary$rss)
best_cp <- which.min(best_subset_summary$cp)
best_bic <- which.min(best_subset_summary$bic)

best_rss; best_bic; best_cp; best_adjr2 #rss = 29, bic = 5, cp = 12, adjr2 = 17
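
# A minimal sketch (not in the original script): regsubsets() has no predict()
# method, so the validation-set MSE of every subset size can be computed by
# hand from the model matrix. The helper name predict_regsubsets is made up
# here for illustration.
predict_regsubsets <- function(object, newdata, id) {
  form <- as.formula(object$call[[2]])   # formula passed to regsubsets()
  mat <- model.matrix(form, newdata)     # design matrix for the new data
  coefi <- coef(object, id = id)         # coefficients of the id-variable model
  drop(mat[, names(coefi)] %*% coefi)
}

val_mse <- sapply(1:29, function(k) {
  mean((data_test$Performance - predict_regsubsets(best_subset, data_test, k))^2)
})
which.min(val_mse)   # subset size with the lowest validation-set MSE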

par(mfrow = c(2,2))
plot(best_subset_summary$rss, xlab = "Number of Variables", ylab = "RSS", 
     main = "No. of variables vs RSS", type = 'l')
points(best_rss, best_subset_summary$rss[best_rss], col = "mediumpurple", pch = "x")

plot(best_subset_summary$adjr2, xlab = "Number of Variables", 
     ylab = "Adjusted R^2", main = "No. of variables vs Adjusted R^2", 
     type = 'l')
points(best_adjr2, best_subset_summary$adjr2[best_adjr2], col = "olivedrab", 
       pch = "x")

plot(best_subset_summary$cp, xlab = "Number of Variables", ylab = "Mallows CP", 
     main = "No. of variables vs Mallows CP", type = 'l')
points(best_cp, best_subset_summary$cp[best_cp], col = "lightseagreen", pch = "x")

plot(best_subset_summary$bic, xlab = "Number of Variables", ylab = "BIC", 
     main = "No. of variables BIC", type = 'l')
points(best_bic, best_subset_summary$bic[best_bic], col = "orchid", pch = "x")

# BIC model (5 variables)
coef(best_subset, 5) # Variables in the selected BIC model

lm_bic <- lm(Performance ~ JobRotation + Mentoring + Autonomy + Tools + 
               Training, data = data_train)
summary(lm_bic)
pred_test_bic <- predict(lm_bic, newdata = data_test)
MSE_bic <- mean((data_test$Performance - pred_test_bic)^2)
MSE_bic # 121.0996

# Cp model (12 variables)
coef(best_subset, 12) # Variables in the selected Cp model

lm_cp <- lm(Performance ~ DistanceFromHome + JobInvolvement + JobLevel + 
              JobRotation + FeedbackFromManager + Mentoring + Proactivity + 
              Autonomy + MonthlyIncome + OverTime + Tools + Training, 
            data = data_train)
summary(lm_cp)
pred_test_cp <- predict(lm_cp, newdata = data_test)
MSE_cp <- mean((data_test$Performance - pred_test_cp)^2)
MSE_cp # 117.7484

# Adjusted R^2 model (17 variables)
coef(best_subset, 17) # Variables in the selected adjusted R^2 model

lm_adjr2 <- lm(Performance ~ DistanceFromHome + EnvironmentSatisfaction + 
                 JobInvolvement + JobLevel + JobRotation + FeedbackFromManager + 
                 Mentoring + Proactivity + Autonomy + MonthlyIncome + OverTime + 
                 PercentSalaryHike + StockOptionLevel + Tools + TotalWorkingYears + 
                 Training + YearsWithCurrManager, data = data_train)
summary(lm_adjr2)
pred_test_adjr2 <- predict(lm_adjr2, newdata = data_test)
MSE_adjr2 <- mean((data_test$Performance - pred_test_adjr2)^2)
MSE_adjr2 # 119.9606

labels <- c("Full (RSS)", "BIC", "Cp", "Adj R^2")
MSE <- c(MSE_all, MSE_bic, MSE_cp, MSE_adjr2)
n_vars <- c(best_rss, best_bic, best_cp, best_adjr2)
# `results` is already a data frame (no further conversion needed) and avoids
# masking base::table()
results <- data.frame(criterion = labels, test_MSE = MSE, n_vars = n_vars)
results
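
# ggplot2 is loaded above but otherwise unused; a minimal sketch of a bar
# chart comparing the four test MSEs (column names as defined in `results`).
ggplot(results, aes(x = criterion, y = test_MSE)) +
  geom_col() +
  labs(x = "Selection criterion", y = "Test MSE",
       title = "Test MSE by subset-selection criterion")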

(MSE_bic - MSE_cp)/MSE_cp # the Cp model's test MSE is ~2.8% lower than the BIC model's

# Average value of each predictor in the Cp model; factor predictors are
# converted to 0/1 dummies first, so their "average" is the proportion of
# employees at the indicated level
avg_DistanceFromHome <- mean(data$DistanceFromHome)
avg_JobInvolvement <- mean(data$JobInvolvement)
avg_JobLevel <- mean(data$JobLevel)
avg_JobRotation <- mean(data$JobRotation)
feedbackH <- ifelse(data$FeedbackFromManager == "H", 1, 0)
avg_FeedbackFromManager <- mean(feedbackH)
mentoringY <- ifelse(data$Mentoring == "Y", 1, 0)
avg_Mentoring <- mean(mentoringY)
avg_Proactivity <- mean(data$Proactivity)
autonomyY <- ifelse(data$Autonomy == "Y", 1, 0)
avg_Autonomy <- mean(autonomyY)
avg_MonthlyIncome <- mean(data$MonthlyIncome)
overtimeY <- ifelse(data$OverTime == "Yes", 1, 0)
avg_OverTime <- mean(overtimeY)
toolsH <- ifelse(data$Tools == "H", 1, 0)
avg_Tools <- mean(toolsH)
trainingY <- ifelse(data$Training == "Y", 1, 0)
avg_Training <- mean(trainingY)

# Coefficients of the 12-variable Cp model fitted above, hard-coded for the
# manual predictions below
intercept <- 33.3116708780
DistanceFromHome <- -0.0697216650
JobInvolvement <- 1.1496629440
JobLevel <- -1.2807453219 
JobRotation <- 2.6515473427
FeedbackFromManager <- -1.6901699335
Mentoring <- 6.6006777297
Proactivity <- 0.5988167975
Autonomy <- 4.3709193766
MonthlyIncome <- 0.0003627075 
OverTime <- -1.5298041395
Tools <- -2.3458114720
Training <- 14.3693398245

# Baseline prediction from the Cp model: every predictor at its average value
prediction_cp <- (intercept +
                    DistanceFromHome*avg_DistanceFromHome +
                    JobInvolvement*avg_JobInvolvement +
                    JobLevel*avg_JobLevel +
                    JobRotation*avg_JobRotation +
                    FeedbackFromManager*avg_FeedbackFromManager +
                    Mentoring*avg_Mentoring +
                    Proactivity*avg_Proactivity +
                    Autonomy*avg_Autonomy +
                    MonthlyIncome*avg_MonthlyIncome +
                    OverTime*avg_OverTime +
                    Tools*avg_Tools +
                    Training*avg_Training)
prediction_cp # 55.66472
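
# A minimal sketch (not in the original script) of the same baseline figure
# without hand-typing coefficients: dot the column means of lm_cp's design
# matrix (dummy columns included) with the fitted coefficients. The means are
# taken over data_train, so the result is close to, but not identical to,
# prediction_cp, which uses full-data averages.
X_cp <- model.matrix(lm_cp)
sum(colMeans(X_cp) * coef(lm_cp))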

# Scenario: set the FeedbackFromManager, Mentoring, Tools and Training dummies
# to 1 for everyone, keeping the other predictors at their averages
improved_prediction_cp <- (intercept +
                             DistanceFromHome*avg_DistanceFromHome +
                             JobInvolvement*avg_JobInvolvement +
                             JobLevel*avg_JobLevel +
                             JobRotation*avg_JobRotation +
                             FeedbackFromManager*1 +
                             Mentoring*1 +
                             Proactivity*avg_Proactivity +
                             Autonomy*avg_Autonomy +
                             MonthlyIncome*avg_MonthlyIncome +
                             OverTime*avg_OverTime +
                             Tools*1 +
                             Training*1)
improved_prediction_cp # 62.15957

(improved_prediction_cp - prediction_cp)/prediction_cp # ~11.7% increase over the baseline prediction
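
# A minimal sketch of the same improved scenario via the averaged design
# vector. The dummy-column names below (FeedbackFromManagerH, MentoringY,
# ToolsH, TrainingY) are assumptions about how R encodes these factors --
# check colnames(model.matrix(lm_cp)) before relying on them.
x_improved <- colMeans(model.matrix(lm_cp))
x_improved[c("FeedbackFromManagerH", "MentoringY", "ToolsH", "TrainingY")] <- 1  # assumed dummy names
sum(x_improved * coef(lm_cp))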

# BIC PREDICTIONS
# Note: for simplicity these reuse the Cp model's intercept and slopes; the
# BIC model's own coefficients (coef(lm_bic)) would differ slightly.

# Baseline: every predictor in the BIC model at its average value
prediction_bic <- (intercept +
                     JobRotation*avg_JobRotation +
                     Mentoring*avg_Mentoring +
                     Autonomy*avg_Autonomy +
                     Tools*avg_Tools +
                     Training*avg_Training)
prediction_bic

# Same scenario as above: Mentoring, Tools and Training dummies set to 1
improvement_prediction_bic <- (intercept +
                                 JobRotation*avg_JobRotation +
                                 Mentoring*1 +
                                 Autonomy*avg_Autonomy +
                                 Tools*1 +
                                 Training*1)
improvement_prediction_bic
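
# A minimal sketch (not in the original script): the same baseline figure can
# be computed from the BIC model's own fitted coefficients instead of the Cp
# ones, e.g. via the column means of its design matrix over the training data.
X_bic <- model.matrix(lm_bic)
sum(colMeans(X_bic) * coef(lm_bic))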