Untitled
unknown
plain_text
3 years ago
6.9 kB
10
Indexable
# Load the raw dataset and fit a first-pass linear model on every column,
# to inspect all predictors before any cleaning.
allData <- read.csv("ChrisData.csv")
ndata <- nrow(allData)
# Fixed seed so the 80/20 train/test split is reproducible
set.seed(5)
# sample() truncates ndata*0.8 to an integer row count
train <- sample(ndata, ndata*0.8)
data_all_train <- allData[train, ]
data_all_test <- allData[-train, ]
# Full model: Performance regressed on all other columns (uncleaned data)
lm_all_data <- lm(Performance ~ ., data = data_all_train)
summary(lm_all_data)
# Modelling packages (ggplot2 attached here but not used below — kept as-is)
library(leaps)
library("ggplot2")
# Drop columns 3 and 15, report the shape, then discard incomplete rows.
# NOTE(review): dropping by numeric index is fragile — confirm which columns
# these are and consider dropping by name.
data <- allData[, c(-3, -15)]
dim(data)
data <- na.omit(data)
# Reproducible 80/20 train/test split on the cleaned data
ndata <- nrow(data)
set.seed(5)
train <- sample(ndata, ndata*0.8)
data_train <- data[train, ]
data_test <- data[-train, ]
# Baseline: Performance on every remaining predictor
lm_all <- lm(Performance ~ ., data = data_train)
summary(lm_all)
all_test_preds <- predict(lm_all, newdata = data_test)
MSE_all <- mean((data_test$Performance - all_test_preds)^2)
MSE_all # mean square error 120.9725
# Find best subset
# Exhaustive best-subset search; nvmax = 29 allows models up to 29 terms
# (factor predictors expand into dummy columns)
best_subset <- regsubsets(Performance ~ ., data = data_train, nvmax = 29)
best_subset_summary <- summary(best_subset)
# Model size picked by each criterion. Adjusted R^2 is maximised; RSS always
# favours the largest model, so it serves only as a sanity check.
best_adjr2 <- which.max(best_subset_summary$adjr2)
best_rss <- which.min(best_subset_summary$rss)
best_cp <- which.min(best_subset_summary$cp)
best_bic <- which.min(best_subset_summary$bic)
best_rss; best_bic; best_cp; best_adjr2 #rss = 29, bic = 5, cp = 12, adjr2 = 17
# One panel per selection criterion, each marking its chosen model size.
# Configs are looped over instead of repeating four plot/points pairs.
criterion_panels <- list(
  list(vals = best_subset_summary$rss, best = best_rss, ylab = "RSS",
       main = "No. of variables vs RSS", col = "mediumpurple"),
  list(vals = best_subset_summary$adjr2, best = best_adjr2,
       ylab = "Adjusted R^2", main = "No. of variables vs Adjusted R^2",
       col = "olivedrab"),
  list(vals = best_subset_summary$cp, best = best_cp, ylab = "Mallows CP",
       main = "No. of variables vs Mallows CP", col = "lightseagreen"),
  list(vals = best_subset_summary$bic, best = best_bic, ylab = "BIC",
       main = "No. of variables BIC", col = "orchid")
)
par(mfrow = c(2,2))
for (panel in criterion_panels) {
  plot(panel$vals, xlab = "Number of Variables", ylab = panel$ylab,
       main = panel$main, type = 'l')
  points(panel$best, panel$vals[panel$best], col = panel$col, pch = "x")
}
# Bic Model
coef(best_subset, 5) # Variables in the selected BIC model
# Refit the 5-predictor model chosen by BIC and score it on the test set
lm_bic <- lm(
  Performance ~ JobRotation + Mentoring + Autonomy + Tools + Training,
  data = data_train
)
summary(lm_bic)
bic_test_preds <- predict(lm_bic, newdata = data_test)
MSE_bic <- mean((data_test$Performance - bic_test_preds)^2)
MSE_bic # 121.0996
# Cp Model
# (comment below corrected: this is the Cp-selected model, not BIC; also
# removed a stray duplicated "+" that parsed as a unary plus on JobLevel)
coef(best_subset, 12) # Variables in the selected Cp model
lm_cp <- lm(Performance ~ DistanceFromHome + JobInvolvement + JobLevel +
              JobRotation + FeedbackFromManager + Mentoring + Proactivity +
              Autonomy + MonthlyIncome + OverTime + Tools + Training,
            data = data_train)
summary(lm_cp)
pred_test_cp <- predict(lm_cp, newdata = data_test)
MSE_cp <- mean((data_test$Performance - pred_test_cp)^2)
MSE_cp # 117.7484
# Adj R^2 Model
# (comment corrected: this is the Adjusted R^2 model, not BIC)
coef(best_subset, 17) # Variables in the selected Adjusted R^2 model
lm_adjr2 <- lm(Performance ~ DistanceFromHome + EnvironmentSatisfaction +
JobInvolvement + JobLevel + JobRotation + FeedbackFromManager +
Mentoring + Proactivity + Autonomy + MonthlyIncome + OverTime +
PercentSalaryHike + StockOptionLevel + Tools + TotalWorkingYears +
Training + YearsWithCurrManager, data = data_train)
summary(lm_adjr2)
pred_test_adjr2 <- predict(lm_adjr2, newdata = data_test)
MSE_adjr2 <- mean((data_test$Performance - pred_test_adjr2)^2)
MSE_adjr2 # 119.9606
# Summarise test MSE and chosen model size per criterion.
# (RSS selects the full 29-term model, so it is paired with MSE_all.)
labels <- c("RSS", "BIC", "CP", "Adj R^2")
MSE <- c(MSE_all, MSE_bic, MSE_cp, MSE_adjr2)
var <- c(best_rss, best_bic, best_cp, best_adjr2)
table <- data.frame(labels, MSE, var)
table
# data.frame() already returns a data.frame; the original as.data.frame()
# wrapper was a no-op
table_plot <- table
table_plot
# Relative MSE advantage of the Cp model over the BIC model, computed from
# the actual values rather than the hand-rounded (121-117)/117
(MSE_bic - MSE_cp)/MSE_cp # ~3.4% better MSE
# Average of each Cp-model predictor; factor predictors are dummy-coded
# first, so their "average" is the share of rows at the indicated level.
avg_DistanceFromHome <- mean(data$DistanceFromHome)
avg_JobInvolvement <- mean(data$JobInvolvement)
avg_JobLevel <- mean(data$JobLevel)
avg_JobRotation <- mean(data$JobRotation)
feedbackH <- ifelse(data$FeedbackFromManager == "H", 1, 0)
avg_FeedbackFromManager <- mean(feedbackH)
mentoringY <- ifelse(data$Mentoring == "Y", 1, 0)
avg_Mentoring <- mean(mentoringY)
avg_Proactivity <- mean(data$Proactivity)
autonomyY <- ifelse(data$Autonomy == "Y", 1, 0)
avg_Autonomy <- mean(autonomyY)
avg_MonthlyIncome <- mean(data$MonthlyIncome)
# BUG FIX: the original compared data$Tools here, so avg_OverTime was
# actually a Tools proportion. NOTE(review): level "Yes" vs the "Y"/"H"
# coding of the other factors — confirm against the actual data.
overtimeY <- ifelse(data$OverTime == "Yes", 1, 0)
avg_OverTime <- mean(overtimeY)
toolsH <- ifelse(data$Tools == "H", 1, 0)
# BUG FIX: the original took mean(toolsY), an object that is never defined
avg_Tools <- mean(toolsH)
trainingY <- ifelse(data$Training == "Y", 1, 0)
avg_Training <- mean(trainingY)
# Hard-coded regression coefficients, presumably copied from summary(lm_cp).
# For factor predictors these apply to the dummy level (e.g. MentoringY).
# NOTE(review): pull these via coef(lm_cp) instead so they cannot drift from
# the fitted model — confirm each value against the model output.
intercept <- 33.3116708780
DistanceFromHome <- -0.0697216650
JobInvolvement <- 1.1496629440
JobLevel <- -1.2807453219
JobRotation <- 2.6515473427
FeedbackFromManager <- -1.6901699335
Mentoring <- 6.6006777297
Proactivity <- 0.5988167975
Autonomy <- 4.3709193766
MonthlyIncome <- 0.0003627075
OverTime <- -1.5298041395
Tools <- -2.3458114720
Training <- 14.3693398245
# Predicted Performance for an "average" employee under the Cp model:
# intercept plus the dot product of coefficients and average inputs
# (same terms and order as the original chained sum).
cp_coefs <- c(DistanceFromHome, JobInvolvement, JobLevel, JobRotation,
              FeedbackFromManager, Mentoring, Proactivity, Autonomy,
              MonthlyIncome, OverTime, Tools, Training)
cp_avg_inputs <- c(avg_DistanceFromHome, avg_JobInvolvement, avg_JobLevel,
                   avg_JobRotation, avg_FeedbackFromManager, avg_Mentoring,
                   avg_Proactivity, avg_Autonomy, avg_MonthlyIncome,
                   avg_OverTime, avg_Tools, avg_Training)
prediction_cp <- intercept + sum(cp_coefs * cp_avg_inputs)
prediction_cp # 55.66472
# Scenario: set each actionable binary driver (manager feedback = H,
# mentoring = Y, tools = H, training = Y) to 1 while holding the remaining
# predictors at their averages.
improved_coefs <- c(DistanceFromHome, JobInvolvement, JobLevel, JobRotation,
                    FeedbackFromManager, Mentoring, Proactivity, Autonomy,
                    MonthlyIncome, OverTime, Tools, Training)
improved_inputs <- c(avg_DistanceFromHome, avg_JobInvolvement, avg_JobLevel,
                     avg_JobRotation, 1, 1, avg_Proactivity, avg_Autonomy,
                     avg_MonthlyIncome, avg_OverTime, 1, 1)
improved_prediction_cp <- intercept + sum(improved_coefs * improved_inputs)
improved_prediction_cp # 62.15957
# Relative gain over the baseline. BUG FIX: the original divided by the
# improved value ((62.2 - 55.7)/62.2 = 10.4%); a percent increase is measured
# against the baseline, giving ~11.7%. Hard-coded numbers replaced with the
# computed predictions.
(improved_prediction_cp - prediction_cp)/prediction_cp
# BIC PREDICTIONS
# NOTE(review): these predictions reuse the Cp-model coefficients (intercept,
# JobRotation, Mentoring, ...) for the 5-variable BIC model. The BIC model
# has its own fit — these should come from coef(lm_bic); confirm.
prediction_bic <- (intercept + JobRotation*avg_JobRotation +
                     Mentoring*avg_Mentoring +
                     Autonomy*avg_Autonomy + Tools*avg_Tools +
                     Training*avg_Training)
prediction_bic
# BUG FIX: the original assigned improvement_prediction_bic twice; the first
# assignment duplicated prediction_bic and was immediately overwritten, so
# that dead computation has been removed.
improvement_prediction_bic <- (intercept + JobRotation*avg_JobRotation +
                                 Mentoring*1 + Autonomy*avg_Autonomy +
                                 Tools*1 + Training*1)
improvement_prediction_bic
Editor is loading...