Untitled
unknown
plain_text
2 years ago
6.9 kB
0
Indexable
Never
# Employee-performance regression study ----
# Fits a full linear model, compares it against best-subset models chosen by
# RSS / BIC / Mallows' Cp / adjusted R^2 on held-out test MSE, then uses the
# Cp model's coefficients to predict performance at average vs. "improved"
# levels of the controllable workplace factors.

library(leaps)
library(ggplot2)

# Baseline: full model on every raw column ----
allData <- read.csv("ChrisData.csv")
ndata <- nrow(allData)
set.seed(5)
train <- sample(ndata, ndata * 0.8)
data_all_train <- allData[train, ]
data_all_test <- allData[-train, ]

lm_all_data <- lm(Performance ~ ., data = data_all_train)
summary(lm_all_data)

# Drop columns 3 and 15 and remove incomplete rows, then re-split so the
# train/test indices match the cleaned data
data <- allData[, c(-3, -15)]
dim(data)
data <- na.omit(data)

ndata <- nrow(data)
set.seed(5)
train <- sample(ndata, ndata * 0.8)
data_train <- data[train, ]
data_test <- data[-train, ]

# Full model on the cleaned predictors ----
lm_all <- lm(Performance ~ ., data = data_train)
summary(lm_all)
pred_test_all <- predict(lm_all, newdata = data_test)
MSE_all <- mean((data_test$Performance - pred_test_all)^2)
MSE_all # mean square error 120.9725

# Best-subset selection ----
best_subset <- regsubsets(Performance ~ ., data = data_train, nvmax = 29)
best_subset_summary <- summary(best_subset)
best_adjr2 <- which.max(best_subset_summary$adjr2)
best_rss <- which.min(best_subset_summary$rss)
best_cp <- which.min(best_subset_summary$cp)
best_bic <- which.min(best_subset_summary$bic)
best_rss; best_bic; best_cp; best_adjr2 # rss = 29, bic = 5, cp = 12, adjr2 = 17

# Criterion curves with each criterion's optimum marked ----
par(mfrow = c(2, 2))
plot(best_subset_summary$rss, xlab = "Number of Variables", ylab = "RSS",
     main = "No. of variables vs RSS", type = "l")
points(best_rss, best_subset_summary$rss[best_rss],
       col = "mediumpurple", pch = "x")
plot(best_subset_summary$adjr2, xlab = "Number of Variables",
     ylab = "Adjusted R^2", main = "No. of variables vs Adjusted R^2",
     type = "l")
points(best_adjr2, best_subset_summary$adjr2[best_adjr2],
       col = "olivedrab", pch = "x")
plot(best_subset_summary$cp, xlab = "Number of Variables",
     ylab = "Mallows CP", main = "No. of variables vs Mallows CP", type = "l")
points(best_cp, best_subset_summary$cp[best_cp],
       col = "lightseagreen", pch = "x")
plot(best_subset_summary$bic, xlab = "Number of Variables", ylab = "BIC",
     main = "No. of variables BIC", type = "l")
points(best_bic, best_subset_summary$bic[best_bic], col = "orchid", pch = "x")

# BIC model (5 variables) ----
coef(best_subset, 5) # Variables in the selected BIC model
lm_bic <- lm(Performance ~ JobRotation + Mentoring + Autonomy + Tools +
               Training, data = data_train)
summary(lm_bic)
pred_test_bic <- predict(lm_bic, newdata = data_test)
MSE_bic <- mean((data_test$Performance - pred_test_bic)^2)
MSE_bic # 121.0996

# Cp model (12 variables) ----
coef(best_subset, 12) # Variables in the selected Cp model (comment fixed; was a BIC copy-paste)
lm_cp <- lm(Performance ~ DistanceFromHome + JobInvolvement + JobLevel +
              JobRotation + FeedbackFromManager + Mentoring + Proactivity +
              Autonomy + MonthlyIncome + OverTime + Tools + Training,
            data = data_train)
summary(lm_cp)
pred_test_cp <- predict(lm_cp, newdata = data_test)
MSE_cp <- mean((data_test$Performance - pred_test_cp)^2)
MSE_cp # 117.7484

# Adjusted R^2 model (17 variables) ----
coef(best_subset, 17) # Variables in the selected adjusted-R^2 model
lm_adjr2 <- lm(Performance ~ DistanceFromHome + EnvironmentSatisfaction +
                 JobInvolvement + JobLevel + JobRotation +
                 FeedbackFromManager + Mentoring + Proactivity + Autonomy +
                 MonthlyIncome + OverTime + PercentSalaryHike +
                 StockOptionLevel + Tools + TotalWorkingYears + Training +
                 YearsWithCurrManager, data = data_train)
summary(lm_adjr2)
pred_test_adjr2 <- predict(lm_adjr2, newdata = data_test)
MSE_adjr2 <- mean((data_test$Performance - pred_test_adjr2)^2)
MSE_adjr2 # 119.9606

# Summary table: test MSE and model size per selection criterion ----
labels <- c("RSS", "BIC", "CP", "Adj R^2")
MSE <- c(MSE_all, MSE_bic, MSE_cp, MSE_adjr2)
var <- c(best_rss, best_bic, best_cp, best_adjr2)
# NOTE(review): `table` shadows base::table; name kept for compatibility
table <- data.frame(labels, MSE, var)
table
table_plot <- as.data.frame(table)
table_plot
(121 - 117) / 117 # Cp model ~3.4% better test MSE than the full model

# Average predictor values for plugging into the Cp model ----
# Factor predictors are converted to 0/1 dummies by hand; the reference
# levels are assumed to match lm()'s dummy coding -- TODO confirm against
# coef(lm_cp) names
avg_DistanceFromHome <- mean(data$DistanceFromHome)
avg_JobInvolvement <- mean(data$JobInvolvement)
avg_JobLevel <- mean(data$JobLevel)
avg_JobRotation <- mean(data$JobRotation)
feedbackH <- ifelse(data$FeedbackFromManager == "H", 1, 0)
avg_FeedbackFromManager <- mean(feedbackH)
mentoringY <- ifelse(data$Mentoring == "Y", 1, 0)
avg_Mentoring <- mean(mentoringY)
avg_Proactivity <- mean(data$Proactivity)
autonomyY <- ifelse(data$Autonomy == "Y", 1, 0)
avg_Autonomy <- mean(autonomyY)
avg_MonthlyIncome <- mean(data$MonthlyIncome)
# FIX: dummy was built from data$Tools; the OverTime dummy must come from
# the OverTime column
overtimeY <- ifelse(data$OverTime == "Yes", 1, 0)
avg_OverTime <- mean(overtimeY)
toolsH <- ifelse(data$Tools == "H", 1, 0)
# FIX: was mean(toolsY) -- `toolsY` is undefined and errored at runtime;
# the dummy created above is toolsH
avg_Tools <- mean(toolsH)
trainingY <- ifelse(data$Training == "Y", 1, 0)
avg_Training <- mean(trainingY)

# Cp-model coefficients (hard-coded from summary(lm_cp) output) ----
intercept <- 33.3116708780
DistanceFromHome <- -0.0697216650
JobInvolvement <- 1.1496629440
JobLevel <- -1.2807453219
JobRotation <- 2.6515473427
FeedbackFromManager <- -1.6901699335
Mentoring <- 6.6006777297
Proactivity <- 0.5988167975
Autonomy <- 4.3709193766
MonthlyIncome <- 0.0003627075
OverTime <- -1.5298041395
Tools <- -2.3458114720
Training <- 14.3693398245

# Predicted performance for the "average" employee under the Cp model
prediction_cp <- (intercept +
                    DistanceFromHome * avg_DistanceFromHome +
                    JobInvolvement * avg_JobInvolvement +
                    JobLevel * avg_JobLevel +
                    JobRotation * avg_JobRotation +
                    FeedbackFromManager * avg_FeedbackFromManager +
                    Mentoring * avg_Mentoring +
                    Proactivity * avg_Proactivity +
                    Autonomy * avg_Autonomy +
                    MonthlyIncome * avg_MonthlyIncome +
                    OverTime * avg_OverTime +
                    Tools * avg_Tools +
                    Training * avg_Training)
prediction_cp # 55.66472

# Same employee but with feedback, mentoring, tools and training all set
# to their "on" level (dummy = 1)
improved_prediction_cp <- (intercept +
                             DistanceFromHome * avg_DistanceFromHome +
                             JobInvolvement * avg_JobInvolvement +
                             JobLevel * avg_JobLevel +
                             JobRotation * avg_JobRotation +
                             FeedbackFromManager * 1 +
                             Mentoring * 1 +
                             Proactivity * avg_Proactivity +
                             Autonomy * avg_Autonomy +
                             MonthlyIncome * avg_MonthlyIncome +
                             OverTime * avg_OverTime +
                             Tools * 1 +
                             Training * 1)
improved_prediction_cp # 62.15957
(62.15957 - 55.66472) / 62.15957 # 10.4% increase!

# BIC predictions ----
# NOTE(review): these reuse the Cp model's intercept and coefficient values;
# the BIC model has its own fitted coefficients (coef(lm_bic)) -- verify
# whether this reuse is intentional before trusting these numbers.
prediction_bic <- (intercept +
                     JobRotation * avg_JobRotation +
                     Mentoring * avg_Mentoring +
                     Autonomy * avg_Autonomy +
                     Tools * avg_Tools +
                     Training * avg_Training)
prediction_bic
# (dead first assignment of improvement_prediction_bic removed: it was
# identical to prediction_bic and immediately overwritten)
improvement_prediction_bic <- (intercept +
                                 JobRotation * avg_JobRotation +
                                 Mentoring * 1 +
                                 Autonomy * avg_Autonomy +
                                 Tools * 1 +
                                 Training * 1)
improvement_prediction_bic