Untitled
unknown
r
a year ago
13 kB
2
Indexable
Never
rm(list = ls()) weather <- read.csv("Data/weather.csv") library(ggplot2) ## Transformed precipitation Y = (weather$rain)^(1/3) (plot.data <- ggplot(data = weather, aes(x = pressure, y = Y^3)) + geom_point(size = 3) + xlab("Pressure (hPa)") + ylab("Precipitation (mm)") + labs(title = "Rain against pressure") + theme(text = element_text(size = 18)) ) ## Creating all models #### weather$location <- as.factor(weather$location) weather$location <- relevel(weather$location, "Katterjåkk") weather$monthnr <- as.factor(substr(weather$month, 6, 7)) model.null <- lm(Y ~ 1, data = weather) model.full <- lm(Y ~ pressure*location*speed*temp*monthnr, data = weather) model.1a <- lm(Y ~ pressure, data = weather) model.2b <- lm(Y ~ pressure + location, data = weather) model.2c <- lm(Y ~ pressure + location + speed, data = weather) model.2d <- lm(Y ~ pressure + location*speed, data = weather) model.BIC <- lm(Y ~ pressure + location + speed + monthnr + location:speed + pressure:location + pressure:speed, data = weather) ## Testing of significance #### #Task 2(a) t-test # The t-test for pressure is in: (sum.1a <- summary(model.1a)$coefficients) # P-value = Pr(>abs(t)) (tvalue <- sum.1a["pressure", "t value"]) #Task 2b test if pressure still significant with t-test (sum.2b <- summary(model.2b)$coefficients) # P-value = Pr(>abs(t)) (tvalue <- sum.2b["pressure", "t value"]) # Task 2b test if model 2b significant improvement # using partial F-test with anova because 1a nested in 2b ## ANOVA# (model.1a.2b.anova <- anova(model.1a, model.2b)) # P-value = Pr(>F) model.1a.2b.anova$`Pr(>F)`[2] # Task 2c test if this model significant improvement to 2b # we do anova for location with reduced model and anova for comparison # between models model.reduced <- lm(Y ~ pressure + speed, data = weather) (model.reduced.2c.anova <- anova(model.reduced, model.2c)) # Task 2c test if model 2c significant improvement # using partial F-test with anova because 2b nested in 2c ## ANOVA# (model.2b.2c.anova <- anova(model.2b, model.2c)) # P-value = Pr(>F) model.2b.2c.anova$`Pr(>F)`[2] ## Task 2d testing same as last task test (model.2c.2d.anova <- anova(model.2c, model.2d)) # P-value = Pr(>F) model.2c.2d.anova$`Pr(>F)`[2] ## Compare the F-value with upper F-quantile### (Fvalue <- model.1a.2b.anova$F[2]) qf(1 - 0.05, 2, 26) ## Prediction and confidence intervals for all models #### confint(model.1a, level = 0.95) confint(model.2b, level = 0.95) confint(model.2c, level = 0.95) confint(model.2d, level = 0.95) model.1a.pred <- cbind(weather, fit = predict(model.1a), conf = predict(model.1a, interval = "confidence"), pred = predict(model.1a, interval = "prediction")) model.2b.pred <- cbind(weather, fit = predict(model.2b), conf = predict(model.2b, interval = "confidence"), pred = predict(model.2b, interval = "prediction")) model.2c.pred <- cbind(weather, fit = predict(model.2c), conf = predict(model.2c, interval = "confidence"), pred = predict(model.2c, interval = "prediction")) model.2d.pred <- cbind(weather, fit = predict(model.2d), conf = predict(model.2d, interval = "confidence"), pred = predict(model.2d, interval = "prediction"), r = rstudent(model.2d)) # task 3(b) model.BIC.pred <- cbind(weather, fit = predict(model.BIC), conf = predict(model.BIC, interval = "confidence"), pred = predict(model.BIC, interval = "prediction"), r = rstudent(model.BIC)) # task 3(b) # Getting rid of extra fits model.1a.pred$conf.fit <- model.1a.pred$pred.fit <- NULL model.2b.pred$conf.fit <- model.2b.pred$pred.fit <- NULL model.2c.pred$conf.fit <- model.2c.pred$pred.fit <- NULL model.2d.pred$conf.fit <- model.2d.pred$pred.fit <- NULL model.BIC.pred$conf.fit <- model.BIC.pred$pred.fit <- NULL # Adding fitted lines weather$yhat.1a <- predict(model.1a) weather$yhat.2b <- predict(model.2b) weather$yhat.2c <- predict(model.2c) weather$yhat.2d <- predict(model.2d) weather$yhat.BIC <- predict(model.BIC) ## Predict expected precipitation for specific pressure #### x0 <- data.frame(location = 'Uppsala', pressure = 1011, speed = 5) y0.1a <- predict(model.1a, x0, se.fit = TRUE) y0.2b <- predict(model.2b, x0, se.fit = TRUE) y0.2c <- predict(model.2c, x0, se.fit = TRUE) y0.2d <- predict(model.2d, x0, se.fit = TRUE) y0.1a$fit^3 y0.2b$fit^3 y0.2c$fit^3 y0.2d$fit^3 ## Residue calculations for all models #### model.1a.pred$e <- model.1a$residuals model.2b.pred$e <- model.2b$residuals model.2c.pred$e <- model.2c$residuals model.2d.pred$e <- model.2d$residuals (max.e.1a <- max(abs(model.1a.pred$e))) (max.e.2b <- max(abs(model.2b.pred$e))) (max.e.2c <- max(abs(model.2c.pred$e))) (max.e.2d <- max(abs(model.2d.pred$e))) (model.1a.elims <- c(-max.e.1a, max.e.1a)) (model.2b.elims <- c(-max.e.2b, max.e.2b)) (model.2c.elims <- c(-max.e.2c, max.e.2c)) (model.2d.elims <- c(-max.e.2d, max.e.2d)) ## Calculating leverage for model 2d #### model.2d.pred$v <- influence(model.2d)$hat leverages <- model.2d.pred$v tail(sort(leverages),7) lst <- sort(leverages, index.return=TRUE, decreasing=TRUE) leverages <- lapply(lst, `[`, lst$x %in% head(unique(lst$x),7)) leverages$x weather$location[leverages$ix] weather$month[leverages$ix] model.2d.pred$D <- cooks.distance(model.2d) cd.idx <- which.max(model.2d.pred$D) weather$location[cd.idx] weather$month[cd.idx] ## Calculating derivatives #### p = 1030 -0.0555*exp(60.0099)*exp(p*-0.0555) 3*-0.0620*(66.4547 + -0.0620*p)^2 ## Advanced residual analysis (3(b)) #### ## Variable selection #### BIC(model.null) BIC(model.1a) BIC(model.2b) BIC(model.2c) BIC(model.2d) BIC(model.BIC) BIC(model.full) AIC(model.null) AIC(model.1a) AIC(model.2b) AIC(model.2c) AIC(model.2d) AIC(model.BIC) AIC(model.full) sum.null <- summary(model.null) sum.1a <- summary(model.1a) sum.2b <- summary(model.2b) sum.2c <- summary(model.2c) sum.2d <- summary(model.2d) sum.BIC <- summary(model.BIC) sum.full <- summary(model.full) sum.null$r.squared sum.1a$r.squared sum.2b$r.squared sum.2c$r.squared sum.2d$r.squared sum.BIC$r.squared sum.full$r.squared sum.null$adj.r.squared sum.1a$adj.r.squared sum.2b$adj.r.squared sum.2c$adj.r.squared sum.2d$adj.r.squared sum.BIC$adj.r.squared sum.full$adj.r.squared step(model.2d, data = weather, scope = list(lower = model.null, upper = model.full), direction = "both", k =log(nrow(weather))) ## Plots for model 1a #### # Adding lines and confidence/prediction intervals to the plot (plot.1a <- plot.data + geom_line(data = weather, aes(y = yhat.1a^3), color = "blue", linewidth = 1) + geom_ribbon(data = model.1a.pred, aes(ymin = conf.lwr^3, ymax = conf.upr^3), alpha = 0.2) + geom_line(data = model.1a.pred, aes(y = pred.lwr^3), color = "red", linetype = "dashed", linewidth = 1) + geom_line(data = model.1a.pred, aes(y = pred.upr^3), color = "red", linetype = "dashed", linewidth = 1) ) ## QQ ggplot(data = model.1a.pred, aes(sample = e)) + geom_qq(size = 3) + geom_qq_line() + labs(title = "Normal Q-Q-plot of the residuals") + #facet_wrap(~location) + theme(text = element_text(size = 18)) ggplot(data = model.1a.pred, aes(x = fit, y = e)) + geom_point(size = 3) + geom_hline(yintercept = 0) + expand_limits(y = model.1a.elims) + xlab("Predicted rain") + ylab("Residual") + labs(title = "Residuals vs predicted values Y-hat") + theme(text = element_text(size = 18)) ## Boxplot of residue against location ggplot(data = model.1a.pred, aes(x = location, y = e, fill = location)) + geom_boxplot() ## Plots for model 2b #### # Transformed plot ggplot(data = model.2b.pred, aes(x = pressure, y = Y, shape = location, color = location)) + geom_point(size = 3) + geom_line(aes(y = fit), linewidth = 1) + geom_ribbon(aes(ymin = conf.lwr, ymax = conf.upr, group = location), alpha = 0.1) + geom_line(aes(y = pred.lwr), linetype = "dashed", linewidth = 1) + geom_line(aes(y = pred.upr), linetype = "dashed", linewidth = 1) + #facet_wrap(~ location) + xlab("Pressure (hPa)") + ylab("Transformed precipitation (mm^(1/3))") + labs(title = "Transformed rain against pressure over each location for model 2b") + labs(caption = "data, fitted line, conf. and pred. interval") + theme(text = element_text(size = 18)) #expand_limits(y = c(0, 8)) # Non transformed observation with fitted lines and intervals ggplot(data = model.2b.pred, aes(x = pressure, y = rain, shape = location, color = location)) + geom_point(size = 3) + geom_line(aes(y = fit^3), linewidth = 1) + geom_ribbon(aes(ymin = conf.lwr^3, ymax = conf.upr^3, group = location), alpha = 0.1) + geom_line(aes(y = pred.lwr^3), linetype = "dashed", linewidth = 1) + geom_line(aes(y = pred.upr^3), linetype = "dashed", linewidth = 1) + facet_wrap(~ location) + xlab("Pressure (hPa)") + ylab("Precipitation (mm)") + labs(title = "Rain against pressure over each location for model 2b") + theme(text = element_text(size = 18)) #expand_limits(y = c(0, 8)) ## Boxplot ggplot(data = model.2b.pred, aes(x = location, y = e, fill = location)) + geom_boxplot() ## Plots for model 2c #### ggplot(data = weather, aes(x = location, y = speed, fill = location)) + geom_boxplot() ggplot(data = model.2c.pred, aes(x = speed, y = e, fill = location)) + facet_wrap(~location, scales = "free_x") + geom_hline(yintercept = 0) + geom_smooth() ggplot(data = weather, aes(x = speed, y = Y)) + geom_point(size = 3) + xlab("Speed (m/s)") + ylab("Cube root of rain (mm^(1/3))") + labs(title = "Rain against speed") + theme(text = element_text(size = 18)) ggplot(data = weather, aes(x = pressure, y = speed)) + geom_point(size = 3) + xlab("Pressure (hPa)") + ylab("Speed (m/s)") + labs(title = "Rain against speed") + theme(text = element_text(size = 18)) ## Plots for model 2d #### ggplot(data = model.2d.pred, aes(x = speed, y = e, fill = location)) + facet_wrap(~location, scales = "free_x") + geom_hline(yintercept = 0) + geom_smooth() # Plotting with highlight of seven high-leverage observations and the high # Cook's distance observation ggplot(data = weather, aes(x = pressure, y = speed, col = factor(location))) + geom_point(size = 1) + geom_point(data = weather[leverages$ix,], aes(x = pressure, y = speed, col = ) , size = 2, colour = 'chocolate4', shape = 24) + geom_point(data = weather[cd.idx,], aes(x = pressure, y = speed), size = 2, colour = 'black', shape = 21) + facet_wrap(~location, scales = "free_x") + labs(title = "speed against Air Pressure (hPa)") dfbetas(model.2d)[cd.idx,] # speed has largest dfbetas in absolute value (2 / sqrt(length(weather$month))) # for comparison if dfbeta is large ggplot(data = weather, aes(x = speed, y = rain^(1/3))) + geom_point(size = 1) + geom_point(data = weather[cd.idx,], size = 3, colour = 'red', shape = 24) + #facet_wrap(~location) + labs(title = "Precipitation (mm) against wind speed") ggplot(data = model.2d.pred, aes(x = fit, y = rstudent(model.2d))) + geom_point(size = 2) + geom_hline(yintercept = c(-2, 0, 2)) + geom_hline(yintercept = c(-3, 3), linetype = 2) + geom_smooth() + labs(title = "Studentized residuals against fitted values") + xlab("fitted values") + ylab("studentized residuals") + theme(text = element_text(size = 18)) ggplot(data = model.2d.pred, aes(x = fit, y = sqrt(abs(rstudent(model.2d))))) + geom_point(size = 2) + geom_hline(yintercept = c(-2, 0, 2)) + geom_hline(yintercept = c(-3, 3), linetype = 2) + geom_smooth() + labs(title = "Root of absolute value of studentized residuals against fitted values") xlab("fitted values") + ylab("studentized residuals") + theme(text = element_text(size = 18)) ## Plots for BIC #### model.BIC.pred$r <- rstudent(model.BIC) ggplot(data = model.BIC.pred, aes(x = fit, y = r)) + geom_point(size = 2) + geom_hline(yintercept = c(-2, 0, 2)) + geom_hline(yintercept = c(-3, 3), linetype = 2) + geom_smooth() + labs(title = "Studentized residuals against fitted values") xlab("fitted values") + ylab("studentized residuals") + theme(text = element_text(size = 18))