rm(list = ls())
weather <- read.csv("Data/weather.csv")
library(ggplot2)
## Transformed precipitation
Y = (weather$rain)^(1/3)
(plot.data <-
ggplot(data = weather, aes(x = pressure, y = Y)) +
geom_point(size = 3) +
xlab("Pressure (hPa)") +
ylab("Cube root of rain (mm^(1/3))") +
labs(title = "Rain against pressure") +
theme(text = element_text(size = 18))
)
## Creating all models ####
weather$location <- as.factor(weather$location)
weather$location <- relevel(weather$location, "Katterjåkk")
model.1a <- lm(Y ~ pressure, data = weather)
model.2b <- lm(Y ~ pressure + location, data = weather)
model.2c <- lm(Y ~ pressure + location + speed, data = weather)
model.2d <- lm(Y ~ pressure + location*speed, data = weather)
## Testing of significance ####
#Task 2(a) t-test
# The t-test for pressure is in:
(sum.1a <- summary(model.1a)$coefficients) # P-value = Pr(>abs(t))
(tvalue <- sum.1a["pressure", "t value"])
#Task 2b test if pressure still significant with t-test
(sum.2b <- summary(model.2b)$coefficients) # P-value = Pr(>abs(t))
(tvalue <- sum.2b["pressure", "t value"])
# Task 2b test if model 2b significant improvement
# using partial F-test with anova because 1a nested in 2b
## ANOVA#
(model.1a.2b.anova <- anova(model.1a, model.2b)) # P-value = Pr(>F)
model.1a.2b.anova$`Pr(>F)`[2]
# Task 2c test if this model significant improvement to 2b
# we do in the same way with t-test for location and anova for comparison
# between models
#Task 2c t-test if location still significant with t-test
(sum.2c <- summary(model.2c)$coefficients) # P-value = Pr(>abs(t))
# Task 2c test if model 2c significant improvement
# using partial F-test with anova because 2b nested in 2c
## ANOVA#
(model.2b.2c.anova <- anova(model.2b, model.2c)) # P-value = Pr(>F)
model.2b.2c.anova$`Pr(>F)`[2]
## Task 2d testing same as last task test
(model.2c.2d.anova <- anova(model.2c, model.2d)) # P-value = Pr(>F)
model.2c.2d.anova$`Pr(>F)`[2]
## Compare the F-value with upper F-quantile###
(Fvalue <- model.1a.2b.anova$F[2])
qf(1 - 0.05, 2, 26)
## Prediction and confidence intervals for all models ####
confint(model.1a, level = 0.95)
confint(model.2b, level = 0.95)
confint(model.2c, level = 0.95)
confint(model.2d, level = 0.95)
model.1a.pred <-
cbind(weather,
fit = predict(model.1a),
conf = predict(model.1a, interval = "confidence"),
pred = predict(model.1a, interval = "prediction"))
model.2b.pred <-
cbind(weather,
fit = predict(model.2b),
conf = predict(model.2b, interval = "confidence"),
pred = predict(model.2b, interval = "prediction"))
model.2c.pred <-
cbind(weather,
fit = predict(model.2c),
conf = predict(model.2c, interval = "confidence"),
pred = predict(model.2c, interval = "prediction"))
model.2d.pred <-
cbind(weather,
fit = predict(model.2d),
conf = predict(model.2d, interval = "confidence"),
pred = predict(model.2d, interval = "prediction"),
r = rstudent(model.2d)) # task 3(b)
# Getting rid of extra fits
model.1a.pred$conf.fit <- model.1a.pred$pred.fit <- NULL
model.2b.pred$conf.fit <- model.2b.pred$pred.fit <- NULL
model.2c.pred$conf.fit <- model.2c.pred$pred.fit <- NULL
model.2d.pred$conf.fit <- model.2d.pred$pred.fit <- NULL
# Adding fitted lines
weather$yhat.1a <- predict(model.1a)
weather$yhat.2b <- predict(model.2b)
weather$yhat.2c <- predict(model.2c)
weather$yhat.2d <- predict(model.2d)
## Predict expected precipitation for specific pressure ####
x0 <- data.frame(location = 'Katterjåkk', pressure = 1011, speed = 5)
y0.1a <- predict(model.1a, x0, se.fit = TRUE)
y0.2b <- predict(model.2b, x0, se.fit = TRUE)
y0.2c <- predict(model.2c, x0, se.fit = TRUE)
y0.2d <- predict(model.2d, x0, se.fit = TRUE)
y0.1a$fit^3
y0.2b$fit^3
y0.2c$fit^3
y0.2d$fit^3
## Residue calculations for all models ####
model.1a.pred$e <- model.1a$residuals
model.2b.pred$e <- model.2b$residuals
model.2c.pred$e <- model.2c$residuals
model.2d.pred$e <- model.2d$residuals
(max.e.1a <- max(abs(model.1a.pred$e)))
(max.e.2b <- max(abs(model.2b.pred$e)))
(max.e.2c <- max(abs(model.2c.pred$e)))
(max.e.2d <- max(abs(model.2d.pred$e)))
(model.1a.elims <- c(-max.e.1a, max.e.1a))
(model.2b.elims <- c(-max.e.2b, max.e.2b))
(model.2c.elims <- c(-max.e.2c, max.e.2c))
(model.2d.elims <- c(-max.e.2d, max.e.2d))
## Calculating leverage for model 2d ####
model.2d.pred$v <- influence(model.2d)$hat
leverages <- model.2d.pred$v
tail(sort(leverages),7)
lst <- sort(leverages, index.return=TRUE, decreasing=TRUE)
leverages <- lapply(lst, `[`, lst$x %in% head(unique(lst$x),7))
leverages$x
weather$location[leverages$ix]
weather$month[leverages$ix]
model.2d.pred$D <- cooks.distance(model.2d)
cd.idx <- which.max(model.2d.pred$D)
weather$location[cd.idx]
weather$month[cd.idx]
## Advanced residual analysis (3(b)) ####
## Variable selection ####
# Fattar 0.
weather$monthnr <- as.factor(substr(weather$month, 6, 7))
null.model <- lm(rain^(1/3) ~ 1, data = weather)
full.model <- lm(rain^(1/3)~pressure*relevel(location, "Katterjåkk")*speed*temp*monthnr, data = weather)
BIC(null.model)
BIC(full.model)
step(model.2d, data = weather, scope = list(lower = null.model, upper = full.model), direction = "both", k =log(nrow(weather)))
## Plots for model 1a ####
# Adding lines and confidence/prediction intervals to the plot
(plot.1a <- plot.data +
geom_line(data = weather, aes(y = yhat.1a), color = "blue", linewidth = 2) +
geom_ribbon(data = model.1a.pred, aes(ymin = conf.lwr, ymax = conf.upr), alpha = 0.2) +
geom_line(data = model.1a.pred, aes(y = pred.lwr),
color = "red", linetype = "dashed", linewidth = 2) +
geom_line(data = model.1a.pred, aes(y = pred.upr),
color = "red", linetype = "dashed", linewidth = 2)
)
## QQ
ggplot(data = model.1a.pred, aes(sample = e)) +
geom_qq(size = 3) +
geom_qq_line() +
labs(tag = "C") +
labs(title = "Normal Q-Q-plot of the residuals") +
facet_wrap(~location) +
theme(text = element_text(size = 18))
ggplot(data = model.1a.pred, aes(x = fit, y = e)) +
geom_point(size = 3) +
geom_hline(yintercept = 0) +
expand_limits(y = model.1a.elims) +
xlab("Predicted rain") +
ylab("Residual") +
labs(tag = "B") +
labs(title = "Residuals vs predicted values Y-hat") +
theme(text = element_text(size = 18))
## Boxplot of residue against location
ggplot(data = model.1a.pred, aes(x = location, y = e, fill = location)) +
geom_boxplot()
## Plots for model 2b ####
# Transformed plot
ggplot(data = model.2b.pred, aes(x = pressure, y = Y,
shape = location, color = location)) +
geom_point(size = 3) +
geom_line(aes(y = fit), linewidth = 1) +
geom_ribbon(aes(ymin = conf.lwr, ymax = conf.upr,
group = location),
alpha = 0.1) +
geom_line(aes(y = pred.lwr), linetype = "dashed", linewidth = 1) +
geom_line(aes(y = pred.upr), linetype = "dashed", linewidth = 1) +
#facet_wrap(~ location) +
xlab("Pressure (hPa)") +
ylab("Transformed precipitation (mm^(1/3))") +
labs(title = "Transformed rain against pressure over each location for model 2b") +
labs(caption = "data, fitted line, conf. and pred. interval") +
theme(text = element_text(size = 18))
#expand_limits(y = c(0, 8))
# Non transformed observation with fitted lines and intervals
ggplot(data = model.2b.pred, aes(x = pressure, y = rain,
shape = location, color = location)) +
geom_point(size = 3) +
geom_line(aes(y = fit^3), linewidth = 1) +
geom_ribbon(aes(ymin = conf.lwr^3, ymax = conf.upr^3,
group = location),
alpha = 0.1) +
geom_line(aes(y = pred.lwr^3), linetype = "dashed", linewidth = 1) +
geom_line(aes(y = pred.upr^3), linetype = "dashed", linewidth = 1) +
facet_wrap(~ location) +
xlab("Pressure (hPa)") +
ylab("Transformed precipitation (mm^(1/3))") +
labs(title = "Transformed rain against pressure over each location for model 2b") +
labs(caption = "data, fitted line, conf. and pred. interval") +
theme(text = element_text(size = 18))
#expand_limits(y = c(0, 8))
## Boxplot
ggplot(data = model.2b.pred, aes(x = location, y = e, fill = location)) + geom_boxplot()
## Plots for model 2c ####
ggplot(data = weather, aes(x = location, y = speed, fill = location)) + geom_boxplot()
ggplot(data = model.2c.pred, aes(x = speed, y = e, fill = location)) +
facet_wrap(~location, scales = "free_x") +
geom_hline(yintercept = 0) +
geom_smooth()
ggplot(data = weather, aes(x = speed, y = Y)) +
geom_point(size = 3) +
xlab("Speed (m/s)") +
ylab("Cube root of rain (mm^(1/3))") +
labs(title = "Rain against speed") +
theme(text = element_text(size = 18))
ggplot(data = weather, aes(x = pressure, y = speed)) +
geom_point(size = 3) +
xlab("Pressure (hPa)") +
ylab("Speed (m/s)") +
labs(title = "Rain against speed") +
theme(text = element_text(size = 18))
## Plots for model 2d ####
ggplot(data = model.2d.pred, aes(x = speed, y = e, fill = location)) +
facet_wrap(~location, scales = "free_x") +
geom_hline(yintercept = 0) +
geom_smooth()
# Plotting with highlight of seven high-leverage observations and the high
# Cook's distance observation
ggplot(data = weather, aes(x = pressure, y = speed, col = factor(location))) +
geom_point(size = 1) +
geom_point(data = weather[leverages$ix,], aes(x = pressure, y = speed, col = )
, size = 2, colour = 'chocolate4', shape = 24) +
geom_point(data = weather[cd.idx,],
aes(x = pressure, y = speed), size = 2, colour = 'black',
shape = 21) +
facet_wrap(~location, scales = "free_x") +
labs(title = "speed against Air Pressure (hPa)")
dfbetas(model.2d)[cd.idx,] # speed has largest dfbetas in absolute value
(2 / sqrt(length(weather$month))) # for comparison if dfbeta is large
ggplot(data = weather, aes(x = speed, y = rain^(1/3))) + geom_point(size = 1) +
geom_point(data = weather[cd.idx,], size = 3, colour = 'red', shape = 24) +
#facet_wrap(~location) +
labs(title = "Precipitation (mm) against Air speed")
ggplot(data = model.2d.pred, aes(x = fit, y = rstudent(model.2d))) +
geom_point(size = 2) +
geom_hline(yintercept = c(-2, 0, 2)) +
geom_hline(yintercept = c(-3, 3), linetype = 2) +
geom_smooth() +
labs(title = "Studentized residuals against fitted values") +
xlab("fitted values") +
ylab("studentized residuals") +
theme(text = element_text(size = 18))
ggplot(data = model.2d.pred, aes(x = fit,
y = sqrt(abs(rstudent(model.2d))))) +
geom_point(size = 2) +
geom_hline(yintercept = c(-2, 0, 2)) +
geom_hline(yintercept = c(-3, 3), linetype = 2) +
geom_smooth() +
labs(title = "Root of absolute value of studentized residuals against fitted values")
xlab("fitted values") +
ylab("studentized residuals") +
theme(text = element_text(size = 18))