Untitled

mail@pastecode.io avatar
unknown
r
a year ago
13 kB
2
Indexable
Never
rm(list = ls())
weather <- read.csv("Data/weather.csv")
library(ggplot2)

## Transformed precipitation
Y = (weather$rain)^(1/3)
(plot.data <- 
    ggplot(data = weather, aes(x = pressure, y = Y^3)) + 
    geom_point(size = 3) +
    xlab("Pressure (hPa)") +
    ylab("Precipitation (mm)") +
    labs(title = "Rain against pressure") +
    theme(text = element_text(size = 18))
)

## Creating all models ####
weather$location <- as.factor(weather$location)
weather$location <- relevel(weather$location, "Katterjåkk")
weather$monthnr <- as.factor(substr(weather$month, 6, 7))
model.null <- lm(Y ~ 1, data = weather)
model.full <- lm(Y ~ pressure*location*speed*temp*monthnr, data = weather)
model.1a <- lm(Y ~ pressure, data = weather)
model.2b <- lm(Y ~ pressure + location, data = weather)
model.2c <- lm(Y ~ pressure + location + speed, data = weather)
model.2d <- lm(Y ~ pressure + location*speed, data = weather)
model.BIC <- lm(Y ~ pressure + location + speed + monthnr + location:speed + 
                  pressure:location + pressure:speed, data = weather)

## Testing of significance ####

#Task 2(a) t-test
# The t-test for pressure is in:
(sum.1a <- summary(model.1a)$coefficients) # P-value = Pr(>abs(t))
(tvalue <- sum.1a["pressure", "t value"])

#Task 2b test if pressure still significant with t-test
(sum.2b <- summary(model.2b)$coefficients) # P-value = Pr(>abs(t))
(tvalue <- sum.2b["pressure", "t value"])

# Task 2b test if model 2b significant improvement 
# using partial F-test with anova because 1a nested in 2b
## ANOVA#
(model.1a.2b.anova <- anova(model.1a, model.2b)) # P-value = Pr(>F)
model.1a.2b.anova$`Pr(>F)`[2]

# Task 2c test if this model significant improvement to 2b 
# we do anova for location with reduced model and anova for comparison
# between models
model.reduced <- lm(Y ~ pressure + speed, data = weather)
(model.reduced.2c.anova <- anova(model.reduced, model.2c))

# Task 2c test if model 2c significant improvement 
# using partial F-test with anova because 2b nested in 2c
## ANOVA#
(model.2b.2c.anova <- anova(model.2b, model.2c)) # P-value = Pr(>F)
model.2b.2c.anova$`Pr(>F)`[2]

## Task 2d testing same as last task test
(model.2c.2d.anova <- anova(model.2c, model.2d)) # P-value = Pr(>F)
model.2c.2d.anova$`Pr(>F)`[2]

## Compare the F-value with upper F-quantile###
(Fvalue <- model.1a.2b.anova$F[2]) 
qf(1 - 0.05, 2, 26)


## Prediction and confidence intervals for all models ####
confint(model.1a, level = 0.95)
confint(model.2b, level = 0.95)
confint(model.2c, level = 0.95)
confint(model.2d, level = 0.95)

model.1a.pred <-
  cbind(weather,
        fit = predict(model.1a),
        conf = predict(model.1a, interval = "confidence"),
        pred = predict(model.1a, interval = "prediction"))

model.2b.pred <-
  cbind(weather,
        fit = predict(model.2b),
        conf = predict(model.2b, interval = "confidence"),
        pred = predict(model.2b, interval = "prediction"))

model.2c.pred <-
  cbind(weather,
        fit = predict(model.2c),
        conf = predict(model.2c, interval = "confidence"),
        pred = predict(model.2c, interval = "prediction"))

model.2d.pred <-
  cbind(weather,
        fit = predict(model.2d),
        conf = predict(model.2d, interval = "confidence"),
        pred = predict(model.2d, interval = "prediction"),
        r = rstudent(model.2d)) # task 3(b)

model.BIC.pred <-
  cbind(weather,
        fit = predict(model.BIC),
        conf = predict(model.BIC, interval = "confidence"),
        pred = predict(model.BIC, interval = "prediction"),
        r = rstudent(model.BIC)) # task 3(b)


# Getting rid of extra fits
model.1a.pred$conf.fit <- model.1a.pred$pred.fit <- NULL
model.2b.pred$conf.fit <- model.2b.pred$pred.fit <- NULL
model.2c.pred$conf.fit <- model.2c.pred$pred.fit <- NULL
model.2d.pred$conf.fit <- model.2d.pred$pred.fit <- NULL
model.BIC.pred$conf.fit <- model.BIC.pred$pred.fit <- NULL

# Adding fitted lines
weather$yhat.1a <- predict(model.1a)
weather$yhat.2b <- predict(model.2b)
weather$yhat.2c <- predict(model.2c)
weather$yhat.2d <- predict(model.2d)
weather$yhat.BIC <- predict(model.BIC)

## Predict expected precipitation for specific pressure ####
x0 <- data.frame(location = 'Uppsala', pressure = 1011, speed = 5)
y0.1a <- predict(model.1a, x0, se.fit = TRUE)
y0.2b <- predict(model.2b, x0, se.fit = TRUE)
y0.2c <- predict(model.2c, x0, se.fit = TRUE)
y0.2d <- predict(model.2d, x0, se.fit = TRUE)
y0.1a$fit^3
y0.2b$fit^3
y0.2c$fit^3
y0.2d$fit^3




## Residue calculations for all models ####
model.1a.pred$e <- model.1a$residuals
model.2b.pred$e <- model.2b$residuals
model.2c.pred$e <- model.2c$residuals
model.2d.pred$e <- model.2d$residuals

(max.e.1a <- max(abs(model.1a.pred$e)))
(max.e.2b <- max(abs(model.2b.pred$e)))
(max.e.2c <- max(abs(model.2c.pred$e)))
(max.e.2d <- max(abs(model.2d.pred$e)))

(model.1a.elims <- c(-max.e.1a, max.e.1a))
(model.2b.elims <- c(-max.e.2b, max.e.2b))
(model.2c.elims <- c(-max.e.2c, max.e.2c))
(model.2d.elims <- c(-max.e.2d, max.e.2d))


## Calculating leverage for model 2d ####
model.2d.pred$v <- influence(model.2d)$hat
leverages <- model.2d.pred$v
tail(sort(leverages),7)

lst <- sort(leverages, index.return=TRUE, decreasing=TRUE)
leverages <- lapply(lst, `[`, lst$x %in% head(unique(lst$x),7))

leverages$x
weather$location[leverages$ix]
weather$month[leverages$ix]

model.2d.pred$D <- cooks.distance(model.2d)
cd.idx <- which.max(model.2d.pred$D)
weather$location[cd.idx]
weather$month[cd.idx]

## Calculating derivatives ####
p = 1030
-0.0555*exp(60.0099)*exp(p*-0.0555)
3*-0.0620*(66.4547 + -0.0620*p)^2



## Advanced residual analysis (3(b)) ####
## Variable selection ####

BIC(model.null)
BIC(model.1a)
BIC(model.2b)
BIC(model.2c)
BIC(model.2d)
BIC(model.BIC)
BIC(model.full)

AIC(model.null)
AIC(model.1a)
AIC(model.2b)
AIC(model.2c)
AIC(model.2d)
AIC(model.BIC)
AIC(model.full)

sum.null <- summary(model.null)
sum.1a <- summary(model.1a)
sum.2b <- summary(model.2b)
sum.2c <- summary(model.2c)
sum.2d <- summary(model.2d)
sum.BIC <- summary(model.BIC)
sum.full <- summary(model.full)

sum.null$r.squared
sum.1a$r.squared
sum.2b$r.squared
sum.2c$r.squared
sum.2d$r.squared
sum.BIC$r.squared
sum.full$r.squared

sum.null$adj.r.squared
sum.1a$adj.r.squared
sum.2b$adj.r.squared
sum.2c$adj.r.squared
sum.2d$adj.r.squared
sum.BIC$adj.r.squared
sum.full$adj.r.squared

step(model.2d, data = weather, scope = list(lower = model.null, 
                                            upper = model.full), 
     direction = "both", k =log(nrow(weather)))      

## Plots for model 1a ####

# Adding lines and confidence/prediction intervals to the plot
(plot.1a <- plot.data + 
    geom_line(data = weather, aes(y = yhat.1a^3), color = "blue", linewidth = 1) +
    geom_ribbon(data = model.1a.pred, aes(ymin = conf.lwr^3, ymax = conf.upr^3), alpha = 0.2) +
    geom_line(data = model.1a.pred, aes(y = pred.lwr^3),
              color = "red", linetype = "dashed", linewidth = 1) +
    geom_line(data = model.1a.pred, aes(y = pred.upr^3),
              color = "red", linetype = "dashed", linewidth = 1) 
)

## QQ
ggplot(data = model.1a.pred, aes(sample = e)) +
  geom_qq(size = 3) +
  geom_qq_line() +
  labs(title = "Normal Q-Q-plot of the residuals") +
  #facet_wrap(~location) +
  theme(text = element_text(size = 18))


ggplot(data = model.1a.pred, aes(x = fit, y = e)) +
  geom_point(size = 3) +
  geom_hline(yintercept = 0) +
  expand_limits(y = model.1a.elims) +
  xlab("Predicted rain") +
  ylab("Residual") +
  labs(title = "Residuals vs predicted values Y-hat") +
  theme(text = element_text(size = 18))

## Boxplot of residue against location
ggplot(data = model.1a.pred, aes(x = location, y = e, fill = location)) + 
  geom_boxplot()

## Plots for model 2b ####

# Transformed plot
ggplot(data = model.2b.pred, aes(x = pressure, y = Y,
                                 shape = location, color = location)) +
  geom_point(size = 3) +
  geom_line(aes(y = fit), linewidth = 1) +
  geom_ribbon(aes(ymin = conf.lwr, ymax = conf.upr,
                  group = location),
              alpha = 0.1) +
  geom_line(aes(y = pred.lwr), linetype = "dashed", linewidth = 1) +
  geom_line(aes(y = pred.upr), linetype = "dashed", linewidth = 1) +
  #facet_wrap(~ location) +
  xlab("Pressure (hPa)") +
  ylab("Transformed precipitation (mm^(1/3))") + 
  labs(title = "Transformed rain against pressure over each location for model 2b") +
  labs(caption = "data, fitted line, conf. and pred. interval") +
  theme(text = element_text(size = 18))
#expand_limits(y = c(0, 8))



# Non transformed observation with fitted lines and intervals
ggplot(data = model.2b.pred, aes(x = pressure, y = rain,
                                 shape = location, color = location)) +
  geom_point(size = 3) +
  geom_line(aes(y = fit^3), linewidth = 1) +
  geom_ribbon(aes(ymin = conf.lwr^3, ymax = conf.upr^3,
                  group = location),
              alpha = 0.1) +
  geom_line(aes(y = pred.lwr^3), linetype = "dashed", linewidth = 1) +
  geom_line(aes(y = pred.upr^3), linetype = "dashed", linewidth = 1) +
  facet_wrap(~ location) +
  xlab("Pressure (hPa)") +
  ylab("Precipitation (mm)") + 
  labs(title = "Rain against pressure over each location for model 2b") +
  theme(text = element_text(size = 18))
  #expand_limits(y = c(0, 8))

## Boxplot
ggplot(data = model.2b.pred, aes(x = location, y = e, fill = location)) + geom_boxplot()

## Plots for model 2c ####
ggplot(data = weather, aes(x = location, y = speed, fill = location)) + geom_boxplot()
ggplot(data = model.2c.pred, aes(x = speed, y = e, fill = location)) + 
  facet_wrap(~location, scales = "free_x") + 
  geom_hline(yintercept = 0) + 
  geom_smooth()


ggplot(data = weather, aes(x = speed, y = Y)) + 
geom_point(size = 3) +
xlab("Speed (m/s)") +
ylab("Cube root of rain (mm^(1/3))") +
labs(title = "Rain against speed") +
theme(text = element_text(size = 18))

ggplot(data = weather, aes(x = pressure, y = speed)) + 
  geom_point(size = 3) +
  xlab("Pressure (hPa)") +
  ylab("Speed (m/s)") +
  labs(title = "Rain against speed") +
  theme(text = element_text(size = 18))


## Plots for model 2d ####
ggplot(data = model.2d.pred, aes(x = speed, y = e, fill = location)) + 
  facet_wrap(~location, scales = "free_x") + 
  geom_hline(yintercept = 0) + 
  geom_smooth()

# Plotting with highlight of seven high-leverage observations and the high 
# Cook's distance observation
ggplot(data = weather, aes(x = pressure, y = speed, col = factor(location))) + 
  geom_point(size = 1) +
  geom_point(data = weather[leverages$ix,], aes(x = pressure, y = speed, col = )
             , size = 2,  colour = 'chocolate4', shape = 24) +
  geom_point(data = weather[cd.idx,], 
             aes(x = pressure, y = speed), size = 2, colour = 'black', 
             shape = 21) +
  facet_wrap(~location, scales = "free_x") +
  labs(title = "speed against Air Pressure (hPa)")


dfbetas(model.2d)[cd.idx,] # speed has largest dfbetas in absolute value
(2 / sqrt(length(weather$month))) # for comparison if dfbeta is large 

ggplot(data = weather, aes(x = speed, y = rain^(1/3))) + geom_point(size = 1) +
  geom_point(data = weather[cd.idx,], size = 3, colour = 'red', shape = 24) +
  #facet_wrap(~location) +
  labs(title = "Precipitation (mm) against wind speed")


ggplot(data = model.2d.pred, aes(x = fit, y = rstudent(model.2d))) +
  geom_point(size = 2) +
  geom_hline(yintercept = c(-2, 0, 2)) +
  geom_hline(yintercept = c(-3, 3), linetype = 2) +
  geom_smooth() +
  labs(title = "Studentized residuals against fitted values") +
  xlab("fitted values") +
  ylab("studentized residuals") +
  theme(text = element_text(size = 18))


ggplot(data = model.2d.pred, aes(x = fit, 
                            y = sqrt(abs(rstudent(model.2d))))) +
  geom_point(size = 2) +
  geom_hline(yintercept = c(-2, 0, 2)) +
  geom_hline(yintercept = c(-3, 3), linetype = 2) +
  geom_smooth() +
  labs(title = "Root of absolute value of studentized residuals against fitted values")
  xlab("fitted values") +
  ylab("studentized residuals") +
  theme(text = element_text(size = 18))


## Plots for BIC ####
model.BIC.pred$r <- rstudent(model.BIC)
  ggplot(data = model.BIC.pred, aes(x = fit, 
                                   y = r)) +
    geom_point(size = 2) +
    geom_hline(yintercept = c(-2, 0, 2)) +
    geom_hline(yintercept = c(-3, 3), linetype = 2) +
    geom_smooth() +
    labs(title = "Studentized residuals against fitted values")
    xlab("fitted values") +
    ylab("studentized residuals") +
    theme(text = element_text(size = 18))