library(ggplot2)
library(patchwork)
# This library is for calculating ROC-curves, AUC and
# confidence interval for AUc:
library(pROC)
# Hosmer-Lemeshow goodness of fit test:
library(ResourceSelection)
weather <- read.csv("Data/weather.csv")
weather$location <- relevel(as.factor(weather$location), "Katterjåkk")
weather$monthnr <- as.factor(substr(weather$month, 6, 7))
weather$drymonth <- as.numeric(weather$rain < 0.5)
n = length(weather$drymonth)
Y = sum(weather$drymonth)
p = Y/n
p
odds = p/(1-p)
odds
log(odds)
my.model <- glm(drymonth ~ 1, family = "binomial", data = weather)
summary(my.model)
p + 1.96*sqrt(p*(1-p)/n)
p - 1.96*sqrt(p*(1-p)/n)
upper <- exp( log(odds) + 1.96*sqrt(1/Y + 1/(n-Y)) )
(upperCi <- upper/(1+upper))
upperl<- exp( log(odds) - 1.96*sqrt(1/Y + 1/(n-Y)) )
(upperlCi <- upperl/(1+upperl))
exp(log(odds) - 1.96*sqrt(1/Y + 1/(n-Y)))
exp(confint(my.model))
##### 1b #####
Q1 <- quantile(weather$rain, 0.25)
weather$lowrain <- as.numeric(weather$rain < Q1)
weather$low_cat <- factor(weather$lowrain,
levels = c(0, 1),
labels = c("high", "low"))
ggplot(data = weather, aes(x = month, y = lowrain, color = low_cat)) + geom_point(size = 0.1) +
labs(title = "")
table(weather$location, weather$low_cat)
pk = 102/(498+102)
pl = 117/(333+117)
pu = 155/(291+155)
oneminusprobs = rbind(1- pk, 1- pl,1- pu)
probs = rbind( pk, pl, pu)
odds = rbind( pk/(1-pk), pl/(1-pl), pu/(1-pu))
oddsrat = rbind( (pk/(1-pk))/(pk/(1-pk)), (pl/(1-pl))/(pk/(1-pk)), (pu/(1-pu))/(pk/(1-pk)))
cbind(table(weather$location, weather$low_cat), oneminusprobs, probs, odds, oddsrat)
##### 1c #####
Loc.model <- glm(low_cat ~ location, family = "binomial", data = weather)
summary(Loc.model)
confint(Loc.model)
# Lägg till e^beta estimates!
##### 1d #####
Press.model <- glm(low_cat ~ pressure, family = "binomial", data = weather)
summary(Press.model)
confint(Press.model)
ggplot(data = weather, aes(x = pressure, y = lowrain)) + geom_point(size = 1) +
geom_smooth() +
labs(title = "")
## mer att lägga till ; test och annat e^beta estimates
##### 1e #####
n = length(weather$lowrain)
Y = sum(weather$lowrain)
p = Y/n
ggplot(data = weather, aes(x = pressure, y = influence(Press.model)$hat)) + geom_point(size = 1) +
geom_line(aes(y = 1/n)) +
geom_line(aes(y = 2*(p+1)/n)) +
labs(title = "") + ylim(-0.0001, 0.006)
##### 1f #####
aic <- AIC(Press.model, Loc.model)
bic <- BIC(Press.model, Loc.model)
(collect.AIC <- data.frame(aic, bic))
null.model <- glm(low_cat ~ 1, family = "binomial", data = weather)
logLik(null.model)
(lnL0 <- logLik(null.model)[1])
(R2CS.max <- 1 - (exp(lnL0))^(2/n))
# Collect the log likelihoods L(betahat)
logliks <-
c(logLik(Press.model)[1],
logLik(Loc.model)[1])
##R2_McF###
(R2s <- 1 - logliks/lnL0)
##R2_McF,adj.###
# Note that p+1 = df (and df.1):
(R2s.adj <- 1 - (logliks - (collect.AIC$df - 1)/2)/lnL0)
##### 2a #####
null.model <- glm(low_cat ~ 1, family = "binomial", data = weather)
full.model <- glm(low_cat ~ pressure*location*speed*temp*monthnr, family = "binomial", data = weather)
StepNull <- step(null.model, scope = list(lower = null.model, upper = full.model), direction = "both", k = log(nrow(weather)))
StepFull <- step(full.model, scope = list(lower = null.model, upper = full.model), direction = "both", k = log(nrow(weather)))
BIC(StepNull)
BIC(StepFull)
AIC(StepNull)
AIC(StepFull)
aic <- AIC(StepNull, StepFull)
bic <- BIC(StepNull, StepFull)
(collect.AIC <- data.frame(aic, bic))
logLik(null.model)
(lnL0 <- logLik(null.model)[1])
(R2CS.max <- 1 - (exp(lnL0))^(2/n))
# Collect the log likelihoods L(betahat)
logliks <-
c(logLik(StepNull)[1],
logLik(StepFull)[1])
##R2_McF###
(R2s <- 1 - logliks/lnL0)
##R2_McF,adj.###
# Note that p+1 = df (and df.1):
(R2s.adj <- 1 - (logliks - (collect.AIC$df - 1)/2)/lnL0)
##### 2b #####
lst <- sort(cooks.distance(StepFull), index.return=TRUE, decreasing=TRUE)
coksD <- lapply(lst, `[`, lst$x %in% head(unique(lst$x),6))
weather$lowrain[1107]
weather$lowrain[664]
ggplot(data = weather, aes(x = StepFull$fitted.values, y = cooks.distance(StepFull), col = lowrain)) + geom_point(size = 1) +
geom_line(aes(y = 0.01), colour = "black") +
geom_point(aes(x = StepFull$fitted.values[1107], y = cooks.distance(StepFull)[1107]), size = 2, colour = 'magenta') +
geom_point(aes(x = StepFull$fitted.values[664], y = cooks.distance(StepFull)[664]), size = 2, colour = 'red') +
labs(title = "")
#magenta highest cooks D with 1 lowrain, red highest cooks D with 0 lowrain
##### 3a #####
pred.phat <- cbind(
weather,
p.0 = predict(StepFull, type = "response"))
pred.phat$yhat <- as.numeric(pred.phat$p.0 > 0.5)
(row.01 <- table(weather$low_cat))
(col.01.StepFull <- table(pred.phat$yhat))
(confusion.StepFull <- table(pred.phat$low_cat, pred.phat$yhat))
(spec.StepFull <- confusion.StepFull[1, 1] / row.01[1])
(sens.StepFull <- confusion.StepFull[2, 2] / row.01[2])
(accu.StepFull <- sum(diag(confusion.StepFull)) / sum(confusion.StepFull))
(prec.StepFull <- confusion.StepFull[2, 2] / col.01.StepFull[2])
##### 3b #####
pred.phat <- cbind(
weather,
p.0 = predict(Press.model, type = "response"),
p.1 = predict(Loc.model, type = "response"),
p.2 = predict(null.model, type = "response"),
p.3 = predict(full.model, type = "response"),
p.4 = predict(StepNull, type = "response"),
p.5 = predict(StepFull, type = "response"))
head(pred.phat)
# ROC-curves for all models##
roc.0 <- roc(low_cat ~ p.0, data = pred.phat)
roc.df.0 <- coords(roc.0, transpose = FALSE)
roc.df.0$model <- "Pressure"
roc.1 <- roc(low_cat ~ p.1, data = pred.phat)
roc.df.1 <- coords(roc.1, transpose = FALSE)
roc.df.1$model <- "Location"
roc.2 <- roc(low_cat ~ p.2, data = pred.phat)
roc.df.2 <- coords(roc.2, transpose = FALSE)
roc.df.2$model <- "Null"
roc.3 <- roc(low_cat ~ p.3, data = pred.phat)
roc.df.3 <- coords(roc.3, transpose = FALSE)
roc.df.3$model <- "Full"
roc.4 <- roc(low_cat ~ p.4, data = pred.phat)
roc.df.4 <- coords(roc.4, transpose = FALSE)
roc.df.4$model <- "Stepwise Fw"
roc.5 <- roc(low_cat ~ p.5, data = pred.phat)
roc.df.5 <- coords(roc.5, transpose = FALSE)
roc.df.5$model <- "Stepwise Bw"
roc.df <- rbind(roc.df.0, roc.df.1, roc.df.2, roc.df.3,
roc.df.4, roc.df.5)
## Plot all the curves, in different colors##
ggplot(roc.df, aes(specificity, sensitivity,
color = model)) +
geom_path(size = 1) +
coord_fixed() + # square plotting area
scale_x_reverse() + # Reverse scale on the x-axis!
labs(title = "ROC-curves for all the models") +
theme(text = element_text(size = 14))
# AUC##
(aucs <-
data.frame(
model = c("Pressure", "Location", "Null", "Full", "Stepwise Fw", "Stepwise Bw"),
auc = c(auc(roc.0), auc(roc.1), auc(roc.2),
auc(roc.3), auc(roc.4), auc(roc.5)),
lwr = c(ci(roc.0)[1], ci(roc.1)[1],
ci(roc.2)[1],
ci(roc.3)[1], ci(roc.4)[1],
ci(roc.5)[1]),
upr = c(ci(auc(roc.0))[3], ci(auc(roc.1))[3],
ci(auc(roc.2))[3],
ci(auc(roc.3))[3], ci(auc(roc.4))[3],
ci(auc(roc.5))[3])))
## Compare AUC for the models#
roc.test(roc.0, roc.5)
roc.test(roc.1, roc.5)
roc.test(roc.2, roc.5)
roc.test(roc.3, roc.5)
roc.test(roc.4, roc.5)
##### 3c #####
##Find best sens-spec#
### Experiment with different values#
# of levels for sens and spec, level.ss, to find
# the one that gives the optimal combination of sens and spec.
level.ss <- 0.75935
roc.df.5[roc.df.5$sensitivity > level.ss &
roc.df.5$specificity > level.ss, ]
##find the rownumber:
(I_max.5 <- which(roc.df.5$sensitivity > level.ss &
roc.df.5$specificity > level.ss))
roc.df.5[I_max.5, ]
### Pick out the corresponding threshold for p#
roc.df.5[I_max.5, "threshold"]
ggplot(roc.df, aes(specificity, sensitivity,
color = model)) +
geom_path(size = 1) +
geom_point(data = roc.df.5[I_max.5, ], color = "black", size = 3) +
coord_fixed() + # square plotting area
scale_x_reverse() + # Reverse scale on the x-axis!
labs(title = "ROC-curves for all the models") +
theme(text = element_text(size = 14))
pred.phat2 <- cbind(
weather,
p.0 = predict(StepFull, type = "response"))
pred.phat2$yhat <- as.numeric(pred.phat$p.0 > 0.25)
(row.01 <- table(weather$low_cat))
(col.01.StepFull <- table(pred.phat2$yhat))
(confusion.StepFull <- table(pred.phat2$low_cat, pred.phat$yhat))
(spec.StepFull <- confusion.StepFull[1, 1] / row.01[1])
(sens.StepFull <- confusion.StepFull[2, 2] / row.01[2])
(accu.StepFull <- sum(diag(confusion.StepFull)) / sum(confusion.StepFull))
(prec.StepFull <- confusion.StepFull[2, 2] / col.01.StepFull[2])
##### 3d #####
## HL using hoslem.test#
###Model 3##
# p+1:
length(StepFull$coefficients)
# so we need g > 3
# while the smallest expected value is at least approx 5:
# Allowing 4 here and have experimented with g:
pred.sort <- pred.phat[order(pred.phat$p.5), ]
pred.sort$rank <- seq(1, nrow(pred.sort))
head(pred.sort)
(HL.3 <- hoslem.test(pred.sort$lowrain, pred.sort$p.5, g = 30))
HL.3$expected
HL.3$observed
# All yhat0- and yhat1-values should be at least approx 5.
# Collect the data in a useful form for plotting:
(HL.df.3 <- data.frame(group = seq(1, 30),
Obs0 = HL.3$observed[, 1],
Obs1 = HL.3$observed[, 2],
Exp0 = HL.3$expected[, 1],
Exp1 = HL.3$expected[, 2]))
ggplot(HL.df.3, aes(x = group)) +
geom_line(aes(y = Obs0, linetype = "observed", color = "Y = 0"), size = 1) +
geom_line(aes(y = Obs1, linetype = "observed", color = "Y = 1"), size = 1) +
geom_line(aes(y = Exp0, linetype = "expected", color = "Y = 0"), size = 1) +
geom_line(aes(y = Exp1, linetype = "expected", color = "Y = 1"), size = 1) +
labs(title = "Model 3: Observed and expected in each group",
y = "number of observations") +
scale_x_continuous(breaks = seq(1, 30)) +
theme(text = element_text(size = 14))