Untitled

mail@pastecode.io avatar
unknown
r
2 years ago
10 kB
3
Indexable
Never
library(ggplot2)
library(patchwork)

# This library is for calculating ROC-curves, AUC and
# confidence interval for AUc:
library(pROC)
# Hosmer-Lemeshow goodness of fit test:
library(ResourceSelection)


weather <- read.csv("Data/weather.csv")

weather$location <- relevel(as.factor(weather$location), "Katterjåkk")
weather$monthnr <- as.factor(substr(weather$month, 6, 7))

weather$drymonth <- as.numeric(weather$rain < 0.5)
n = length(weather$drymonth)
Y = sum(weather$drymonth)

p = Y/n
p
odds = p/(1-p)
odds
log(odds)

my.model <- glm(drymonth ~ 1, family = "binomial", data = weather)
summary(my.model)

p + 1.96*sqrt(p*(1-p)/n)
p - 1.96*sqrt(p*(1-p)/n)

upper <- exp( log(odds) + 1.96*sqrt(1/Y + 1/(n-Y)) )
(upperCi <- upper/(1+upper))

upperl<- exp( log(odds) - 1.96*sqrt(1/Y + 1/(n-Y)) )
(upperlCi <- upperl/(1+upperl))

exp(log(odds) - 1.96*sqrt(1/Y + 1/(n-Y)))

exp(confint(my.model))

##### 1b #####
Q1 <- quantile(weather$rain, 0.25)
weather$lowrain <- as.numeric(weather$rain < Q1)

weather$low_cat <- factor(weather$lowrain,
                          levels = c(0, 1),
                          labels = c("high", "low"))

ggplot(data = weather, aes(x = month, y = lowrain, color = low_cat)) + geom_point(size = 0.1) +
  labs(title = "")


table(weather$location, weather$low_cat)

pk = 102/(498+102)
pl = 117/(333+117)
pu = 155/(291+155)
oneminusprobs = rbind(1- pk, 1- pl,1- pu)
probs = rbind( pk, pl, pu)
odds = rbind( pk/(1-pk), pl/(1-pl), pu/(1-pu))
oddsrat = rbind( (pk/(1-pk))/(pk/(1-pk)), (pl/(1-pl))/(pk/(1-pk)), (pu/(1-pu))/(pk/(1-pk)))
cbind(table(weather$location, weather$low_cat), oneminusprobs, probs, odds, oddsrat)


##### 1c #####
Loc.model <- glm(low_cat ~ location, family = "binomial", data = weather)
summary(Loc.model)
confint(Loc.model)

# Lägg till e^beta estimates!
##### 1d #####
Press.model <- glm(low_cat ~ pressure, family = "binomial", data = weather)
summary(Press.model)
confint(Press.model)

ggplot(data = weather, aes(x = pressure, y = lowrain)) + geom_point(size = 1) +
  geom_smooth() +
  labs(title = "")
## mer att lägga till ; test och annat e^beta estimates

##### 1e #####
n = length(weather$lowrain)
Y = sum(weather$lowrain)
p = Y/n

ggplot(data = weather, aes(x = pressure, y = influence(Press.model)$hat)) + geom_point(size = 1) +
  geom_line(aes(y = 1/n)) +
  geom_line(aes(y = 2*(p+1)/n)) +
  labs(title = "") + ylim(-0.0001, 0.006)


##### 1f #####

aic <- AIC(Press.model, Loc.model)
bic <- BIC(Press.model, Loc.model)
(collect.AIC <- data.frame(aic, bic))


null.model <- glm(low_cat ~ 1, family = "binomial", data = weather)

logLik(null.model)
(lnL0 <- logLik(null.model)[1])

(R2CS.max <- 1 - (exp(lnL0))^(2/n))
# Collect the log likelihoods L(betahat)
logliks <-
  c(logLik(Press.model)[1],
    logLik(Loc.model)[1])
##R2_McF###
(R2s <- 1 - logliks/lnL0)
##R2_McF,adj.###
# Note that p+1 = df (and df.1):
(R2s.adj <- 1 - (logliks - (collect.AIC$df - 1)/2)/lnL0)


##### 2a #####
null.model <- glm(low_cat ~ 1, family = "binomial", data = weather)
full.model <- glm(low_cat ~ pressure*location*speed*temp*monthnr, family = "binomial", data = weather)


StepNull <- step(null.model, scope = list(lower = null.model, upper = full.model), direction = "both", k = log(nrow(weather)))
StepFull <- step(full.model, scope = list(lower = null.model, upper = full.model), direction = "both", k = log(nrow(weather)))


BIC(StepNull)
BIC(StepFull)

AIC(StepNull)
AIC(StepFull)

aic <- AIC(StepNull, StepFull)
bic <- BIC(StepNull, StepFull)
(collect.AIC <- data.frame(aic, bic))


logLik(null.model)
(lnL0 <- logLik(null.model)[1])

(R2CS.max <- 1 - (exp(lnL0))^(2/n))
# Collect the log likelihoods L(betahat)
logliks <-
  c(logLik(StepNull)[1],
    logLik(StepFull)[1])
##R2_McF###
(R2s <- 1 - logliks/lnL0)
##R2_McF,adj.###
# Note that p+1 = df (and df.1):
(R2s.adj <- 1 - (logliks - (collect.AIC$df - 1)/2)/lnL0)

##### 2b #####

lst <- sort(cooks.distance(StepFull), index.return=TRUE, decreasing=TRUE)
coksD <- lapply(lst, `[`, lst$x %in% head(unique(lst$x),6))
weather$lowrain[1107]
weather$lowrain[664]

ggplot(data = weather, aes(x = StepFull$fitted.values, y = cooks.distance(StepFull), col = lowrain)) + geom_point(size = 1) +
  geom_line(aes(y = 0.01), colour = "black") +
  geom_point(aes(x = StepFull$fitted.values[1107], y = cooks.distance(StepFull)[1107]), size = 2, colour = 'magenta') +
  geom_point(aes(x = StepFull$fitted.values[664], y = cooks.distance(StepFull)[664]), size = 2, colour = 'red') +
  labs(title = "")
#magenta highest cooks D with 1 lowrain, red highest cooks D with 0 lowrain


##### 3a #####


pred.phat <- cbind(
  weather,
  p.0 = predict(StepFull, type = "response"))


pred.phat$yhat <- as.numeric(pred.phat$p.0 > 0.5)

(row.01 <- table(weather$low_cat))

(col.01.StepFull <- table(pred.phat$yhat))
(confusion.StepFull <- table(pred.phat$low_cat, pred.phat$yhat))
(spec.StepFull <- confusion.StepFull[1, 1] / row.01[1])
(sens.StepFull <- confusion.StepFull[2, 2] / row.01[2])
(accu.StepFull <- sum(diag(confusion.StepFull)) / sum(confusion.StepFull))
(prec.StepFull <- confusion.StepFull[2, 2] / col.01.StepFull[2])

##### 3b #####

pred.phat <- cbind(
  weather,
  p.0 = predict(Press.model, type = "response"),
  p.1 = predict(Loc.model, type = "response"),
  p.2 = predict(null.model, type = "response"),
  p.3 = predict(full.model, type = "response"),
  p.4 = predict(StepNull, type = "response"),
  p.5 = predict(StepFull, type = "response"))
head(pred.phat)

# ROC-curves for all models##
roc.0 <- roc(low_cat ~ p.0, data = pred.phat)
roc.df.0 <- coords(roc.0, transpose = FALSE)
roc.df.0$model <- "Pressure"
roc.1 <- roc(low_cat ~ p.1, data = pred.phat)
roc.df.1 <- coords(roc.1, transpose = FALSE)
roc.df.1$model <- "Location"
roc.2 <- roc(low_cat ~ p.2, data = pred.phat)
roc.df.2 <- coords(roc.2, transpose = FALSE)
roc.df.2$model <- "Null"
roc.3 <- roc(low_cat ~ p.3, data = pred.phat)
roc.df.3 <- coords(roc.3, transpose = FALSE)
roc.df.3$model <- "Full"
roc.4 <- roc(low_cat ~ p.4, data = pred.phat)
roc.df.4 <- coords(roc.4, transpose = FALSE)
roc.df.4$model <- "Stepwise Fw"
roc.5 <- roc(low_cat ~ p.5, data = pred.phat)
roc.df.5 <- coords(roc.5, transpose = FALSE)
roc.df.5$model <- "Stepwise Bw"

roc.df <- rbind(roc.df.0, roc.df.1, roc.df.2, roc.df.3,
                roc.df.4, roc.df.5)

## Plot all the curves, in different colors##
ggplot(roc.df, aes(specificity, sensitivity,
                   color = model)) +
  geom_path(size = 1) +
  coord_fixed() +       # square plotting area
  scale_x_reverse() +   # Reverse scale on the x-axis!
  labs(title = "ROC-curves for all the models") +
  theme(text = element_text(size = 14))


# AUC##
(aucs <-
   data.frame(
     model = c("Pressure", "Location", "Null", "Full", "Stepwise Fw", "Stepwise Bw"),
     auc = c(auc(roc.0), auc(roc.1), auc(roc.2),
             auc(roc.3), auc(roc.4), auc(roc.5)),
     lwr = c(ci(roc.0)[1], ci(roc.1)[1],
             ci(roc.2)[1],
             ci(roc.3)[1], ci(roc.4)[1],
             ci(roc.5)[1]),
     upr = c(ci(auc(roc.0))[3], ci(auc(roc.1))[3],
             ci(auc(roc.2))[3],
             ci(auc(roc.3))[3], ci(auc(roc.4))[3],
             ci(auc(roc.5))[3])))

## Compare AUC for the models#
roc.test(roc.0, roc.5)
roc.test(roc.1, roc.5)
roc.test(roc.2, roc.5)
roc.test(roc.3, roc.5)
roc.test(roc.4, roc.5)



##### 3c #####


##Find best sens-spec#
### Experiment with different values#
# of levels for sens and spec, level.ss, to find
# the one that gives the optimal combination of sens and spec.
level.ss <- 0.75935
roc.df.5[roc.df.5$sensitivity > level.ss &
           roc.df.5$specificity > level.ss, ]
##find the rownumber:
(I_max.5 <- which(roc.df.5$sensitivity > level.ss &
                    roc.df.5$specificity > level.ss))
roc.df.5[I_max.5, ]
### Pick out the corresponding threshold for p#
roc.df.5[I_max.5, "threshold"]

ggplot(roc.df, aes(specificity, sensitivity,
                   color = model)) +
  geom_path(size = 1) +
  geom_point(data = roc.df.5[I_max.5, ], color = "black", size = 3) +
  coord_fixed() +       # square plotting area
  scale_x_reverse() +   # Reverse scale on the x-axis!
  labs(title = "ROC-curves for all the models") +
  theme(text = element_text(size = 14))



pred.phat2 <- cbind(
  weather,
  p.0 = predict(StepFull, type = "response"))


pred.phat2$yhat <- as.numeric(pred.phat$p.0 > 0.25)

(row.01 <- table(weather$low_cat))

(col.01.StepFull <- table(pred.phat2$yhat))
(confusion.StepFull <- table(pred.phat2$low_cat, pred.phat$yhat))
(spec.StepFull <- confusion.StepFull[1, 1] / row.01[1])
(sens.StepFull <- confusion.StepFull[2, 2] / row.01[2])
(accu.StepFull <- sum(diag(confusion.StepFull)) / sum(confusion.StepFull))
(prec.StepFull <- confusion.StepFull[2, 2] / col.01.StepFull[2])



##### 3d #####


## HL using hoslem.test#
###Model 3##
# p+1:
length(StepFull$coefficients)
# so we need g > 3
# while the smallest expected value is at least approx 5:
# Allowing 4 here and have experimented with g:
pred.sort <- pred.phat[order(pred.phat$p.5), ]
pred.sort$rank <- seq(1, nrow(pred.sort))
head(pred.sort)

(HL.3 <- hoslem.test(pred.sort$lowrain, pred.sort$p.5, g = 30))
HL.3$expected
HL.3$observed
# All yhat0- and yhat1-values should be at least approx 5.

# Collect the data in a useful form for plotting:
(HL.df.3 <- data.frame(group = seq(1, 30),
                       Obs0 = HL.3$observed[, 1],
                       Obs1 = HL.3$observed[, 2],
                       Exp0 = HL.3$expected[, 1],
                       Exp1 = HL.3$expected[, 2]))

ggplot(HL.df.3, aes(x = group)) +
  geom_line(aes(y = Obs0, linetype = "observed", color = "Y = 0"), size = 1) +
  geom_line(aes(y = Obs1, linetype = "observed", color = "Y = 1"), size = 1) +
  geom_line(aes(y = Exp0, linetype = "expected", color = "Y = 0"), size = 1) +
  geom_line(aes(y = Exp1, linetype = "expected", color = "Y = 1"), size = 1) +
  labs(title = "Model 3: Observed and expected in each group",
       y = "number of observations") +
  scale_x_continuous(breaks = seq(1, 30)) +
  theme(text = element_text(size = 14))