Untitled

 avatar
unknown
r
19 days ago
1.9 kB
2
Indexable

install.packages("dplyr")
install.packages("MASS")
install.packages("ggeffects")

# load these libraries
library(MASS)
library(dplyr)
library(ggeffects)

# loading the dataset here by equating a variable called "df"
# to birthwt, this dataset has to do with birth weights whilst
# controlling for different variables such as bwt or smoking status
df = birthwt
View(df)

# manipulating the dataset here to edit the values of bwt
# bwt signifies birth weight
df$bwt = (2000 + df$bwt) - (df$age^2)

# make a scatter plot with age as x axis, bwt as y axis
# the $ operator means we are accessing all the rows of the specific
# column 
# e.g df is our dataframe, df$age accesses the age column in df
plot(df$age, log(df$bwt))

# we need to filter the outlier out
df = subset(df, age < 40)

plot(df$age, df$bwt)

# making a regression line plot
abline(lm(df$bwt ~ df$age), data = df, col = "red")

# our null model i.e just using the mean to make that regression line
null_model <- lm(bwt ~ 1, data = df)
# our alternative model
alt_model <- lm(bwt ~ age, data = df)
# extending our alternative model by adding more predictors and a multiplicative term instead of an additive term 

# i.e the regression formula is now bwt = B0 + B1(Age) + B2(Smoke) + B3(Age x Smoke)
extended_model <- lm(bwt ~ age * smoke + lwt, data = df)

# testing normality of residuals with the Shapiro test
shapiro.test(residuals(alt_model))
# visually inspecting the histogram of residuals to see if it roughly follows
# a normal distribution
hist(residuals(alt_model))

# this is another way to visualise residual distribution
# dots should roughly follow the straight line from qqline()
qqnorm(residuals(alt_model))
qqline(residuals(alt_model))

# comparing our models
# a lower df on output means that model is better
anova(alt_model, model_2)


# generating predictions
ggpredict(model_2, terms = c("age[15:45]", "smoke", "lwt"), data = df) %>% plot()

Editor is loading...
Leave a Comment