##################### with interaction instead NAs in hmax and sst ########################
# setting the working directory #
setwd(paste(getwd(), "/Scripts", sep =""))
# uploading cleaned dataset #
faults.ready = read.table("thesisready.csv", sep = "\t", dec = ".", fill= TRUE, header = TRUE,
encoding = "UTF-8", stringsAsFactors = FALSE, quote= "")
faults.ready = faults.ready[,-c(1:8)]
faults.ready= faults.ready[,-2]
faults.ready= faults.ready[,-30]
# handling NAs hmax => 0's #
hmax0= 0
ind_missing= which(is.na(faults.ready$Average.of.hmax),arr.ind = T)
faults.ready$Average.of.hmax[ind_missing]=hmax0
ind_missing= which(is.na(faults.ready$X30Average.of.hmax),arr.ind = T)
faults.ready$X30Average.of.hmax[ind_missing]=hmax0
# handling NAs sst => creating interaction term offshore #
sst0= 0
ind_missing= which(is.na(faults.ready$Average.of.sst),arr.ind = T)
faults.ready$Average.of.sst[ind_missing]= sst0
ind_missing= which(is.na(faults.ready$X30Average.of.sst),arr.ind = T)
faults.ready$X30Average.of.sst[ind_missing]= sst0
faults.ready$Offshore = 1
faults.ready$Offshore = ifelse(faults.ready$Average.of.sst == 0,faults.ready$Offshore == 0, faults.ready$Offshore == 1)
faults.ready = na.omit(faults.ready)
# heatmap of correlation #
cor = cor(faults.ready, use= "complete.obs")
heatmap(cor, rowv = NA, Colv = NA, scale = "none")
# creating censoring = all of them died/failed #
faults.ready$Status= 1
# checking for outliers #
summary(faults.ready) # Lifetime interesting, wind interesting, not higher than 25 maybe bc then it turns off, min sum apcp #
# overall doesn't seem like there is a problem with outliers #
#### model ####
library("survival")
library("survminer")
library("fitdistrplus")
# proportional hazard #
cox.mod= coxph(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.t2m + faults.ready$Average.of.r2m + faults.ready$Average.of.den2m
+ faults.ready$Average.of.wg10m + faults.ready$Average.of.wg100m + faults.ready$Sum.of.apcp +faults.ready$Sum.of.asnow
+ faults.ready$Average.of.prsfc + faults.ready$Average.of.tcc + faults.ready$Average.of.sst + faults.ready$Average.of.hmax
+ faults.ready$Average.of.ws10m + faults.ready$Average.of.ws100m + faults.ready$Average.of.cbh
+ faults.ready$X30Average.of.t2m + faults.ready$X30Average.of.r2m + faults.ready$X30Average.of.den2m
+ faults.ready$X30Average.of.wg10m + faults.ready$X30Average.of.wg100m + faults.ready$X30Sum.of.apcp +faults.ready$X30Sum.of.asnow
+ faults.ready$X30Average.of.prsfc + faults.ready$X30Average.of.tcc + faults.ready$X30Average.of.sst + faults.ready$X30Average.of.hmax
+ faults.ready$X30Average.of.ws10m + faults.ready$X30Average.of.ws100m + faults.ready$X30Average.of.cbh)
# checking linearity (also for Weibull or???) #
plot(predict(cox.mod), residuals(cox.mod, type= "martingale"),
xlab= "Fitted values", ylab= "Martingale residuals",
main= "Residual Plot", las=1)
abline(h=0, col="blue")
lines(smooth.spline(predict(cox.mod), residuals((cox.mod), type="martingale")), col="red") # slight non-linearity, but not a big problem #
# checking the proportional assumption using Shoenfeld residuals #
test.proportionality= cox.zph(cox.mod)
test.proportionality # statistically significant, proportional hazard cannot be assumed #
# fitting the failures to weibull distribution #
fit.wei= fitdist(faults.ready$Lifetime, "weibull")
# weibull regression #
model.w= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.t2m + faults.ready$Average.of.r2m + faults.ready$Average.of.den2m
+ faults.ready$Average.of.wg10m + faults.ready$Average.of.wg100m + faults.ready$Sum.of.apcp +faults.ready$Sum.of.asnow
+ faults.ready$Average.of.prsfc + faults.ready$Average.of.tcc + faults.ready$Average.of.sst + faults.ready$Average.of.hmax
+ faults.ready$Average.of.ws10m + faults.ready$Average.of.ws100m + faults.ready$Average.of.cbh
+ faults.ready$X30Average.of.t2m + faults.ready$X30Average.of.r2m + faults.ready$X30Average.of.den2m
+ faults.ready$X30Average.of.wg10m + faults.ready$X30Average.of.wg100m + faults.ready$X30Sum.of.apcp +faults.ready$X30Sum.of.asnow
+ faults.ready$X30Average.of.prsfc + faults.ready$X30Average.of.tcc + faults.ready$X30Average.of.sst + faults.ready$X30Average.of.hmax
+ faults.ready$X30Average.of.ws10m + faults.ready$X30Average.of.ws100m + faults.ready$X30Average.of.cbh
, dist="weibull")
# individual regressions #
model1= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.t2m, dist= "weibull")
model2= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.r2m, dist= "weibull")
model3= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.den2m, dist= "weibull")
model4= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.wg10m, dist= "weibull")
model5= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.wg100m, dist= "weibull")
model6= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Sum.of.apcp, dist= "weibull")
model7= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Sum.of.asnow, dist= "weibull")
model8= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.prsfc, dist= "weibull")
model9= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.tcc, dist= "weibull")
model10= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.sst*faults.ready$Offshore, dist= "weibull")
model11= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.hmax, dist= "weibull")
model12= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.ws10m, dist= "weibull")
model13= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.ws100m, dist= "weibull")
model14= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.cbh, dist= "weibull")
model15= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.t2m, dist= "weibull")
model6= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.r2m, dist= "weibull")
model17= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.den2m, dist= "weibull")
model18= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.wg10m, dist= "weibull")
model19= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.wg100m, dist= "weibull")
model20= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Sum.of.apcp, dist= "weibull")
model21= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Sum.of.asnow, dist= "weibull")
model22= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.prsfc, dist= "weibull")
model23= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.tcc, dist= "weibull")
model24= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.sst*faults.ready$Offshore, dist= "weibull")
model25= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.hmax, dist= "weibull")
model26= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.ws10m, dist= "weibull")
model27= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.ws100m, dist= "weibull")
model28= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.cbh, dist= "weibull")
# I want R-squared ffs #
library(PAmeasures)
fit1= survreg(Surv(Lifetime, Status) ~ Average.of.t2m, data= faults.ready, dist= "weibull",x=TRUE,y=TRUE)
pam.survreg(fit1)
p <- predict(fit1, type = "response") #https://stackoverflow.com/a/51142934
with(ovarian, pam.censor(futime, p, fustat))
# evaluating performance(tu chce miec MSE, MAPE etc co mozna miec uzywajac accuracy() #
library(forecast)
predict.survreg(model1, faults.ready$Lifetime)
predict(model1, faults.ready$Lifetime)
performance= accuracy(model1, faults.ready$Lifetime)
############ PCA ###############
pca= faults.ready[,c(-1, -29)]
pca = pca[, !(colnames(pca) %in% c("Offshore","Status"))]
faults.pca= prcomp(na.omit(pca), center= TRUE, scale=TRUE) #28 principal components, each explaining percentage of total variation in the dataset #
summary(faults.pca)
# what is the optimal number of principal components (line plot of eigenvalues of principal components in analysis) #
screeplot(faults.pca, type="l", main= "Scree plot")
abline(1,0, col="red", lty= 2) # checking which components have eigvenvalue higher than 1 #
# bar plot of the variation explained by each principal component #
faults.pca.var= faults.pca$sdev^2
faults.pca.per= round(faults.pca.var/sum(faults.pca.var)*100, 1)
barplot(faults.pca.var, main= "Bar plot of variation explained", xlab= "PC number", ylab= "% of explained variation")
# or cumulative variance plot #
cumpro = cumsum(faults.pca.per)
plot(cumpro[0:28], xlab = "PC #", ylab = "Amount of explained variance", main = "Cumulative variance plot")
abline(v = 5, col="blue", lty=5) # at which component #
# 7 principal components #
library(psych)
pc7= principal(cor(pca, use= "complete.obs"), nfactors=7, rotate="none") # 7 most important components #
model.w= survreg(Surv(faults.ready$Lifetime,faults.ready$Status) ~ pc7 , dist="weibull")
predict(pc7, faults.ready$Lifetime)
# regression with 7 most important components (tu potrzebuje zrobić Weibull regression model z 7 najważniejszymi principal component) #
pcrmodel= survreg(Surv(faults.ready$Lifetime,faults.ready$Status) ~ faults.pca$x[,1:7], dist="weibull")
summary(faults.ready$X30Average.of.ws100m)
faults.ready$wind_x = ifelse(faults.ready$X30Average.of.ws100m > 10, 10, ifelse(faults.ready$X30Average.of.ws100m > 8, 8, 0))
assign_to_quantile <- function(x, q1, median, q3) {
if(x < q1){
result <- 0
} else if(x < median){
result <- 1
}
else if (x < q3){
result <- 2
}
else{
result <- 3
}
return(result)
}
n=nrow(faults.ready)
for(i in n){
faults.ready$wind_x[i] = assign_to_quantile(faults.ready$X30Average.of.ws100m[i], 5.8, 6.7, 11 )
}
faults.ready$wind_x = as.numeric(faults.ready$wind_x)
surv_fit <- survfit(Surv(Lifetime,Status) ~wind_x, data=faults.ready)
library(survminer)
ggsurvplot(surv_fit)