Untitled

 avatar
unknown
r
3 years ago
11 kB
2
Indexable
##################### with interaction instead NAs in hmax and sst ########################
# setting the working directory #
setwd(paste(getwd(), "/Scripts", sep =""))

# uploading cleaned dataset #

faults.ready = read.table("thesisready.csv", sep = "\t", dec = ".", fill= TRUE, header = TRUE,
                          encoding = "UTF-8", stringsAsFactors = FALSE, quote= "") 
faults.ready = faults.ready[,-c(1:8)]
faults.ready= faults.ready[,-2]
faults.ready= faults.ready[,-30]



# handling NAs hmax => 0's #
hmax0= 0
ind_missing= which(is.na(faults.ready$Average.of.hmax),arr.ind = T)
faults.ready$Average.of.hmax[ind_missing]=hmax0

ind_missing= which(is.na(faults.ready$X30Average.of.hmax),arr.ind = T)
faults.ready$X30Average.of.hmax[ind_missing]=hmax0

# handling NAs sst => creating interaction term offshore #
sst0= 0
ind_missing= which(is.na(faults.ready$Average.of.sst),arr.ind = T)
faults.ready$Average.of.sst[ind_missing]= sst0

ind_missing= which(is.na(faults.ready$X30Average.of.sst),arr.ind = T)
faults.ready$X30Average.of.sst[ind_missing]= sst0

faults.ready$Offshore = 1
faults.ready$Offshore = ifelse(faults.ready$Average.of.sst == 0,faults.ready$Offshore == 0, faults.ready$Offshore == 1)

faults.ready = na.omit(faults.ready)

# heatmap of correlation #
cor = cor(faults.ready, use= "complete.obs")
heatmap(cor, rowv = NA, Colv = NA, scale = "none")

# creating censoring = all of them died/failed #
faults.ready$Status= 1

# checking for outliers #
summary(faults.ready) # Lifetime interesting, wind interesting, not higher than 25 maybe bc then it turns off, min sum apcp #
# overall doesn't seem like there is a problem with outliers #

#### model ####
library("survival")
library("survminer")
library("fitdistrplus")

# proportional hazard #
cox.mod= coxph(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.t2m + faults.ready$Average.of.r2m + faults.ready$Average.of.den2m
               + faults.ready$Average.of.wg10m + faults.ready$Average.of.wg100m + faults.ready$Sum.of.apcp +faults.ready$Sum.of.asnow
               + faults.ready$Average.of.prsfc + faults.ready$Average.of.tcc + faults.ready$Average.of.sst + faults.ready$Average.of.hmax
               + faults.ready$Average.of.ws10m + faults.ready$Average.of.ws100m + faults.ready$Average.of.cbh
               + faults.ready$X30Average.of.t2m + faults.ready$X30Average.of.r2m + faults.ready$X30Average.of.den2m
               + faults.ready$X30Average.of.wg10m + faults.ready$X30Average.of.wg100m + faults.ready$X30Sum.of.apcp +faults.ready$X30Sum.of.asnow
               + faults.ready$X30Average.of.prsfc + faults.ready$X30Average.of.tcc + faults.ready$X30Average.of.sst + faults.ready$X30Average.of.hmax
               + faults.ready$X30Average.of.ws10m + faults.ready$X30Average.of.ws100m + faults.ready$X30Average.of.cbh)

# checking linearity (also for Weibull or???) #
plot(predict(cox.mod), residuals(cox.mod, type= "martingale"),
     xlab= "Fitted values", ylab= "Martingale residuals",
     main= "Residual Plot", las=1)
abline(h=0, col="blue")
lines(smooth.spline(predict(cox.mod), residuals((cox.mod), type="martingale")), col="red") # slight non-linearity, but not a big problem #

# checking the proportional assumption using Shoenfeld residuals #
test.proportionality= cox.zph(cox.mod)
test.proportionality # statistically significant, proportional hazard cannot be assumed #


# fitting the failures to weibull distribution #
fit.wei= fitdist(faults.ready$Lifetime, "weibull")

# weibull regression #

model.w= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.t2m + faults.ready$Average.of.r2m + faults.ready$Average.of.den2m
                 + faults.ready$Average.of.wg10m + faults.ready$Average.of.wg100m + faults.ready$Sum.of.apcp +faults.ready$Sum.of.asnow
                 + faults.ready$Average.of.prsfc + faults.ready$Average.of.tcc + faults.ready$Average.of.sst + faults.ready$Average.of.hmax
                 + faults.ready$Average.of.ws10m + faults.ready$Average.of.ws100m + faults.ready$Average.of.cbh
                 + faults.ready$X30Average.of.t2m + faults.ready$X30Average.of.r2m + faults.ready$X30Average.of.den2m
                 + faults.ready$X30Average.of.wg10m + faults.ready$X30Average.of.wg100m + faults.ready$X30Sum.of.apcp +faults.ready$X30Sum.of.asnow
                 + faults.ready$X30Average.of.prsfc + faults.ready$X30Average.of.tcc + faults.ready$X30Average.of.sst + faults.ready$X30Average.of.hmax
                 + faults.ready$X30Average.of.ws10m + faults.ready$X30Average.of.ws100m + faults.ready$X30Average.of.cbh
                 , dist="weibull")


# individual regressions #
model1= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.t2m, dist= "weibull")
model2= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.r2m, dist= "weibull")
model3= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.den2m, dist= "weibull")
model4= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.wg10m, dist= "weibull")
model5= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.wg100m, dist= "weibull")
model6= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Sum.of.apcp, dist= "weibull")
model7= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Sum.of.asnow, dist= "weibull")
model8= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.prsfc, dist= "weibull")
model9= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.tcc, dist= "weibull")
model10= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.sst*faults.ready$Offshore, dist= "weibull")
model11= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.hmax, dist= "weibull")
model12= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.ws10m, dist= "weibull")
model13= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.ws100m, dist= "weibull")
model14= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.cbh, dist= "weibull")
model15= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.t2m, dist= "weibull")
model6= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.r2m, dist= "weibull")
model17= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.den2m, dist= "weibull")
model18= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.wg10m, dist= "weibull")
model19= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.wg100m, dist= "weibull")
model20= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Sum.of.apcp, dist= "weibull")
model21= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Sum.of.asnow, dist= "weibull")
model22= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.prsfc, dist= "weibull")
model23= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.tcc, dist= "weibull")
model24= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.sst*faults.ready$Offshore, dist= "weibull")
model25= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.hmax, dist= "weibull")
model26= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.ws10m, dist= "weibull")
model27= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.ws100m, dist= "weibull")
model28= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.cbh, dist= "weibull")

# I want R-squared ffs #
library(PAmeasures)
fit1= survreg(Surv(Lifetime, Status) ~ Average.of.t2m, data= faults.ready, dist= "weibull",x=TRUE,y=TRUE)
pam.survreg(fit1)

p <- predict(fit1, type = "response") #https://stackoverflow.com/a/51142934
with(ovarian, pam.censor(futime, p, fustat))


# evaluating performance(tu chce miec MSE, MAPE etc co mozna miec uzywajac accuracy() #
library(forecast)
predict.survreg(model1, faults.ready$Lifetime)
predict(model1, faults.ready$Lifetime)
performance= accuracy(model1, faults.ready$Lifetime)



############ PCA ###############
pca= faults.ready[,c(-1, -29)]
pca = pca[, !(colnames(pca) %in% c("Offshore","Status"))]
faults.pca= prcomp(na.omit(pca), center= TRUE, scale=TRUE) #28 principal components, each explaining percentage of total variation in the dataset #
summary(faults.pca)

# what is the optimal number of principal components (line plot of eigenvalues of principal components in analysis) #
screeplot(faults.pca, type="l", main= "Scree plot")
abline(1,0, col="red", lty= 2) # checking which components have eigvenvalue higher than 1 #

# bar plot of the variation explained by each principal component #
faults.pca.var= faults.pca$sdev^2
faults.pca.per= round(faults.pca.var/sum(faults.pca.var)*100, 1)
barplot(faults.pca.var, main= "Bar plot of variation explained", xlab= "PC number", ylab= "% of explained variation")

# or cumulative variance plot #
cumpro = cumsum(faults.pca.per)
plot(cumpro[0:28], xlab = "PC #", ylab = "Amount of explained variance", main = "Cumulative variance plot")
abline(v = 5, col="blue", lty=5) # at which component #

# 7 principal components #
library(psych)
pc7= principal(cor(pca, use= "complete.obs"), nfactors=7, rotate="none") # 7 most important components #

model.w= survreg(Surv(faults.ready$Lifetime,faults.ready$Status) ~ pc7 , dist="weibull")
predict(pc7, faults.ready$Lifetime)

# regression with 7 most important components (tu potrzebuje zrobić Weibull regression model z 7 najważniejszymi principal component) #
pcrmodel= survreg(Surv(faults.ready$Lifetime,faults.ready$Status) ~ faults.pca$x[,1:7], dist="weibull")



summary(faults.ready$X30Average.of.ws100m)


faults.ready$wind_x = ifelse(faults.ready$X30Average.of.ws100m > 10, 10,  ifelse(faults.ready$X30Average.of.ws100m > 8, 8, 0))

assign_to_quantile <- function(x, q1, median, q3) {

  if(x < q1){
    result <- 0
  } else if(x < median){
    result <- 1
  }
  else if (x < q3){
    result <- 2
  }
  else{
    result <- 3
  }
  return(result)
  }



n=nrow(faults.ready)

for(i in n){
  faults.ready$wind_x[i] = assign_to_quantile(faults.ready$X30Average.of.ws100m[i], 5.8, 6.7, 11 )
}
faults.ready$wind_x = as.numeric(faults.ready$wind_x)

surv_fit <- survfit(Surv(Lifetime,Status) ~wind_x, data=faults.ready)
library(survminer)
ggsurvplot(surv_fit)