Untitled
unknown
r
3 years ago
11 kB
3
Indexable
##################### with interaction instead NAs in hmax and sst ######################## # setting the working directory # setwd(paste(getwd(), "/Scripts", sep ="")) # uploading cleaned dataset # faults.ready = read.table("thesisready.csv", sep = "\t", dec = ".", fill= TRUE, header = TRUE, encoding = "UTF-8", stringsAsFactors = FALSE, quote= "") faults.ready = faults.ready[,-c(1:8)] faults.ready= faults.ready[,-2] faults.ready= faults.ready[,-30] # handling NAs hmax => 0's # hmax0= 0 ind_missing= which(is.na(faults.ready$Average.of.hmax),arr.ind = T) faults.ready$Average.of.hmax[ind_missing]=hmax0 ind_missing= which(is.na(faults.ready$X30Average.of.hmax),arr.ind = T) faults.ready$X30Average.of.hmax[ind_missing]=hmax0 # handling NAs sst => creating interaction term offshore # sst0= 0 ind_missing= which(is.na(faults.ready$Average.of.sst),arr.ind = T) faults.ready$Average.of.sst[ind_missing]= sst0 ind_missing= which(is.na(faults.ready$X30Average.of.sst),arr.ind = T) faults.ready$X30Average.of.sst[ind_missing]= sst0 faults.ready$Offshore = 1 faults.ready$Offshore = ifelse(faults.ready$Average.of.sst == 0,faults.ready$Offshore == 0, faults.ready$Offshore == 1) faults.ready = na.omit(faults.ready) # heatmap of correlation # cor = cor(faults.ready, use= "complete.obs") heatmap(cor, rowv = NA, Colv = NA, scale = "none") # creating censoring = all of them died/failed # faults.ready$Status= 1 # checking for outliers # summary(faults.ready) # Lifetime interesting, wind interesting, not higher than 25 maybe bc then it turns off, min sum apcp # # overall doesn't seem like there is a problem with outliers # #### model #### library("survival") library("survminer") library("fitdistrplus") # proportional hazard # cox.mod= coxph(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.t2m + faults.ready$Average.of.r2m + faults.ready$Average.of.den2m + faults.ready$Average.of.wg10m + faults.ready$Average.of.wg100m + faults.ready$Sum.of.apcp +faults.ready$Sum.of.asnow + faults.ready$Average.of.prsfc + faults.ready$Average.of.tcc + faults.ready$Average.of.sst + faults.ready$Average.of.hmax + faults.ready$Average.of.ws10m + faults.ready$Average.of.ws100m + faults.ready$Average.of.cbh + faults.ready$X30Average.of.t2m + faults.ready$X30Average.of.r2m + faults.ready$X30Average.of.den2m + faults.ready$X30Average.of.wg10m + faults.ready$X30Average.of.wg100m + faults.ready$X30Sum.of.apcp +faults.ready$X30Sum.of.asnow + faults.ready$X30Average.of.prsfc + faults.ready$X30Average.of.tcc + faults.ready$X30Average.of.sst + faults.ready$X30Average.of.hmax + faults.ready$X30Average.of.ws10m + faults.ready$X30Average.of.ws100m + faults.ready$X30Average.of.cbh) # checking linearity (also for Weibull or???) # plot(predict(cox.mod), residuals(cox.mod, type= "martingale"), xlab= "Fitted values", ylab= "Martingale residuals", main= "Residual Plot", las=1) abline(h=0, col="blue") lines(smooth.spline(predict(cox.mod), residuals((cox.mod), type="martingale")), col="red") # slight non-linearity, but not a big problem # # checking the proportional assumption using Shoenfeld residuals # test.proportionality= cox.zph(cox.mod) test.proportionality # statistically significant, proportional hazard cannot be assumed # # fitting the failures to weibull distribution # fit.wei= fitdist(faults.ready$Lifetime, "weibull") # weibull regression # model.w= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.t2m + faults.ready$Average.of.r2m + faults.ready$Average.of.den2m + faults.ready$Average.of.wg10m + faults.ready$Average.of.wg100m + faults.ready$Sum.of.apcp +faults.ready$Sum.of.asnow + faults.ready$Average.of.prsfc + faults.ready$Average.of.tcc + faults.ready$Average.of.sst + faults.ready$Average.of.hmax + faults.ready$Average.of.ws10m + faults.ready$Average.of.ws100m + faults.ready$Average.of.cbh + faults.ready$X30Average.of.t2m + faults.ready$X30Average.of.r2m + faults.ready$X30Average.of.den2m + faults.ready$X30Average.of.wg10m + faults.ready$X30Average.of.wg100m + faults.ready$X30Sum.of.apcp +faults.ready$X30Sum.of.asnow + faults.ready$X30Average.of.prsfc + faults.ready$X30Average.of.tcc + faults.ready$X30Average.of.sst + faults.ready$X30Average.of.hmax + faults.ready$X30Average.of.ws10m + faults.ready$X30Average.of.ws100m + faults.ready$X30Average.of.cbh , dist="weibull") # individual regressions # model1= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.t2m, dist= "weibull") model2= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.r2m, dist= "weibull") model3= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.den2m, dist= "weibull") model4= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.wg10m, dist= "weibull") model5= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.wg100m, dist= "weibull") model6= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Sum.of.apcp, dist= "weibull") model7= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Sum.of.asnow, dist= "weibull") model8= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.prsfc, dist= "weibull") model9= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.tcc, dist= "weibull") model10= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.sst*faults.ready$Offshore, dist= "weibull") model11= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.hmax, dist= "weibull") model12= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.ws10m, dist= "weibull") model13= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.ws100m, dist= "weibull") model14= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$Average.of.cbh, dist= "weibull") model15= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.t2m, dist= "weibull") model6= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.r2m, dist= "weibull") model17= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.den2m, dist= "weibull") model18= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.wg10m, dist= "weibull") model19= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.wg100m, dist= "weibull") model20= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Sum.of.apcp, dist= "weibull") model21= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Sum.of.asnow, dist= "weibull") model22= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.prsfc, dist= "weibull") model23= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.tcc, dist= "weibull") model24= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.sst*faults.ready$Offshore, dist= "weibull") model25= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.hmax, dist= "weibull") model26= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.ws10m, dist= "weibull") model27= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.ws100m, dist= "weibull") model28= survreg(Surv(faults.ready$Lifetime,faults.ready$Status)~ faults.ready$X30Average.of.cbh, dist= "weibull") # I want R-squared ffs # library(PAmeasures) fit1= survreg(Surv(Lifetime, Status) ~ Average.of.t2m, data= faults.ready, dist= "weibull",x=TRUE,y=TRUE) pam.survreg(fit1) p <- predict(fit1, type = "response") #https://stackoverflow.com/a/51142934 with(ovarian, pam.censor(futime, p, fustat)) # evaluating performance(tu chce miec MSE, MAPE etc co mozna miec uzywajac accuracy() # library(forecast) predict.survreg(model1, faults.ready$Lifetime) predict(model1, faults.ready$Lifetime) performance= accuracy(model1, faults.ready$Lifetime) ############ PCA ############### pca= faults.ready[,c(-1, -29)] pca = pca[, !(colnames(pca) %in% c("Offshore","Status"))] faults.pca= prcomp(na.omit(pca), center= TRUE, scale=TRUE) #28 principal components, each explaining percentage of total variation in the dataset # summary(faults.pca) # what is the optimal number of principal components (line plot of eigenvalues of principal components in analysis) # screeplot(faults.pca, type="l", main= "Scree plot") abline(1,0, col="red", lty= 2) # checking which components have eigvenvalue higher than 1 # # bar plot of the variation explained by each principal component # faults.pca.var= faults.pca$sdev^2 faults.pca.per= round(faults.pca.var/sum(faults.pca.var)*100, 1) barplot(faults.pca.var, main= "Bar plot of variation explained", xlab= "PC number", ylab= "% of explained variation") # or cumulative variance plot # cumpro = cumsum(faults.pca.per) plot(cumpro[0:28], xlab = "PC #", ylab = "Amount of explained variance", main = "Cumulative variance plot") abline(v = 5, col="blue", lty=5) # at which component # # 7 principal components # library(psych) pc7= principal(cor(pca, use= "complete.obs"), nfactors=7, rotate="none") # 7 most important components # model.w= survreg(Surv(faults.ready$Lifetime,faults.ready$Status) ~ pc7 , dist="weibull") predict(pc7, faults.ready$Lifetime) # regression with 7 most important components (tu potrzebuje zrobić Weibull regression model z 7 najważniejszymi principal component) # pcrmodel= survreg(Surv(faults.ready$Lifetime,faults.ready$Status) ~ faults.pca$x[,1:7], dist="weibull") summary(faults.ready$X30Average.of.ws100m) faults.ready$wind_x = ifelse(faults.ready$X30Average.of.ws100m > 10, 10, ifelse(faults.ready$X30Average.of.ws100m > 8, 8, 0)) assign_to_quantile <- function(x, q1, median, q3) { if(x < q1){ result <- 0 } else if(x < median){ result <- 1 } else if (x < q3){ result <- 2 } else{ result <- 3 } return(result) } n=nrow(faults.ready) for(i in n){ faults.ready$wind_x[i] = assign_to_quantile(faults.ready$X30Average.of.ws100m[i], 5.8, 6.7, 11 ) } faults.ready$wind_x = as.numeric(faults.ready$wind_x) surv_fit <- survfit(Surv(Lifetime,Status) ~wind_x, data=faults.ready) library(survminer) ggsurvplot(surv_fit)
Editor is loading...