Untitled
unknown
plain_text
2 years ago
9.5 kB
5
Indexable
#bibliotecas necessárias library(corrplot) library(car) library(usdm) library(gvlma) library(MASS) library(leaps) library(ggplot2) ###### Função tabela ANOVA # Credits to: Russell Steele # Build Anova table in classic format anova_reg = function (object, reg_collapse=TRUE,...) { if (length(list(object, ...)) > 1L) return(anova.lmlist(object, ...)) if (!inherits(object, "lm")) warning("calling anova.lm(<fake-lm-object>) ...") w <- object$weights ssr <- sum(if (is.null(w)) object$residuals^2 else w * object$residuals^2) mss <- sum(if (is.null(w)) object$fitted.values^2 else w * object$fitted.values^2) if (ssr < 1e-10 * mss) warning("ANOVA F-tests on an essentially perfect fit are unreliable") dfr <- df.residual(object) p <- object$rank if (p > 0L) { p1 <- 1L:p comp <- object$effects[p1] asgn <- object$assign[stats:::qr.lm(object)$pivot][p1] nmeffects <- c("(Intercept)", attr(object$terms, "term.labels")) tlabels <- nmeffects[1 + unique(asgn)] ss <- c(vapply(split(comp^2, asgn), sum, 1), ssr) df <- c(lengths(split(asgn, asgn)), dfr) if(reg_collapse){ if(attr(object$terms, "intercept")){ collapse_p<-2:(length(ss)-1) ss<-c(ss[1],sum(ss[collapse_p]),ss[length(ss)]) df<-c(df[1],sum(df[collapse_p]),df[length(df)]) tlabels<-c(tlabels[1],"Source") } else{ collapse_p<-1:(length(ss)-1) ss<-c(sum(ss[collapse_p]),ss[length(ss)]) df<-c(df[1],sum(df[collapse_p]),df[length(df)]) tlabels<-c("Regression") } } }else { ss <- ssr df <- dfr tlabels <- character() if(reg_collapse){ collapse_p<-1:(length(ss)-1) ss<-c(sum(ss[collapse_p]),ss[length(ss)]) df<-c(df[1],sum(df[collapse_p]),df[length(df)]) } } ms <- ss/df f <- ms/(ssr/dfr) P <- pf(f, df, dfr, lower.tail = FALSE) table <- data.frame(df, ss, ms, f, P) table <- rbind(table, colSums(table)) if (attr(object$terms, "intercept")){ table$ss[nrow(table)]<- table$ss[nrow(table)] - table$ss[1] } table$ms[nrow(table)]<-table$ss[nrow(table)]/table$df[nrow(table)] table[length(P):(length(P)+1), 4:5] <- NA dimnames(table) <- list(c(tlabels, "Error","Total"), c("Df","SS", "MS", "F", "P")) if (attr(object$terms, "intercept")){ table <- table[-1, ] table$MS[nrow(table)]<-table$MS[nrow(table)]*(table$Df[nrow(table)])/(table$Df[nrow(table)]-1) table$Df[nrow(table)]<-table$Df[nrow(table)]-1 } structure(table, heading = c("Analysis of Variance Table\n"), class = c("anova", "data.frame")) } ########################### ####TRATAMENTO DE DADOS#### dados <- read.delim("Trabalho10_Dados.txt", header= FALSE, sep="") #escolha dos dados pedidos no enunciado dados <- dados[,2:13] names(dados)<- c("Age", "Height", "Sex", "Survival", "Shock.Type", "Systolic.Pressure", "Mean.Arterial.Pressure", "Heart.Rate", "Diastolic.Pressure", "Mean.Central.Venous.Pressure", "Body.Surface.Area", "Cardiac.Index") rows <- nrow(dados) even_rows <- seq_len(rows) %% 2 dados_inicial <- dados[even_rows == 1,] rownames(dados_inicial) <- NULL #renumerar linhas nrows <- nrow(dados_inicial) #transformar qualitativas qualitativas <- c("Sex", "Survival", "Shock.Type") for (i in 1:ncol(dados_inicial)){ if (names(dados_inicial[i]) %in% qualitativas){ dados_inicial[,i] <- factor(dados_inicial[,i]) } } #formato correto dados_inicial$Mean.Central.Venous.Pressure <- dados_inicial$Mean.Central.Venous.Pressure * 10^-1 dados_inicial$Body.Surface.Area <- dados_inicial$Body.Surface.Area * 10^-2 dados_inicial$Cardiac.Index <- dados_inicial$Cardiac.Index * 10^-2 ###ANALISE DE DADOS### summary(dados_inicial) #box.plot continuas dados_cont <- dados_inicial[ ,-c(match(qualitativas, names(dados_inicial)))] par(mfrow = c(3, ncol(dados_cont)/3), mar=c(2,2,2,2), cex=0.5) lapply(1:ncol(dados_cont), function(i) boxplot(dados_cont[,i], main=names(dados_cont)[i])) #correlações a <- cor(data.matrix(dados_cont)) dev.off() corrplot(a, method = 'color', addCoef.col = 'black', number.cex = 0.45, tl.cex = 0.5, tl.col="black") #variavel resposta y <- matrix(dados_inicial$Cardiac.Index) dados_inicial <- dados_inicial[,-ncol(dados_inicial)] dados_cont <- dados_cont[-length(dados_cont)] #eliminar Mean Arterial Pressure vifstep(dados_cont, th=10) dados_inicial <- dados_inicial[, -which(colnames(dados_inicial) %in% c("Mean.Arterial.Pressure"))] #analisar correlacoes das categoricas??? #separar dados de treino e de teste set.seed(73) teste_ind<-sort(sample(nrows,0.2*nrows)) #Treino dados_treino <-dados_inicial[-teste_ind,] y_treino <- y[-teste_ind] summary(dados_treino) #Teste dados_teste <- dados_inicial[teste_ind,] y_teste <- y[teste_ind] ####ANALISE PRELIMINAR DO MODELO COMPLETO#### #modelo completo com covariaveis eliminadas mrl.comp <- mrl.comp <-lm(y_treino ~ .,data=dados_treino) plot(mrl.comp) summary(mrl.comp) extractAIC(mrl.comp) gvlma(mrl.comp) #gráfico de previsões pred_comp <- predict(mrl.comp, newdata = dados_teste) d<-data.frame(pred_comp, y=y_teste) ggplot(d, aes(pred_comp, y)) + geom_point(shape = 16, size = 3, show.legend = FALSE) + geom_abline(intercept = 0, slope = 1, color = "red") #y não cumpre normalidade shapiro.test(y_treino) #p-value = 5.896e-05 #boxcox para por y normal b <- boxcox(lm(y_treino ~ 1)) lambda <- b$x[which.max(b$y)] #lambda=0.3434343... y_treino2 <- (y_treino^lambda-1)/lambda shapiro.test(y_treino2) #p-value = 0.5483 > 0.25 y_teste2 <- (y_teste^lambda-1)/lambda #usa-se o mesmo lambda para que treino e teste tenham o mesmo "significado" shapiro.test(y_teste2) #p-value = 0.9791 > 0.25 #plot(lm(y_treino2 ~.,data=dados_treino)) ##outliers y boxplot(y_treino2) boxplot(y_treino2)$out #modelo com y_treino normal mrl.comp2 <-lm(y_treino2 ~ .,data=dados_treino) #plot(mrl.comp2) #mais proximo de normal summary(mrl.comp2) extractAIC(mrl.comp2) gvlma(mrl.comp2) #grafico de previsoes pred_comp2 <- predict(mrl.comp2, newdata = dados_teste) d<-data.frame(pred_comp2, y=y_teste2) ggplot(d, aes(pred_comp2, y)) + geom_point(shape = 16, size = 3, show.legend = FALSE) + geom_abline(intercept = 0, slope = 1, color = "red") ###stepforward### #1 - sem interações mrl.base <-lm(y_treino2~1, data=dados_treino) mrl.stepforward <- step(mrl.base, scope = list(upper = formula(mrl.comp2), lower = formula(mrl.base)), direction = "forward", trace="FALSE") summary(mrl.stepforward) extractAIC(mrl.stepforward) anova(mrl.stepforward, mrl.comp2) formula(mrl.stepforward) #2- int 2 a 2 mrl.int2 <- lm(y_treino2 ~ (Shock.Type + Sex + Diastolic.Pressure + Systolic.Pressure + Age + Body.Surface.Area)^2, data = dados_treino) mrl.stepforward2 <- step(mrl.stepforward, scope = list(upper = formula(mrl.int2), lower = formula(mrl.stepforward)), direction = "forward", trace="FALSE") summary(mrl.stepforward2) extractAIC(mrl.stepforward2) anova(mrl.stepforward2, mrl.int2) formula(mrl.stepforward2) #testar se se considera a única interacao 3 a 3 possivel (nao) mrl.int3 <- update(mrl.stepforward2, .~. + Sex:Body.Surface.Area:Diastolic.Pressure) extractAIC(mrl.int3) #menor do que o do stepforward2, logo nao se considera a interacao 3 a 3 ###stepbackward### #1 - sem interacoes mrl.stepbackward <- step(mrl.comp2, direction = "backward", trace=FALSE) summary(mrl.stepbackward) extractAIC(mrl.stepbackward) anova(mrl.stepbackward, mrl.comp2) formula(mrl.stepbackward) #tem as mesmas variaveis que stepforward #2 - interacoes 2 a 2 mrl.stepbackward2 <- step(mrl.int2, direction = "backward", trace=FALSE) summary(mrl.stepbackward2) extractAIC(mrl.stepbackward2) anova(mrl.stepbackward2, mrl.int2) formula(mrl.stepbackward2) #testar se se considera a única interacao 3 a 3 possivel (nao) mrl.intb3 <- update(mrl.stepbackward2, .~. + Sex:Body.Surface.Area:Diastolic.Pressure) extractAIC(mrl.intb3) #menor do que o do stepbackward2, logo nao se considera a interacao 3 a 3 ###bestsubset### #bestsubset com r^2 ajustado bestsubsetsR <- leaps(dados_treino[,-c(3,4,5)], y_treino2, method="adjr2") best.fitR<-bestsubsetsR$which[which.max(bestsubsetsR$adjr2),] best.fitR<-which(best.fitR) best.fitR <- ifelse(best.fitR > 2, best.fitR + 3, best.fitR) best.fitR<-c(3,4,5,best.fitR) mrl.bestR <- lm(y_treino2 ~ .,data=dados_treino[,best.fitR]) summary(mrl.bestR) extractAIC(mrl.bestR) anova(mrl.bestR, mrl.comp2) formula(mrl.bestR) #escolhemos o stepforward2 ##OUTLIERS e ptos influentes outlierTest(mrl.stepforward2) influencePlot(mrl.stepforward2) ##VALIDACAO ##teste das hipoteses iniciais gvlma(mrl.stepforward2) anova_reg(mrl.stepforward2) plot(mrl.stepforward2) ncvTest(mrl.stepforward2) shapiro.test(mrl.stepforward2$residuals) durbinWatsonTest(mrl.stepforward2) plot(mrl.stepforward2$residuals, type = "l") #gráfico de previsões - desfazendo boxcox (???) pred_final <- predict(mrl.stepforward2, newdata = dados_teste) d<-data.frame(pred_comp, y=y_teste) ggplot(d, aes(pred_comp, y)) + geom_point(shape = 16, size = 3, show.legend = FALSE) + geom_abline(intercept = 0, slope = 1, color = "red")
Editor is loading...