# ---------------------------------------------------------------------------
# Scatterplot-matrix panels (intended as panel functions for pairs())
# ---------------------------------------------------------------------------

# Histogram diagonal panel for a scatterplot matrix.
#
# Arguments:
#   x   : numeric vector with the variable displayed in the panel.
#   ... : further graphical arguments forwarded to rect().
myPanel.hist <- function(x, ...) {
  usr <- par("usr")
  # The parameter must be named to be restored; an unnamed numeric vector is
  # rejected by par() with a warning and nothing is reset.
  on.exit(par(usr = usr), add = TRUE)
  # Keep the x range of the panel, rescale the y axis to [0, 1.5]
  par(usr = c(usr[1:2], 0, 1.5))
  # Class breaks and counts, without plotting
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  nB <- length(breaks)
  # Bar heights relative to the tallest bar
  y <- h$counts / max(h$counts)
  rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}

# Boxplot diagonal panel for a scatterplot matrix.
#
# Draws the box (hinges and median) and the whiskers computed by boxplot();
# points beyond the whiskers are not drawn.
#
# Arguments:
#   x   : numeric vector with the variable displayed in the panel.
#   ... : accepted for compatibility with pairs(); not used.
myPanel.box <- function(x, ...) {
  old_par <- par(c("usr", "bty"))
  on.exit(par(old_par), add = TRUE)
  par(
    bty = "n",
    usr = c(-1, 1, min(x, na.rm = TRUE) - 0.5, max(x, na.rm = TRUE) + 0.5)
  )
  # Rows of $stats: lower whisker, lower hinge, median, upper hinge,
  # upper whisker
  five <- boxplot(x, plot = FALSE)$stats[, 1]
  whisker.i <- five[1]
  hinge.i <- five[2]
  mediana <- five[3]
  hinge.s <- five[4]
  whisker.s <- five[5]
  # Lower half of the box and lower whisker
  rect(-0.5, hinge.i, 0.5, mediana, col = "gray")
  segments(0, hinge.i, 0, whisker.i, lty = 2)
  segments(-0.1, whisker.i, 0.1, whisker.i)
  # Upper half of the box and upper whisker
  rect(-0.5, mediana, 0.5, hinge.s, col = "gray")
  segments(0, hinge.s, 0, whisker.s, lty = 2)
  segments(-0.1, whisker.s, 0.1, whisker.s)
}

# Correlation panel for a scatterplot matrix: prints cor(x, y) in the middle
# of the panel with a text size that grows with |r|.
#
# Arguments:
#   x, y    : numeric vectors.
#   digits  : significant digits used to format the correlation.
#   prefix  : text prepended to the formatted correlation.
#   cex.cor : accepted for backward compatibility; the text size is always
#             1 + 1.5 * |r|.
#   ...     : accepted (and ignored) so that extra arguments forwarded by
#             pairs() do not raise an "unused argument" error.
myPanel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  usr <- par("usr")
  on.exit(par(usr = usr), add = TRUE)
  par(usr = c(0, 1, 0, 1))
  r <- cor(x, y)
  # Formatting together with a reference value fixes the number of decimals
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  text(0.5, 0.5, txt, cex = 1 + 1.5 * abs(r))
}

# ---------------------------------------------------------------------------
# Residual diagnostics and summary tables
# ---------------------------------------------------------------------------

# Normal Q-Q plot of ordinary or standardized residuals, annotated with the
# Shapiro-Wilk test results.
#
# Arguments:
#   modelo  : fitted model accepted by residuals() / rstandard().
#   student : if TRUE use rstandard(modelo), otherwise residuals(modelo).
#   ...     : further arguments forwarded to qqnorm().
myQQnorm <- function(modelo, student = FALSE, ...) {
  if (student) {
    res <- rstandard(modelo)
    lab.plot <- "Normal Q-Q Plot of Studentized Residuals"
  } else {
    res <- residuals(modelo)
    lab.plot <- "Normal Q-Q Plot of Residuals"
  }
  shapiro <- shapiro.test(res)
  shapvalue <- if (shapiro$p.value < 0.001) {
    "P value < 0.001"
  } else {
    paste0("P value = ", round(shapiro$p.value, 4))
  }
  shapstat <- paste0("W = ", round(shapiro$statistic, 4))
  # qqnorm() draws the plot and returns the plotted coordinates
  q <- qqnorm(res, main = lab.plot, ...)
  qqline(res, lty = 2, col = 2)
  # Annotations anchored at the left edge, stacked below the largest quantile
  x_left <- min(q$x, na.rm = TRUE)
  y_top <- max(q$y, na.rm = TRUE)
  text(x_left, y_top * 0.95, pos = 4, "Shapiro-Wilk Test",
       col = "blue", font = 2)
  text(x_left, y_top * 0.80, pos = 4, shapstat, col = "blue", font = 3)
  text(x_left, y_top * 0.65, pos = 4, shapvalue, col = "blue", font = 3)
}

# Table of summary statistics (root MSE, R-squared, adjusted R-squared).
#
# Arguments:
#   lm.model : fitted lm object.
# Returns a one-row data frame formatted with 6 significant digits.
mySumStats <- function(lm.model) {
  stats <- summary(lm.model)
  result <- data.frame(
    Root_MSE = stats$sigma,
    R_square = stats$r.squared,
    Adj_R_square = stats$adj.r.squared,
    row.names = ""
  )
  format(result, digits = 6)
}

# Estimated and standardized coefficients, their 95% confidence limits and
# variance inflation factors.
#
# Arguments:
#   lm.model : fitted lm object with an intercept.
#   dataset  : data frame with the model variables; every column is passed
#              through scale(), so all of them must be numeric.
# Returns a data frame with one row per coefficient (the intercept row gets 0
# for the standardized coefficient and the VIF).
# NOTE(review): vif() is not defined in this file; presumably car::vif, which
# must be attached by the caller -- confirm.
myCoefficients <- function(lm.model, dataset) {
  coeff <- coef(lm.model)
  scaled.data <- as.data.frame(scale(dataset))
  # Same model without intercept, fitted on the standardized variables
  std.formula <- update(formula(lm.model), ~ . + 0)
  coef.std <- c(0, coef(lm(std.formula, data = scaled.data)))
  limites <- confint(lm.model, level = 0.95)
  vifs <- c(0, vif(lm.model))
  result <- data.frame(Estimation = coeff, Coef.Std = coef.std,
                       Limits = limites, Vif = vifs)
  names(result)[3:4] <- c("Limit_2.5%", "Limit_97.5%")
  cat("Estimated and standardized coefficients, their 95% CI's and VIF's",
      "\n")
  result
}

# Analysis of variance table (overall regression versus error).
#
# Arguments:
#   lm.model : fitted lm object with at least one predictor term.
# Returns a two-row data frame ("Model", "Error") of formatted strings.
# The model degrees of freedom are the sum of the Df of all terms, so terms
# with more than one degree of freedom (e.g. factors) are handled correctly.
myAnova <- function(lm.model) {
  tab <- anova(lm.model)
  ss <- tab[["Sum Sq"]]
  dfs <- tab[["Df"]]
  n_rows <- length(ss)
  if (n_rows < 2L) {
    stop("the model must contain at least one predictor term", call. = FALSE)
  }
  # The last row of the sequential ANOVA table is the residual row
  term_rows <- seq_len(n_rows - 1L)
  SSR <- sum(ss[term_rows])
  SSE <- ss[n_rows]
  df.model <- sum(dfs[term_rows])
  df.error <- dfs[n_rows]
  MSR <- SSR / df.model
  MSE <- SSE / df.error
  F0 <- MSR / MSE
  PV <- pf(F0, df.model, df.error, lower.tail = FALSE)
  data.frame(
    Sum_of_Squares = format(c(SSR, SSE), digits = 6),
    DF = format(c(df.model, df.error), digits = 6),
    Mean_Square = format(c(MSR, MSE), digits = 6),
    F_Value = c(format(F0, digits = 6), ""),
    P_value = c(format(PV, digits = 6), ""),
    row.names = c("Model", "Error")
  )
}

# Diagnostics table for leverage and influence observations.
#
# Adapted from stats::influence.measures() with different cut-offs:
#   |dfbetas| > 2 / sqrt(n), |dffit| > 2 * sqrt(k / n),
#   |1 - cov.r| > 3 * p / n (only when covr = TRUE), cook.d > 1,
#   hat > 2 * p / n,
# where n is the number of cases with positive leverage, k the number of
# dfbetas columns and p the rank of the model.
#
# Arguments:
#   model : fitted lm (or glm) object.
#   infl  : result of influence(model).
#   covr  : if TRUE the covariance ratio column is included.
# Returns an object of class "infl": list(infmat, is.inf, call).
myInfluence <- function(model, infl = influence(model), covr = FALSE) {
  p <- model$rank

  # Flags the entries of `infmat` that exceed the cut-offs listed above and
  # returns a logical object shaped like `infmat`.
  is.influential <- function(infmat, n, covr = FALSE) {
    d <- dim(infmat)
    n_extra <- if (covr) 4L else 3L
    k <- d[[length(d)]] - n_extra
    if (n <= k) stop("too few cases i with h_ii > 0), n < k")
    is_mat <- is.matrix(infmat)
    # Measures live in the last dimension: columns of a matrix, slices of
    # the 3-d array built for multivariate (mlm) fits.
    pick <- if (is_mat) {
      function(m, j) m[, j]
    } else {
      function(m, j) m[, , j]
    }
    absmat <- abs(infmat)
    flags <- list(
      pick(absmat, seq_len(k)) > 2 / sqrt(n),
      pick(absmat, k + 1L) > 2 * sqrt(k / n)
    )
    if (covr) {
      flags <- c(flags, list(abs(1 - pick(infmat, k + 2L)) > 3 * p / n))
    }
    # cook.d and hat are always the last two measures
    flags <- c(flags, list(
      pick(infmat, k + n_extra - 1L) > 1,
      pick(infmat, k + n_extra) > 2 * p / n
    ))
    r <- do.call(if (is_mat) cbind else c, flags)
    attributes(r) <- attributes(infmat)
    r
  }

  e <- weighted.residuals(model)
  s <- sqrt(sum(e^2, na.rm = TRUE) / df.residual(model))
  mqr <- qr(model)
  xxi <- chol2inv(mqr$qr, mqr$rank)
  si <- infl$sigma
  h <- infl$hat
  is.mlm <- is.matrix(e)
  cf <- if (is.mlm) {
    aperm(infl$coefficients, c(1L, 3:2))
  } else {
    infl$coefficients
  }
  dfb <- cf / outer(infl$sigma, sqrt(diag(xxi)))
  vn <- variable.names(model)
  vn[vn == "(Intercept)"] <- "1_"
  dimnames(dfb)[[length(dim(dfb))]] <- paste0("dfb.", abbreviate(vn))
  dff <- e * sqrt(h) / (si * (1 - h))
  if (any(ii <- is.infinite(dff))) dff[ii] <- NaN
  if (covr) cov.ratio <- (si / s)^(2 * p) / (1 - h)
  cooks.d <- if (inherits(model, "glm")) {
    (infl$pear.res / (1 - h))^2 * h / (summary(model)$dispersion * p)
  } else {
    ((e / (s * (1 - h)))^2 * h) / p
  }
  infmat <- if (is.mlm) {
    # NOTE(review): layout follows stats::influence.measures(); the
    # multivariate path has not been exercised here -- confirm before use.
    extra <- c("dffit", if (covr) "cov.r", "cook.d", "hat")
    dns <- dimnames(dfb)
    dns[[3]] <- c(dns[[3]], extra)
    a <- array(dfb, dim = dim(dfb) + c(0L, 0L, length(extra)),
               dimnames = dns)
    a[, , "dffit"] <- dff
    if (covr) a[, , "cov.r"] <- cov.ratio
    a[, , "cook.d"] <- cooks.d
    a[, , "hat"] <- h
    a
  } else if (covr) {
    cbind(dfb, dffit = dff, cov.r = cov.ratio, cook.d = cooks.d, hat = h)
  } else {
    cbind(dfb, dffit = dff, cook.d = cooks.d, hat = h)
  }
  infmat[is.infinite(infmat)] <- NaN
  # `covr` must be forwarded, otherwise the cut-offs hit the wrong columns
  is.inf <- is.influential(infmat, sum(h > 0), covr = covr)
  ans <- list(infmat = infmat, is.inf = is.inf, call = model$call)
  class(ans) <- "infl"
  ans
}

# Collinearity diagnostics: eigenvalues, condition indexes and variance
# decomposition proportions.
#
# Arguments:
#   lm.model : fitted lm object.
#   center   : if FALSE the intercept column is part of the analysis; if TRUE
#              the intercept is removed and the predictors are centered.
# Prints a header and returns the diagnostics as a data frame.
# NOTE(review): colldiag() is not defined in this file; presumably
# perturb::colldiag, which must be attached by the caller -- confirm.
myCollinDiag <- function(lm.model, center = FALSE) {
  if (!center) {
    X <- model.matrix(lm.model)
    cond.idx <- colldiag(lm.model)
    header <- "Collinearity Diagnostics"
  } else {
    X <- model.matrix(lm.model)[, -1, drop = FALSE]
    cond.idx <- colldiag(lm.model, center = TRUE, scale = TRUE)
    header <- "Collinearity Diagnostics (intercept adjusted)"
  }
  eigen_values <- prcomp(X, center = center, scale. = TRUE)$sdev^2
  cond.idx$pi <- round(cond.idx$pi, 6)
  result <- data.frame(Eigen_Value = format(eigen_values, digits = 5),
                       Condition_Index = cond.idx$condindx,
                       cond.idx$pi)
  if (!center) {
    names(result)[2:3] <- c("Condition_Index", "Intercept")
  } else {
    names(result)[2] <- "Condition_Index"
  }
  # cat() separates the empty strings with blanks, which pads the second
  # header line up to the variance proportion columns
  pad <- rep("", 3 + sum(nchar(names(result)[1:2])))
  cat(header, "\n", pad, "Variance Decomposition Proportions", "\n")
  result
}

# ---------------------------------------------------------------------------
# Model selection
# ---------------------------------------------------------------------------

# Private helper: one label per row of a logical inclusion matrix, listing the
# names of the predictors flagged in that row.
.subset_labels <- function(which_mat, predictor_names) {
  apply(which_mat, 1,
        function(sel) as.character(paste(predictor_names[sel],
                                         collapse = " ")))
}

# All possible regressions table.
#
# Arguments:
#   lm.model : fitted lm object with an intercept.
#   response : response vector (defaults to the response of lm.model).
#   MSE      : if TRUE report the MSE of every subset instead of its SSE.
# Returns a formatted data frame with one row per subset of predictors.
# NOTE(review): regsubsets() is not defined in this file; presumably
# leaps::regsubsets, which must be attached by the caller -- confirm.
myAllRegTable <- function(lm.model,
                          response = model.response(model.frame(lm.model)),
                          MSE = FALSE) {
  # drop = FALSE keeps a one-column matrix when there is a single predictor
  X <- model.matrix(lm.model)[, -1, drop = FALSE]
  regTable <- summary(regsubsets(X, response,
                                 nbest = 2^(lm.model$rank - 1) - 1,
                                 really.big = TRUE))
  in_model <- regTable$which[, -1, drop = FALSE]
  pvCount <- as.vector(apply(in_model, 1, sum))
  pvIDs <- .subset_labels(in_model, colnames(X))
  result <- if (MSE) {
    data.frame(k = pvCount,
               R_sq = round(regTable$rsq, 3),
               adj_R_sq = round(regTable$adjr2, 3),
               MSE = round(regTable$rss / (nrow(X) - (pvCount + 1)), 3),
               Cp = round(regTable$cp, 3),
               Variables_in_model = pvIDs)
  } else {
    data.frame(k = pvCount,
               R_sq = round(regTable$rsq, 3),
               adj_R_sq = round(regTable$adjr2, 3),
               SSE = round(regTable$rss, 3),
               Cp = round(regTable$cp, 3),
               Variables_in_model = pvIDs)
  }
  format(result, digits = 6)
}

# Summary table and plot of the best model of each size, by criterion.
# NOTE(review): leaps() is not defined in this file; presumably leaps::leaps,
# which must be attached by the caller -- confirm.

# Cp criterion.
#
# Arguments:
#   lm.model : fitted lm object with an intercept.
#   response : response vector (defaults to the response of lm.model).
# Plots Cp against the number of parameters p (with the line Cp = p) and
# prints the table of best models; the table is returned invisibly by print().
myCp_criterion <- function(lm.model,
                           response = model.response(model.frame(lm.model))) {
  X <- model.matrix(lm.model)[, -1, drop = FALSE]
  # Best model for each number of parameters
  Cp <- leaps(X, response, method = "Cp", nbest = 1)
  var_in_model <- .subset_labels(Cp$which, colnames(X))
  Cp_result <- data.frame(k = Cp$size - 1, p = Cp$size, Cp = Cp$Cp,
                          Variables.in.model = var_in_model)
  plot(Cp$size, Cp$Cp, type = "b", xlab = "p", ylab = "", xaxt = "n",
       cex = 2, ylim = c(0, max(Cp$Cp)), las = 1)
  axis(1, at = Cp$size, labels = Cp$size)
  mtext("Cp", 2, las = 1, adj = 3)
  abline(a = 0, b = 1, lty = 2, col = 2)
  cat("Models are Indexed in rows", "\n")
  print(Cp_result, row.names = FALSE)
}

# R2 criterion.
#
# Arguments:
#   lm.model : fitted lm object with an intercept.
#   response : response vector (defaults to the response of lm.model).
# Plots R2 against the number of parameters p and prints the table of best
# models; the table is returned invisibly by print().
myR2_criterion <- function(lm.model,
                           response = model.response(model.frame(lm.model))) {
  X <- model.matrix(lm.model)[, -1, drop = FALSE]
  # Best model for each number of parameters
  R2 <- leaps(X, response, method = "r2", nbest = 1)
  var_in_model <- .subset_labels(R2$which, colnames(X))
  R2_result <- data.frame(k = R2$size - 1, p = R2$size, R2 = R2$r2,
                          Variables.in.model = var_in_model)
  plot(R2$size, R2$r2, type = "b", xlab = "p", ylab = "", xaxt = "n",
       cex = 2, las = 1)
  axis(1, at = R2$size, labels = R2$size)
  mtext("R2", 2, las = 1, adj = 4)
  cat("Models are Indexed in rows", "\n")
  print(R2_result, row.names = FALSE)
}

# Adjusted R2 criterion.
#
# Arguments:
#   lm.model : fitted lm object with an intercept.
#   response : response vector (defaults to the response of lm.model).
# Plots adjusted R2 against the number of parameters p and prints the table
# of best models; the table is returned invisibly by print().
myAdj_R2_criterion <- function(lm.model,
                               response =
                                 model.response(model.frame(lm.model))) {
  X <- model.matrix(lm.model)[, -1, drop = FALSE]
  # Best model for each number of parameters
  adjR2 <- leaps(X, response, method = "adjr2", nbest = 1)
  var_in_model <- .subset_labels(adjR2$which, colnames(X))
  adjR2_result <- data.frame(k = adjR2$size - 1, p = adjR2$size,
                             adjR2 = adjR2$adjr2,
                             Variables.in.model = var_in_model)
  plot(adjR2$size, adjR2$adjr2, type = "b", xlab = "p", ylab = "",
       xaxt = "n", cex = 2, las = 1)
  axis(1, at = adjR2$size, labels = adjR2$size)
  mtext("adj_R2", 2, las = 1, adj = 2.2)
  cat("Models are Indexed in rows", "\n")
  print(adjR2_result, row.names = FALSE)
}

# Stepwise linear regression using F tests of significance, based on the
# function developed by Paul A. Rubin (rubin@msu.edu)
# URL = https://orinanobworld.blogspot.com/2011/02/stepwise-regression-in-r.html
#
# Arguments:
#   full.model     : model containing all possible terms.
#   alpha.to.enter : a term may enter when its F-test p-value is BELOW this
#                    level.
#   alpha.to.leave : a term may be deleted when its F-test p-value is ABOVE
#                    this level.
#   initial.model  : first model to consider. By default the first model is
#                    the one without predictors.
# Prints the coefficient table and fit statistics of every visited model and
# returns the final lm fit.
# NOTE(review): the loop has no iteration cap; it looks like choosing
# alpha.to.enter > alpha.to.leave could let a term be added and dropped
# repeatedly -- confirm.
myStepwise <- function(full.model, alpha.to.enter, alpha.to.leave,
                       initial.model =
                         lm(model.response(model.frame(full.model)) ~ 1)) {
  # An lm object is used as is; anything else is fitted first
  full <- if (inherits(full.model, "lm")) full.model else lm(full.model)
  # The candidate models are refitted from formulas whose predictors are
  # looked up on the search path, so the predictor columns are attached for
  # the duration of the call. A fixed name plus on.exit() guarantees they are
  # detached again, also when an error interrupts the search.
  predictors <- as.data.frame(model.matrix(full.model)[, -1, drop = FALSE])
  attach_name <- "myStepwise:predictors"
  attach(predictors, name = attach_name, warn.conflicts = FALSE)
  on.exit(detach(attach_name, character.only = TRUE), add = TRUE)
  # MSE of the full model and sample size
  msef <- summary(full)$sigma^2
  n <- length(full$residuals)
  # This is the current model
  current <- lm(initial.model)
  # Process each model until no term can be dropped or added
  repeat {
    temp <- summary(current)
    # Write the model description
    print(temp$coefficients)
    # Current model's size and MSE
    p <- nrow(temp$coefficients)
    mse <- temp$sigma^2
    # Mallows' Cp
    cp <- (n - p) * mse / msef - (n - 2 * p)
    fit <- sprintf("\nS = %f, R-sq = %f, R-sq(adj) = %f, C-p = %f",
                   temp$sigma, temp$r.squared, temp$adj.r.squared, cp)
    write(fit, file = "")
    write("=====", file = "")
    # Don't try to drop a term if only one is left
    if (p > 1) {
      d <- drop1(current, test = "F")
      # Row 1 is "<none>" (p-value NA); column 6 is Pr(>F)
      p_drop <- d[-1, 6]
      if (max(p_drop) > alpha.to.leave) {
        # The first of the tied worst terms is dropped
        term <- rownames(d)[-1][which.max(p_drop)]
        write(paste("--- Dropping", term, "\n"), file = "")
        f <- update(formula(current), as.formula(paste(". ~ . -", term)))
        current <- lm(f)
        next
      }
    }
    # add1() throws an error if nothing can be added (current == full);
    # that is deliberately treated as "stop searching"
    a <- tryCatch(add1(current, scope = full.model, test = "F"),
                  error = function(e) NULL)
    if (is.null(a) || nrow(a) < 2L) {
      break
    }
    p_add <- a[-1, 6]
    if (min(p_add) < alpha.to.enter) {
      # The first of the tied best terms is added
      term <- rownames(a)[-1][which.min(p_add)]
      write(paste("+++ Adding", term, "\n"), file = "")
      f <- update(formula(current), as.formula(paste(". ~ . +", term)))
      current <- lm(f)
      next
    }
    # No change could be made to the model
    break
  }
  current
}

# Backward elimination using F tests of significance, based on the function
# developed by Joris Meys
# URL = https://codeday.me/es/qa/20190117/101609.html
#
# Arguments:
#   base.full      : data frame (Y, X1, ...); the first column is the
#                    response and the remaining columns are the predictors.
#   alpha.to.leave : elimination stops once the least significant term has an
#                    F-test p-value below this level.
#   verbose        : if TRUE, prints the F tests and the dropped term at
#                    every step.
# Returns the final lm fit, or NULL (with a warning) when every term has been
# removed.
myBackward <- function(base.full, alpha.to.leave = 0.05, verbose = TRUE) {
  if (!is.data.frame(base.full)) {
    stop("'base.full' must be a data frame", call. = FALSE)
  }

  # Checks whether x is part of any term in `terms` (a vector with the names
  # of the terms of a model); developed by Joris Meys.
  has.interaction <- function(x, terms) {
    parts <- strsplit(x, ":")[[1]]
    # vapply() returns logical(0) for an empty `terms`; sapply() would return
    # list() and break the reduction below
    out <- vapply(terms,
                  function(term) all(parts %in% strsplit(term, ":")[[1]]),
                  logical(1))
    any(out)
  }

  counter <- 1
  # Fitting with data = base.full lets update() refit without attach()
  model <- lm(formula(base.full), data = base.full)
  terms <- attr(model$terms, "term.labels")
  # Set scopevars to all terms
  scopevars <- terms
  # Backward model selection
  repeat {
    # Extract the test statistics from drop1
    test <- drop1(model, scope = scopevars, test = "F")
    if (verbose) {
      cat("-------------STEP ", counter, "-------------\n",
          "The drop statistics : \n")
      print(test)
    }
    pval <- test[, dim(test)[2]]
    names(pval) <- rownames(test)
    # The "<none>" row carries no test
    pval <- pval[names(pval) != "<none>"]
    # Checked before sorting because sort() silently removes NA values
    if (anyNA(pval)) {
      stop("Model is invalid. Check if all coefficients are estimated.")
    }
    pval <- sort(pval, decreasing = TRUE)
    # Stop if all remaining terms are significant
    if (pval[1] < alpha.to.leave) {
      break
    }
    # Select the term to drop: the least significant one that is not part of
    # another term still in the model
    candidates <- names(pval)
    droppable <- vapply(
      candidates,
      function(term) !has.interaction(term, setdiff(terms, term)),
      logical(1)
    )
    if (!any(droppable)) {
      break
    }
    i <- which(droppable)[1]
    dropvar <- candidates[i]
    # Stop if the term to remove is significant
    if (pval[i] < alpha.to.leave) {
      break
    }
    if (verbose) {
      cat("\n--------\nTerm dropped in step", counter, ":", dropvar,
          "\n--------\n\n")
    }
    # Update terms, scopevars and model
    scopevars <- scopevars[-match(dropvar, scopevars)]
    terms <- terms[-match(dropvar, terms)]
    formul <- as.formula(paste(".~.-", dropvar))
    model <- update(model, formul)
    if (length(scopevars) == 0) {
      warning("All variables are thrown out of the model.\n",
              "No model could be specified.")
      return(NULL)
    }
    counter <- counter + 1
  }
  model
}