desctab by Daisung Jang

desctab generates a table that combines descriptive statistics with a correlation matrix, as often used in behavioural science papers.

Given a dataframe, desctab selects the numeric variables, and then generates an excel file that contains Ns, means, and SDs for each variable as well as a correlation matrix.

Suggestions for improvement are welcome:

########################### desctab by Daisung Jang with code borrowed from all over the internet (attributions where I've remembered them) provided without warranties may it help your project version 0.7
desctab <- function(x, y) {
    
    if (missing(y)) {
        y = "descriptives.xlsx"
    } else {
    }
    
    x <- Filter(is.numeric, x)
    
    # borrows heavily from the corstarsl function http://myowelt.blogspot.com/2008/04/beautiful-correlation-tables-in-r.html
    
    require(Hmisc)
    require(openxlsx)
    x <- as.matrix(x)
    R <- rcorr(x)$r
    p <- rcorr(x)$P
    
    ## define notions for significance levels; spacing is important.
    mystars <- ifelse(p < 0.01, "** ", ifelse(p < 0.05, "* ", ifelse(p < 0.1, "† ", " ")))
    ## trunctuate the matrix that holds the correlations to two decimal
    R <- format(round(cbind(rep(-1.11, ncol(x)), R), 2))[, -1]
    
    ## build a new matrix that includes the correlations with their appropriate stars
    Rnew <- matrix(paste(R, mystars, sep = ""), ncol = ncol(x))
    diag(Rnew) <- paste(diag(R), " ", sep = "")
    rownames(Rnew) <- colnames(x)
    colnames(Rnew) <- paste(colnames(x), "", sep = "")
    
    ## remove upper triangle
    Rnew <- as.matrix(Rnew)
    Rnew[upper.tri(Rnew, diag = TRUE)] <- ""
    Rnew <- as.data.frame(Rnew)
    
    ## create two versions of the table, one with just stars and another with just numbers
    Rnew <- cbind(Rnew[1:length(Rnew) - 1])
    temptab_star <- sapply(Rnew, function(x) gsub("[^\\*|^\\†]+", "", x))
    temptab_num <- sapply(Rnew, function(x) gsub("[^-|0-9|\\.|\\s]+", "", x))
    
    # create indexes for the desired order
    index <- order(c(1:ncol(temptab_num), 1:ncol(temptab_star)))  ## borrowed from here: https://stackoverflow.com/questions/24576548/merge-interleave-dataframes-in-r
    
    # cbind, interleaving columns
    cortab <- cbind(temptab_num, temptab_star)[, index]
    
    # generate column numbers
    colnum <- paste("(", seq(1:length(temptab_num[1, ])), ")", sep = "")
    
    colnames(cortab) <- c(rbind(colnum, rep("", ncol(temptab_star))))
    
    # Create a summary table and write to an Excel file, adapted from: https://stackoverflow.com/questions/50471794/sapply-retain-column-names
    
    sfsum = function(x) {
        mean = round(mean(x, na.rm = T), 2)
        sd = round(sd(x, na.rm = T), 2)
        n = sum(!is.na(x))
        return(c(N = n, Mean = mean, SD = sd))  #For column names
    }
    
    # Combine the two tables and add some useful notes.
    
    tab1 <- t(sapply(as.data.frame(x), sfsum, USE.NAMES = TRUE))  #USE.NAMES = TRUE to get names on top
    
    tab2 <- cbind(tab1, cortab)
    
    tab2 <- as.data.frame(tab2)
    
    tab2$Variables <- rownames(tab2)
    
    tab2 <- tab2[, c(which(colnames(tab2) == "Variables"), which(colnames(tab2) != "Variables"))]
    
    varnum <- paste(seq.int(nrow(tab2)), ". ", sep = "")
    
    tab2$Variables <- paste(varnum, tab2$Variables, sep = "")
    
    number <- rcorr(x)$n
    
    tabmin <- min(number)
    
    tabmax <- max(number)
    
    ns <- paste("The N for the correlation table ranges from ", tabmin, " to ", tabmax, ".", sep = "")
    
    note <- paste("Note. ", ns, " † p < .10, * p < .05, ** p < .01.", sep = "")
    
    noteline <- cbind(note, t(rep(NA, (ncol(tab2) - 1))))
    
    colnames(noteline) <- colnames(tab2)
    
    tab2 <- (rbind(tab2, noteline))
    
    wb <- createWorkbook()
    modifyBaseFont(wb, fontSize = 12, fontName = "Times New Roman")
    addWorksheet(wb, sheetName = "descriptives", gridLines = FALSE)
    
    l1 <- seq(5, ncol(tab2), by = 2)
    
    for (i in l1) {
        j = i + 1
        mergeCells(wb, 1, cols = i:j, rows = 1)
    }
    
    saveWorkbook(wb, y, overwrite = TRUE)
    
    rownames(tab2) <- c()
    
    excelcolnums <- c(rbind(colnum, rep("", ncol(temptab_star))))
    
    colnames(tab2) <- c("Variables", "N", "Mean", "SD", excelcolnums)
    
    return(tab2)
    
}

Example use

The first argument passed to desctab should be a dataframe. The second argument is an optional file name.

library(knitr)
data("iris")

iris_tab <- desctab(iris, "descriptives.xlsx")
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: openxlsx
kable(iris_tab)
Variables N Mean SD (1) (2) (3)
1. Sepal.Length 150 5.84 0.83
2. Sepal.Width 150 3.06 0.44 -0.12
3. Petal.Length 150 3.76 1.77 0.87 ** -0.43 **
4. Petal.Width 150 1.2 0.76 0.82 ** -0.37 ** 0.96 **
Note. The N for the correlation table ranges from 150 to 150. † p < .10, * p < .05, ** p < .01. NA NA NA NA NA NA NA NA NA