Paul Johnson
2012-Mar-20 15:24 UTC
[Rd] overriding "summary.default" or "summary.data.frame". How?
I suppose everybody who makes a package for the first time thinks "I can change anything!" and then runs into this same question. Has anybody written out information on how a package can override functions in R base in the R 2.14 (mandatory NAMESPACE era)? Suppose I want to alphabetize variables in a summary.data.frame, or return the standard deviation with the mean in summary output. I'm pasting in a working example below. It has new "summary.factor" method. It also has a function summarize that I might like to use in place of summary.data.frame. How would my new methods "drop on top" of R base functions? It appears my functions (summarizeFactors) can find my summary.factor, but R's own summary uses its own summary.factor. ## summarizeNumerics takes a data frame or matrix, scans the columns ## to select only the numeric variables. By default it alphabetizes ## the columns (use alphaSort = FALSE to stop that). It then ## calculates the quantiles for each variable, as well as the mean, ## standard deviation, and variance, and then packs those results into ## a matrix. The main benefits from this compared to R's default ## summary are 1) more summary information is returned for each ## variable, and the results are returned in a matrix that is easy to ## use in further analysis. summarizeNumerics <- function(dat, alphaSort = TRUE, digits = max(3, getOption("digits") - 3)){ if (!is.data.frame(dat)) dat <- as.data.frame(dat) nums <- sapply(dat, is.numeric) datn <- dat[ , nums, drop = FALSE] if (alphaSort) datn <- datn[ , sort(colnames(datn)), drop = FALSE] sumdat <- apply(datn, 2, stats::quantile, na.rm=TRUE) sumdat <- rbind(sumdat, mean= apply(datn, 2, mean, na.rm=TRUE)) sumdat <- rbind(sumdat, sd= apply(datn, 2, sd, na.rm=TRUE)) sumdat <- rbind(sumdat, var= apply(datn, 2, var, na.rm=TRUE)) sumdat <- rbind(sumdat, "NA's"=apply(datn, 2, function(x) sum(is.na(x)))) signif(sumdat, digits) } summary.factor <- function(y, numLevels) { ## 5 nested functions to be used later divr <- function(p=0){ ifelse ( p>0 & p < 1, -p * log2(p), 0) } entropy <- function(p){ sum ( divr(p) ) } maximumEntropy <- function(N) -log2(1/N) normedEntropy <- function(x) entropy(x)/ maximumEntropy(length(x)) nas <- is.na(y) y <- factor(y) ll <- levels(y) tbl <- table(y) tt <- c(tbl) names(tt) <- dimnames(tbl)[[1L]] o <- sort.list(tt, decreasing = TRUE) if (length(ll) > numLevels){ toExclude <- numLevels:length(ll) tt <- c(tt[o[-toExclude]], `(All Others)` = sum(tt[o[toExclude]]), `NA's`=sum(nas)) }else{ tt <- c(tt[o], `NA's`=sum(nas)) } props <- prop.table(tbl); tt <- c(tt, "Entropy"=entropy(props), "NormedEntropy"= normedEntropy(props)) } ## Takes a data frame or matrix, scans the columns to find the ## variables that are not numeric and keeps them. By default it ## alphabetizes them (alphaSort = FALSE to stop that). It then treats ## all non-numeric variables as if they were factors, and summarizes ## each in a say that I find useful. In particular, for each factor, ## it provides a table of the most frequently occurring values (the ## top "numLevels" values are represented). As a diversity indictor, ## it calculates the Entropy and NormedEntropy values for each ## variable. Note not all of this is original. It combines my code ## and R code from base/summary.R summarizeFactors <- function(dat = NULL, numLevels = 10, alphaSort TRUE, digits = max(3, getOption("digits") - 3)) { ##copies from R base::summary.R summary.data.frame ncw <- function(x) { z <- nchar(x, type="w") if (any(na <- is.na(z))) { # FIXME: can we do better z[na] <- nchar(encodeString(z[na]), "b") } z } if (!is.data.frame(dat)) dat <- as.data.frame(dat) ##treat any nonnumeric as a factor factors <- sapply(dat, function(x) {!is.numeric(x) }) ##If only one factor, need drop=FALSE. datf <- dat[ , factors, drop = FALSE] if (alphaSort) datf <- datf[ , sort(colnames(datf)), drop = FALSE] z <- lapply(datf, summary.factor, numLevels=numLevels) nv <- length(datf) nm <- names(datf) lw <- numeric(nv) nr <- max(unlist(lapply(z, NROW))) for(i in 1L:nv) { sms <- z[[i]] lbs <- format(names(sms)) sms <- paste(lbs, ":", format(sms, digits = digits), " ", sep = "") lw[i] <- ncw(lbs[1L]) length(sms) <- nr z[[i]] <- sms } z <- unlist(z, use.names=TRUE) dim(z) <- c(nr, nv) if(any(is.na(lw))) warning("probably wrong encoding in names(.) of column ", paste(which(is.na(lw)), collapse = ", ")) blanks <- paste(character(max(lw, na.rm=TRUE) + 2L), collapse = " ") pad <- floor(lw - ncw(nm)/2) nm <- paste(substring(blanks, 1, pad), nm, sep = "") dimnames(z) <- list(rep.int("", nr), nm) attr(z, "class") <- c("table") z } ## ## want to override summary.data.frame, but confusing. When ## will R find my summary.data.frame, when will it find the one in base. ## use ... for numLevels, digits, alphaSort summarize <- function(dat, ...) { dots <- list(...) dotnames <- names(dots) ## next should give c("digits", "alphaSort") nnames <- names(formals(summarizeNumerics))[-1L] ## names that need keeping if in dots: keepnames <- dotnames %in% nnames if( sum(keepnames) > 0 ) { argList <- modifyList( list("dat"=quote(dat)), dots[keepnames] ) datn <- do.call("summarizeNumerics", argList) } else { datn <- do.call("summarizeNumerics", args=list("dat"= quote(dat))) } ## all ... can go to summarizeFactors datf <- summarizeFactors(dat, ...) value <- list(numerics = datn, factors = datf) value } set.seed(23452345) x1 <- gl(12,2, labels=LETTERS[1:12]) x2 <- gl(8,3, labels=LETTERS[12:24]) z1 <- rnorm(24) a1 <- rnorm(24, mean=1.2, sd = 1.7) a2 <- rpois(24, lambda=10 + a1) a3 <- rgamma(24, 0.5, 4) b1 <- rnorm(24, mean=1.3, sd = 1.4) dat <- data.frame(z1, a1, x2, a2, x1, a3, b1) summary(dat) summarize(dat) summarizeNumerics(dat) summarizeFactors(dat, numLevels=5) summarize(dat, alphaSort=FALSE) summarize(dat, digits=6, alphaSort=FALSE) summarize(dat, digits=22, alphaSort=FALSE) summarize(dat, numLevels= 2) datsumm <- summarize(dat) datsumm$numerics datsumm[[1]] ## same: gets numerics datsumm$factors datsumm[[2]] ## Use numerics output to make plots. First, ## transpose gives varnames x summary stat matrix datsummNT <- t(datsumm$numerics) datsummNT <- as.data.frame(datsummNT) plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The Variances") plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The Variances", type="n") text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT)) ## Here's a little plot wrinkle. Note variable names are "out to the ## edge" of the plot. If names are longer they don't stay inside ## figure. See? ## Make the variable names longer rownames(datsummNT) rownames(datsummNT) <- c("boring var","var with long name", "tedious name var", "stupid varname", "buffoon not baboon") plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The Variances", type="n") text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT), cex=0.8) ## That's no good. Names across the edges ## We could brute force the names outside the edges like this par(xpd=T) text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT), cex=0.8) ## but that is not much better par(xpd=F) ## Here is one fix. Make the unused space inside the plot larger by ## making xlim and ylim bigger. I use the magRange function from ## rockchalk to easily expand range to 1.2 times its current size. ## otherwise, long variable names do not fit inside plot. magRange ## could be asymmetric if we want, but this use is symmetric. library(rockchalk) rownames(datsummNT) rownames(datsummNT) <- c("boring var","var with long name", "tedious name var", "stupid varname", "buffoon not baboon") plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The Variances", type="n", xlim=magRange(datsummNT$mean, 1.2), ylim=magRange(datsummNT$var, 1.2)) text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT), cex=0.8) ## Here's another little plot wrinkle. ## If we don't do that to keep the names in bounds, we need some ## fancy footwork. Note when a point is near the edge, I make sure ## the text prints toward the center of the graph. plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The Variances") ## calculate label positions. This is not as fancy as it could be. ## If there were lots of variables, we'd have to get smarter about ## positioning labels on above, below, left, or right. labelPos <- ifelse(datsummNT$mean - mean(datsummNT$mean, na.rm=T) > 0, 2, 4) text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT), cex=0.8, pos=labelPos) x <- data.frame(x=rnorm(100), y = gl(50,2), z = rep(1:4, 25), ab = gl(2,50)) summarize(x) summarize(x, numLevels=15) sumry <- summarize(x) sumry[[1]] ##another way to get the numerics output sumry[[2]] ##another way to get the factors output dat <- data.frame(x=rnorm(100), y = gl(50,2), z = factor(rep(1:4, 25), labels=c("A","B","C","D")), animal=factor(ifelse(runif(100)< 0.2, "cow", ifelse(runif(100) < 0.5,"pig","duck")))) summarize(dat) dat <- read.table(url("http://pj.freefaculty.org/guides/stat/DataSets/USNewsCollege/USNewsCollege.csv"), sep=",") colnames(dat) <- c("fice", "name", "state", "private", "avemath", "aveverb", "avecomb", "aveact", "fstmath", "trdmath", "fstverb", "trdverb", "fstact", "trdact", "numapps", "numacc", "numenr", "pctten", "pctquart", "numfull", "numpart", "instate", "outstate", "rmbrdcst", "roomcst", "brdcst", "addfees", "bookcst", "prsnl", "pctphd", "pctterm", "stdtofac", "pctdonat", "instcst", "gradrate") dat$private <- factor(dat$private, labels=c("public","private")) sumry <- summarize(dat, digits=2) sumry sumry[[1]] sumry[[2]] summarize(dat[ , c("fice","name","private","fstverb","avemath")], digits=4) -- Paul E. Johnson Professor, Political Science ? ?Assoc. Director 1541 Lilac Lane, Room 504 ? ? Center for Research Methods University of Kansas ? ? ? ? ? ? ? University of Kansas http://pj.freefaculty.org ? ? ? ? ? ?http://quant.ku.edu
Uwe Ligges
2012-Mar-21 17:26 UTC
[Rd] overriding "summary.default" or "summary.data.frame". How?
Simple answer: Never ever override R base functionality. Best, Uwe Ligges On 20.03.2012 16:24, Paul Johnson wrote:> I suppose everybody who makes a package for the first time thinks "I > can change anything!" and then runs into this same question. Has > anybody written out information on how a package can override > functions in R base in the R 2.14 (mandatory NAMESPACE era)? > > Suppose I want to alphabetize variables in a summary.data.frame, or > return the standard deviation with the mean in summary output. I'm > pasting in a working example below. It has new "summary.factor" > method. It also has a function summarize that I might like to use in > place of summary.data.frame. > > How would my new methods "drop on top" of R base functions? It > appears my functions (summarizeFactors) can find my summary.factor, > but R's own summary uses its own summary.factor. > > > ## summarizeNumerics takes a data frame or matrix, scans the columns > ## to select only the numeric variables. By default it alphabetizes > ## the columns (use alphaSort = FALSE to stop that). It then > ## calculates the quantiles for each variable, as well as the mean, > ## standard deviation, and variance, and then packs those results into > ## a matrix. The main benefits from this compared to R's default > ## summary are 1) more summary information is returned for each > ## variable, and the results are returned in a matrix that is easy to > ## use in further analysis. > summarizeNumerics<- function(dat, alphaSort = TRUE, digits = max(3, > getOption("digits") - 3)){ > if (!is.data.frame(dat)) dat<- as.data.frame(dat) > nums<- sapply(dat, is.numeric) > datn<- dat[ , nums, drop = FALSE] > if (alphaSort) datn<- datn[ , sort(colnames(datn)), drop = FALSE] > sumdat<- apply(datn, 2, stats::quantile, na.rm=TRUE) > sumdat<- rbind(sumdat, mean= apply(datn, 2, mean, na.rm=TRUE)) > sumdat<- rbind(sumdat, sd= apply(datn, 2, sd, na.rm=TRUE)) > sumdat<- rbind(sumdat, var= apply(datn, 2, var, na.rm=TRUE)) > sumdat<- rbind(sumdat, "NA's"=apply(datn, 2, function(x) sum(is.na(x)))) > signif(sumdat, digits) > } > > > summary.factor<- function(y, numLevels) { > ## 5 nested functions to be used later > > divr<- function(p=0){ > ifelse ( p>0& p< 1, -p * log2(p), 0) > } > entropy<- function(p){ > sum ( divr(p) ) > } > maximumEntropy<- function(N) -log2(1/N) > normedEntropy<- function(x) entropy(x)/ maximumEntropy(length(x)) > nas<- is.na(y) > y<- factor(y) > ll<- levels(y) > tbl<- table(y) > tt<- c(tbl) > names(tt)<- dimnames(tbl)[[1L]] > o<- sort.list(tt, decreasing = TRUE) > if (length(ll)> numLevels){ > toExclude<- numLevels:length(ll) > tt<- c(tt[o[-toExclude]], `(All Others)` = sum(tt[o[toExclude]]), > `NA's`=sum(nas)) > }else{ > tt<- c(tt[o], `NA's`=sum(nas)) > } > props<- prop.table(tbl); > tt<- c(tt, "Entropy"=entropy(props), "NormedEntropy"= normedEntropy(props)) > } > > > ## Takes a data frame or matrix, scans the columns to find the > ## variables that are not numeric and keeps them. By default it > ## alphabetizes them (alphaSort = FALSE to stop that). It then treats > ## all non-numeric variables as if they were factors, and summarizes > ## each in a say that I find useful. In particular, for each factor, > ## it provides a table of the most frequently occurring values (the > ## top "numLevels" values are represented). As a diversity indictor, > ## it calculates the Entropy and NormedEntropy values for each > ## variable. Note not all of this is original. It combines my code > ## and R code from base/summary.R > summarizeFactors<- function(dat = NULL, numLevels = 10, alphaSort > TRUE, digits = max(3, getOption("digits") - 3)) > { > > ##copies from R base::summary.R summary.data.frame > ncw<- function(x) { > z<- nchar(x, type="w") > if (any(na<- is.na(z))) { > # FIXME: can we do better > z[na]<- nchar(encodeString(z[na]), "b") > } > z > } > > if (!is.data.frame(dat)) dat<- as.data.frame(dat) > ##treat any nonnumeric as a factor > factors<- sapply(dat, function(x) {!is.numeric(x) }) > ##If only one factor, need drop=FALSE. > datf<- dat[ , factors, drop = FALSE] > if (alphaSort) datf<- datf[ , sort(colnames(datf)), drop = FALSE] > z<- lapply(datf, summary.factor, numLevels=numLevels) > nv<- length(datf) > nm<- names(datf) > lw<- numeric(nv) > nr<- max(unlist(lapply(z, NROW))) > for(i in 1L:nv) { > sms<- z[[i]] > lbs<- format(names(sms)) > sms<- paste(lbs, ":", format(sms, digits = digits), " ", > sep = "") > lw[i]<- ncw(lbs[1L]) > length(sms)<- nr > z[[i]]<- sms > } > z<- unlist(z, use.names=TRUE) > dim(z)<- c(nr, nv) > if(any(is.na(lw))) > warning("probably wrong encoding in names(.) of column ", > paste(which(is.na(lw)), collapse = ", ")) > blanks<- paste(character(max(lw, na.rm=TRUE) + 2L), collapse = " ") > pad<- floor(lw - ncw(nm)/2) > nm<- paste(substring(blanks, 1, pad), nm, sep = "") > dimnames(z)<- list(rep.int("", nr), nm) > attr(z, "class")<- c("table") > z > } > > ## > ## want to override summary.data.frame, but confusing. When > ## will R find my summary.data.frame, when will it find the one in base. > ## use ... for numLevels, digits, alphaSort > summarize<- function(dat, ...) > { > dots<- list(...) > dotnames<- names(dots) > ## next should give c("digits", "alphaSort") > nnames<- names(formals(summarizeNumerics))[-1L] > ## names that need keeping if in dots: > keepnames<- dotnames %in% nnames > if( sum(keepnames)> 0 ) { > argList<- modifyList( list("dat"=quote(dat)), dots[keepnames] ) > datn<- do.call("summarizeNumerics", argList) > } else { > datn<- do.call("summarizeNumerics", args=list("dat"= quote(dat))) > } > > ## all ... can go to summarizeFactors > datf<- summarizeFactors(dat, ...) > > value<- list(numerics = datn, factors = datf) > value > } > > > > > set.seed(23452345) > x1<- gl(12,2, labels=LETTERS[1:12]) > x2<- gl(8,3, labels=LETTERS[12:24]) > z1<- rnorm(24) > a1<- rnorm(24, mean=1.2, sd = 1.7) > a2<- rpois(24, lambda=10 + a1) > a3<- rgamma(24, 0.5, 4) > b1<- rnorm(24, mean=1.3, sd = 1.4) > dat<- data.frame(z1, a1, x2, a2, x1, a3, b1) > summary(dat) > > > summarize(dat) > > > summarizeNumerics(dat) > summarizeFactors(dat, numLevels=5) > > summarize(dat, alphaSort=FALSE) > > summarize(dat, digits=6, alphaSort=FALSE) > > summarize(dat, digits=22, alphaSort=FALSE) > > summarize(dat, numLevels= 2) > > datsumm<- summarize(dat) > > datsumm$numerics > datsumm[[1]] ## same: gets numerics > > datsumm$factors > datsumm[[2]] > > > ## Use numerics output to make plots. First, > ## transpose gives varnames x summary stat matrix > datsummNT<- t(datsumm$numerics) > datsummNT<- as.data.frame(datsummNT) > > plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The Variances") > > plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The > Variances", type="n") > text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT)) > > ## Here's a little plot wrinkle. Note variable names are "out to the > ## edge" of the plot. If names are longer they don't stay inside > ## figure. See? > > ## Make the variable names longer > > rownames(datsummNT) > rownames(datsummNT)<- c("boring var","var with long name", "tedious > name var", "stupid varname", "buffoon not baboon") > plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The > Variances", type="n") > text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT), cex=0.8) > ## That's no good. Names across the edges > > ## We could brute force the names outside the edges like this > par(xpd=T) > text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT), cex=0.8) > ## but that is not much better > par(xpd=F) > > ## Here is one fix. Make the unused space inside the plot larger by > ## making xlim and ylim bigger. I use the magRange function from > ## rockchalk to easily expand range to 1.2 times its current size. > ## otherwise, long variable names do not fit inside plot. magRange > ## could be asymmetric if we want, but this use is symmetric. > library(rockchalk) > > rownames(datsummNT) > rownames(datsummNT)<- c("boring var","var with long name", "tedious > name var", "stupid varname", "buffoon not baboon") > plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The > Variances", type="n", xlim=magRange(datsummNT$mean, 1.2), > ylim=magRange(datsummNT$var, 1.2)) > text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT), cex=0.8) > > ## Here's another little plot wrinkle. > ## If we don't do that to keep the names in bounds, we need some > ## fancy footwork. Note when a point is near the edge, I make sure > ## the text prints toward the center of the graph. > plot(datsummNT$mean, datsummNT$var, xlab="The Means", ylab="The Variances") > ## calculate label positions. This is not as fancy as it could be. > ## If there were lots of variables, we'd have to get smarter about > ## positioning labels on above, below, left, or right. > labelPos<- ifelse(datsummNT$mean - mean(datsummNT$mean, na.rm=T)> 0, 2, 4) > text(datsummNT$mean, datsummNT$var, labels=rownames(datsummNT), > cex=0.8, pos=labelPos) > > > > x<- data.frame(x=rnorm(100), y = gl(50,2), z = rep(1:4, 25), ab = gl(2,50)) > > summarize(x) > summarize(x, numLevels=15) > > sumry<- summarize(x) > sumry[[1]] ##another way to get the numerics output > sumry[[2]] ##another way to get the factors output > > dat<- data.frame(x=rnorm(100), y = gl(50,2), z = factor(rep(1:4, 25), > labels=c("A","B","C","D")), animal=factor(ifelse(runif(100)< 0.2, > "cow", ifelse(runif(100)< 0.5,"pig","duck")))) > > summarize(dat) > > dat<- read.table(url("http://pj.freefaculty.org/guides/stat/DataSets/USNewsCollege/USNewsCollege.csv"), > sep=",") > > colnames(dat)<- c("fice", "name", "state", "private", "avemath", "aveverb", > "avecomb", "aveact", "fstmath", "trdmath", "fstverb", "trdverb", > "fstact", "trdact", "numapps", "numacc", "numenr", "pctten", > "pctquart", "numfull", "numpart", "instate", "outstate", > "rmbrdcst", "roomcst", "brdcst", "addfees", "bookcst", "prsnl", > "pctphd", "pctterm", "stdtofac", "pctdonat", "instcst", "gradrate") > > dat$private<- factor(dat$private, labels=c("public","private")) > sumry<- summarize(dat, digits=2) > sumry > > sumry[[1]] > sumry[[2]] > > summarize(dat[ , c("fice","name","private","fstverb","avemath")], digits=4) > >