## Dear R readers---I thought I would post the following snippet of R code
## that makes by()-like operations easier and faster on multicore machines
## for R novices and amateurs.  I hope it helps some.  YMMV.  Feel free to
## ignore.
##
## PS: I wish R had a POD-like documentation system for end users who are
## not writing full libraries.  Because it does not, I did not provide
## documentation a la '?mc.by'.
##
## /iaw

set.seed(0)

## mclapply() is provided by 'parallel', which ships with R and supersedes
## the former 'multicore' package that this snippet originally loaded.
library(parallel)

################################################################
###
### these R functions are very type-limited wrappers for
### by()-like operations, using mclapply().  this
### means effort-less multi-CPU calculations.
###
### the user functions MUST return a numeric scalar or a vector.
###
### to enhance speed, internally the user function is wrapped, too.
###
### the output is ONE matrix, whose row-names are the categories.
###
################################################################

## Label every row of each per-group result matrix with its group name.
##   mc.rv : named list of matrices, one element per group.
## Returns the same list, where the row names of element i are
## names(mc.rv)[i] repeated once per row.  Stops if an element is not a
## matrix.
add.by.names <- function(mc.rv) {
  ## seq_along() rather than 1:length(), so an empty list is a no-op
  for (i in seq_along(mc.rv)) {
    stopifnot(is.matrix(mc.rv[[i]]))
    rownames(mc.rv[[i]]) <- rep(names(mc.rv)[i], nrow(mc.rv[[i]]))
  }
  mc.rv
}

## Internal helper: re-raise the first error that occurred in a worker.
## mclapply() does not propagate worker errors; it hands back objects of
## class "try-error" in place of the results, which would otherwise be
## row-bound into the output or trip an unrelated check later on.
mc.stop.on.error <- function(soln) {
  failed <- vapply(soln, inherits, logical(1), what = "try-error")
  if (any(failed)) {
    first <- soln[[which(failed)[1]]]
    cond <- attr(first, "condition")
    msg <- if (inherits(cond, "condition")) {
      conditionMessage(cond)
    } else {
      as.character(first)
    }
    stop("user function failed: ", msg, call. = FALSE)
  }
  invisible(soln)
}

## by()-like split/apply/combine, with the groups processed by mclapply().
##   data    : data frame (or matrix) whose rows are split into groups.
##   INDICES : grouping factor (anything split() accepts), one entry per row.
##   FUN     : function called on the rows of one group; must return a
##             numeric scalar or vector (or a matrix).
##   ...     : forwarded to mclapply(), and through it to FUN.
## Returns one matrix whose row names are the group names; if that matrix
## has a single column it is returned as a named vector instead.  Returns
## NULL when there are no groups (e.g. zero-row data).
mc.by <- function(data, INDICES, FUN, ...) {
  ## we cbind, so we always get an array back.
  ## drop = FALSE keeps a one-column data frame a data frame, as by() does.
  FUN.i <- function(.index, ...) {
    cbind(FUN(data[.index, , drop = FALSE], ...))
  }
  si <- split(seq_len(nrow(data)), INDICES)
  soln <- mclapply(si, FUN.i, ...)
  mc.stop.on.error(soln)
  rv <- do.call("rbind", add.by.names(soln))
  if (is.null(rv)) {
    return(NULL)  # rbind() of an empty list is NULL; ncol(NULL) would fail
  }
  if (ncol(rv) == 1) {
    nm <- rownames(rv)
    rv <- as.vector(rv)
    names(rv) <- nm
  }
  rv
}

## Apply FUN to every single row of data, one row per mclapply() task.
##   data    : data frame (or matrix); each row is passed to FUN on its own.
##   INDICES : ignored; accepted only so the signature matches mc.by().
##   FUN     : function called on one row; must return a numeric scalar or
##             vector (or a matrix).
##   ...     : forwarded to mclapply(), and through it to FUN.
## Returns the row-bound results as one matrix without row names; if that
## matrix has a single column it is returned as a plain vector.  Returns
## NULL for zero-row data.
mc.byallrows <- function(data, INDICES, FUN, ...) {
  FUN.i <- function(.index, ...) {
    cbind(FUN(data[.index, , drop = FALSE], ...))
  }
  ## a little faster than the split for large data sets
  si <- as.list(seq_len(nrow(data)))
  soln <- mclapply(si, FUN.i, ...)
  mc.stop.on.error(soln)
  rv <- do.call("rbind", soln)  ## omits naming.
  if (is.null(rv)) {
    return(NULL)
  }
  if (ncol(rv) == 1) {
    rv <- as.vector(rv)
  }
  rv
}

## ---- demonstration ------------------------------------------------------

## Example user function returning three columns per row: x + y, x and y.
function.sample <- function(d) cbind(d$x + d$y, d$x, d$y)

## Example user function returning a single value per row: x + y.
function.sample.simpler <- function(d) (d$x + d$y)

## Nine rows in three groups (sizes 2, 3 and 4) with random x and y.
d <- data.frame(i = c(rep(1, 2), rep(2, 3), rep(3, 4)), x = rnorm(9), y = rnorm(9))

## Print a caption followed by the given result.
report <- function(text2print, f.output) {
  cat("\n\n", text2print, ":\n")
  print(f.output)
  cat("\n\n")
}

report("the original R by() function",
       by(d, d$i, function.sample))
report("wrappled multicore by mc.by with user function returning scalar",
       mc.by(d, d$i, function.sample.simpler))
report("wrappled multicore by mc.by with user function returning vector",
       mc.by(d, d$i, function.sample))
report("wrappled multicore by mc.byallrows ",
       mc.byallrows(d, d$i, function.sample))