## for the googleable r-help archives, I thought I would post what I wrote
## into my .Rprofile to automatically set some system information.  the most
## relevant aspect is the determination of mc.cores.  this is useful when
## users want to use the parallel package.

## Sets four options describing the host: uname, os, mc.cores, hostname.
## Everything runs inside local() so that no helper variables leak into the
## global environment when this is sourced from .Rprofile.
local({
  sys_info <- Sys.info()  # may be NULL on platforms that do not implement it
  uname <- if (is.null(sys_info)) "unknown" else sys_info[["sysname"]]

  ## Map the kernel name to the short label used below.  Anything that is
  ## neither macOS nor Linux is rejected explicitly (the earlier version
  ## labelled every non-Darwin system "linux", so its check never fired).
  os <- if (identical(uname, "Darwin")) {
    "osx"
  } else if (identical(uname, "Linux")) {
    "linux"
  } else {
    stop("You need to set options yourself. ?I only grok linux and osx\n")
  }

  ## detectCores() reports the number of logical CPUs.  Counting 'core id'
  ## lines in /proc/cpuinfo yields 0 on machines that do not expose that
  ## field, and mclapply() refuses mc.cores = 0; so fall back to 1.
  cores <- parallel::detectCores()
  if (is.na(cores) || cores < 1L) cores <- 1L

  options(uname = uname)
  options(os = os)
  options(mc.cores = as.numeric(cores))
  options(hostname = sys_info[["nodename"]])
})

## [below, I am also posting my current wrapper for the parallel library.  I
## know it is amateurish, but it may be useful for novices exploring parallel
## calculations.  it is a friendlier face.]
##
## [gripes: R is powerful, and the team that maintains it are saints.  But R
## is not friendly.  it lacks the ability to turn off recycling for enhanced
## error detection.  it does not throw clear errors when one accesses a
## non-existing column in a data frame.  it does not print out the user
## program line number where an error occurred.  it lacks an end-user
## documentation system [like POD], though it does have good package
## documentation.  it does have some unexpected behavior: mymatrix[1:2,] is a
## matrix, but mymatrix[1:1,] is a numeric.  huh?  data.table is necessary
## for reasonably fast data manipulation, but data.table giveth and taketh.
## it has some really strange unexpected behavior---mydatatable[,1] is not
## the first column, as one would expect it to be.  yes, it is documented,
## but syntax should be as expected.]
##
## /iaw
## ----
## Ivo Welch (ivo.welch at gmail.com)

################################################################
###
### these R functions are very type-limited wrappers for
### by()-like operations, using the multicore library.
### this means effort-less multi-CPU calculations.
### (mclapply() is provided by the 'parallel' package.)
###
### the user functions MUST return a numeric scalar, a vector, a matrix, or
### a data frame.  to enhance speed, internally the user function is
### wrapped, too.
###
### the output is ONE matrix, whose row-names are the categories.
################################################################

## Check that a list of per-group results can be stacked with rbind():
## every non-NULL element must be a matrix or a data frame, and all of them
## must have the same number of columns.
##
## mc.rv: list as returned by lapply()/mclapply(); NULL elements are skipped.
## Returns NULL (invisibly); signals an error at the first offending element.
check.output <- function(mc.rv) {
  numofcols <- -1L  # column count of the first non-NULL element, once seen
  ## seq_along() so that an empty list is accepted instead of indexing [[1]]
  for (i in seq_along(mc.rv)) {
    elt <- mc.rv[[i]]
    if (is.null(elt)) next
    if (!(is.matrix(elt) || is.data.frame(elt))) {
      stop(paste("iaw-mc.R:check.output: Element", i,
                 "is not a matrix/dataframe, but a", class(elt)[1L]),
           call. = FALSE)
    }
    if (numofcols < 0L) numofcols <- ncol(elt)
    if (ncol(elt) != numofcols) {
      print(head(elt))
      stop(paste("iaw-mc.R:check.output: Element", i, "should have",
                 numofcols, "columns, but has", ncol(elt),
                 "columns instead."),
           call. = FALSE)
    }
  }
  invisible(NULL)
}

## Label every row of each non-NULL element with the name of the list
## element (the category) it belongs to.
##
## mc.rv: named list of matrices / data frames (NULL elements are left alone).
## Returns the list with row names set.
add.by.names <- function(mc.rv) {
  labels <- names(mc.rv)
  for (i in seq_along(mc.rv)) {
    elt <- mc.rv[[i]]
    if (is.null(elt)) next
    rn <- rep(labels[i], nrow(elt))
    ## data frames reject duplicated row names, so a multi-row data frame
    ## gets "name", "name.1", ...; matrices keep the plain repeated name.
    if (is.data.frame(elt) && !is.null(rn)) rn <- make.unique(as.character(rn))
    row.names(mc.rv[[i]]) <- rn
  }
  mc.rv
}

## Private helper: wrap the user function so that it takes a vector of row
## indexes into 'data' and returns something that can be stacked: NULL is
## passed through, a plain vector becomes a one-row matrix, anything else
## is returned unchanged.
.mc.wrap <- function(data, FUN) {
  force(data)
  force(FUN)
  function(.index, ...) {
    ## note: data[.index, ] uses the default 'drop' behaviour
    rv <- FUN(data[.index, ], ...)
    if (is.null(rv)) rv else if (is.vector(rv)) matrix(rv, nrow = 1) else rv
  }
}

## Work horse behind mc.by() and oc.by().
##
## lcapplyversion: an lapply()-compatible function (lapply, mclapply, ...).
## data:           data frame (or matrix) whose rows are split into groups.
## INDICES:        grouping passed to split().
## FUN:            user function, called once per group with the group's rows.
## ...:            further arguments handed to lcapplyversion, which passes
##                 them on to FUN.
## Returns the stacked per-group results; a one-column result is collapsed
## into a vector named by category.  Errors if nothing was returned at all.
.mc.by <- function(lcapplyversion, data, INDICES, FUN, ...) {
  si <- split(seq_len(nrow(data)), INDICES)
  soln <- lcapplyversion(si, .mc.wrap(data, FUN), ...)
  check.output(soln)
  ## the list names are dropped before stacking: the categories are already
  ## stored as row names, and rbind() on data frames would otherwise prefix
  ## them a second time.
  rv <- do.call("rbind", unname(add.by.names(soln)))
  if (is.null(rv)) {
    print(head(soln))
    stop("Sorry, but in .mc.by, the rv is null!\n", call. = FALSE)
  }
  if (ncol(rv) == 1L) {
    nm <- rownames(rv)
    ## as.vector() on a data frame gives a list, so take the column directly
    rv <- if (is.data.frame(rv)) rv[[1L]] else as.vector(rv)
    names(rv) <- nm
  }
  rv
}

## by()-like operation, groups processed through parallel::mclapply().
mc.by <- function(data, INDICES, FUN, ...) {
  .mc.by(parallel::mclapply, data, INDICES, FUN, ...)
}

## Same as mc.by(), but groups are processed through plain lapply().
oc.by <- function(data, INDICES, FUN, ...) {
  .mc.by(lapply, data, INDICES, FUN, ...)
}

## Apply FUN to every single row of 'data' through parallel::mclapply() and
## stack the results (rows are not named).
##
## data:     data frame (or matrix).
## FUN:      user function, called once per row.
## ...:      further arguments handed to mclapply(), which passes them on
##           to FUN.
## mc.cores: number of cores; taken from the "mc.cores" option when set,
##           otherwise 4.
## Returns the stacked results; a one-column result is collapsed to a vector.
mc.byallrows <- function(data, FUN, ..., mc.cores = getOption("mc.cores", 4L)) {
  si <- as.list(seq_len(nrow(data)))  ## a little faster than the split for large data sets
  soln <- parallel::mclapply(si, .mc.wrap(data, FUN), ..., mc.cores = mc.cores)
  check.output(soln)
  rv <- do.call("rbind", soln)  ## omits naming.
  if (is.null(rv)) {
    stop("Sorry, but in mc.byallrows, the rv is null!\n", call. = FALSE)
  }
  if (ncol(rv) == 1L) rv <- if (is.data.frame(rv)) rv[[1L]] else as.vector(rv)
  rv
}

## Demonstration; deliberately disabled.
if (FALSE) {

  function.sample <- function(d) cbind(d$x + d$y, d$x, d$y)
  function.sample.simpler <- function(d) (d$x + d$y)

  d <- data.frame(i = c(rep(1, 2), rep(2, 3), rep(3, 4)), x = rnorm(9), y = rnorm(9))

  report <- function(text2print, f.output) {
    cat("\n\n", text2print, ":\n")
    print(f.output)
    cat("\n\n")
  }

  report("the original R by() function", by(d, d$i, function.sample))
  report("wrappled multicore by mc.by with user function returning scalar",
         mc.by(d, d$i, function.sample.simpler))
  report("wrappled multicore by mc.by with user function returning vector",
         mc.by(d, d$i, function.sample))
  ## mc.byallrows() takes no INDICES argument: (data, FUN, ...)
  report("wrappled multicore by mc.byallrows ", mc.byallrows(d, function.sample))
}