Benjamin Caldwell
2013-Apr-24 00:00 UTC
[R] help with execution of 'embarrassingly parallel' problem using foreach, doParallel on a windows system
Dear R helpers,
I have what another member on this forum described as
an embarrassingly parallel problem. I am trying to fit models on subsets of
some data based on unique combinations of two id factors in the dataset.
Total number of combinations is 30^5, and this takes a long time. So, I
would like fit models for each of the datasets produced by subsetting on
the unique combinations, splitting up the matrix of combinations into
pieces based on how many cores I have available to me (e.g. eight cores,
eight chunks).
I spent much of the day today trying to wrap myself around an expedient way
to do this on a windows system in R (seems like the 'best' way to do
this
on systems that support forking is much more settled), and picked foreach
and doParallel based on the package documentation for foreach; I'm open to
other suggestions.
Having tried it out though, it doesn't seem to be doing anything to speed
up the task - indeed, it takes more time! Also, compiling the functions
results in the 'winner' in terms of the serial and (I
hope) parallel implementations being reversed.
Most importantly, what have I done wrong here and how can I make this work?
What's the interaction of compile and foreach I'm seeing?
How can I tell whether multi-threading is occurring?
Suggestions for another approach?
Example:
###############################################################################################
require(foreach)
require(doParallel)
require(compile)
require(compiler)
cl <- makeCluster(detectCores()-1)
registerDoParallel(cl)
# foreach(i=1:100) %dopar% sqrt(i)
d <- sort(rep(letters[1:24], 5))
e <- rep(1:24, 5)
rand.int <- rnorm(n=length(e),mean=e, sd=4)
f <- 3+ (e*rand.int)^2
g <- sort(rep(1:6, 20))
one <- data.frame(d,e,f,g)
names(one) <- c('block1', 'ind',
'res','block2')
one[1:50,]
two <- expand.grid(one[,1], one[,4]) #actually is
expand.grid(unique(one[,1]), unique(one[,4]))
str(two)
names(two) <- c('block1', 'block2')
fitting <-
function(ndx.grd=two,dt.grd=one,ind.vr='ind',rsp.vr='res') {
ind.start<-10^8
item.out <- matrix(NA, ncol=3)
for(i in 1:length(ndx.grd[,1])){
tmp1 <- as.character(ndx.grd[i,1])
tmp2 <- as.character(ndx.grd[i,2])
wk.grd <- dt.grd[as.character(dt.grd[,1])==tmp1 &
as.character(dt.grd[,4])==tmp2,]
# browser()
try(ind.out <- summary(lm(ind~res, data=wk.grd))$sigma, silent=TRUE)
if(ind.out < ind.start) {
item.out[,1] <- tmp1
item.out[,2] <- tmp2
item.out[,3] <- ind.out
}
}
return(item.out)
}
fitting.c<-cmpfun(fitting)
#compiled
Rprof('myFunction.out', memory.profiling=T)
y <- fitting.c()
Rprof(NULL)
summaryRprof('myFunction.out', memory='both')
system.time(fitting.c())
Rprof('myFunction.out', memory.profiling=T)
y <- foreach(icount(length(two))) %dopar% fitting.c()
Rprof(NULL)
summaryRprof('myFunction.out', memory='both')
system.time(foreach(icount(length(two))) %dopar% fitting.c())
#uncompiled
Rprof('myFunction.out', memory.profiling=T)
y <- fitting()
Rprof(NULL)
summaryRprof('myFunction.out', memory='both')
system.time(fitting())
Rprof('myFunction.out', memory.profiling=T)
y <- foreach(icount(length(two))) %dopar% fitting()
Rprof(NULL)
summaryRprof('myFunction.out', memory='both')
system.time(foreach(icount(length(two))) %dopar% fitting())
###############################################################################################
*
*
*Ben Caldwell*
[[alternative HTML version deleted]]
