Dear list members,
Any help on this efficiency issue would be greatly appreciated.
I would like to find the most efficient way to run a non-vectorized function
(here: fisher exact test p-value) iteratively using 4 matrices with identical
dimensions. And as a result I aim for an array with identical dimensions
containing the corresponding p-values. Please consider some code using a trivial
example with 3x4 arrays below. Eventually I would like to run code on 2e3 x 7e6
arrays, for which someone suggested Amazon EC2 already...
Q1: would you agree that fisher.test() is not vectorizable? e.g. fisher.test(
matrix(c(Ax,Ay,Bx,By),ncol=2) ) does not work
Q2: direct use of Ax, Ay, Bx, By as input instead of a (list) transform for the
input would seem beneficial for speed
Q3: parallelization of the iterative process seems to make sense.
Q4: a progress bar seems to save peace of mind having no clue of the runtime.
Q5: avoidance of an output transform to get array from vector
Q6: for Q2/3/4/5 plyr seems to be ideal (e.g. maply)
Please also find some solutions below.
solution 1: using mapply
solution 2: using lapply
solution 3: using mclapply
attempt 4: stuck on plyr implementation
--Philip
### CODE START ###
Ax <- matrix(c(2,3,5,6,
3,7,8,9,
8,2,1,3), ncol = 4)
Ay <- matrix(c(9,8,5,7,
4,9,9,9,
8,7,5,4), ncol = 4)
Bx <- matrix(c(1,5,9,8,
4,7,8,9,
2,3,2,1), ncol = 4)
By <- matrix(c(5,5,9,9,
9,8,8,9,
5,5,3,2), ncol = 4)
### solution 1 using mapply
# proper answer, no input transform, output transform, no parallelization, no
progress update
sol1 <- function() {
res1 <- mapply(
function(i,j,k,l) { fisher.test( matrix(c(i,j,k,l), ncol=2),
conf.int=F)$p.value },
i=Ax, j=Ay, k=Bx, l=By,
SIMPLIFY=TRUE)
ans1 <- matrix(res1,ncol=4)
return(ans1)
}
s1 <- sol1()
### solution 2 using lapply
# proper answer, input transform, output transform, no parallelization, no
progress update
sol2 <- function() {
tmp.list <- as.data.frame(rbind(as.numeric(Ax), as.numeric(Ay),
as.numeric(Bx), as.numeric(By)))
# determine fisher.test p-values as list
res2 <- lapply(tmp.list,
function(x) { fisher.test( matrix(unlist(x), ncol=2), conf.int=F)$p.value })
ans2 <- matrix(unlist(res2),ncol=4)
return(ans2)
}
s2 <- sol2()
### solution 3 using mclapply
# proper answer using input transform, output transform, parallelization, no
progress update
library(multicore)
sol3 <- function() {
tmp.list <- as.data.frame(rbind(as.numeric(Ax), as.numeric(Ay),
as.numeric(Bx), as.numeric(By)))
# determine fisher.test p-values as list
res3 <- mclapply(tmp.list,
function(x) { fisher.test( matrix(unlist(x), ncol=2), conf.int=F)$p.value },
mc.cores=4)
ans3 <- matrix(unlist(res3),ncol=4)
}
s3 <- sol3()
### solution 4 using plyr::maply
# difficulty finding equivalent code
# benefit could be: no input transform, no output transform, parallelization,
and progress update
library(plyr)
library(abind)
library(doMC)
registerDoMC(cores=4)
sol4 <- function() {
ans4 <- maply(
#.data = abind(i=Ax,j=Ay,k=Bx,l=By,along=0),
#.data = abind(Ax,Ay,Bx,By,along=3),
#.data = data.frame(i=Ax, j=Ay, k=Bx, l=By),
#.data = cbind(i=as.vector(Ax), j=as.vector(Ay), k=as.vector(Bx),
l=as.vector(By)),
#.data = list(i=Ax, j=Ay, k=Bx, l=By),
.data = abind(i=Ax, j=Ay, k=Bx, l=By, along=3),
.fun = function(i,j,k,l) { fisher.test( matrix(c(i,j,k,l), ncol=2),
conf.int=F)$p.value },
.progress = "text",
.parralel = TRUE
)
return(ans4)
}
all.equal(s1,s2) # TRUE
all.equal(s1,s3) # TRUE
library(microbenchmark)
microbenchmark(sol1, times=1000)
microbenchmark(sol2, times=1000)
microbenchmark(sol3, times=1000)
microbenchmark(sol4, times=1000)
[[alternative HTML version deleted]]