> dt = data.table(d,key="grp1,grp2")
> system.time(ans1 <- dt[ , list(mean(x),mean(y)) , by=list(grp1,grp2)])
user system elapsed
3.89 0.00 3.91
# Your 7.064 takes 12.23 on my machine, so this 3.9 should be even faster for you.
However, Rprof() shows that most of that 3.9 is spent dispatching mean() to
mean.default(), which then calls .Internal. Because there are so many groups
here, the dispatch overhead bites.
So ...
> system.time(ans2 <- dt[ , list(.Internal(mean(x)),.Internal(mean(y))),
> by=list(grp1,grp2)])
user system elapsed
0.20 0.00 0.21
> identical(ans1,ans2)
TRUE
"Hadley Wickham" <hadley at rice.edu> wrote in message
news:AANLkTilH_-3_CycF_fNQMhH6W2oG5Jj5U0YopX_qAgRU at mail.gmail.com...
> library(plyr)
>
> n<-100000
> grp1<-sample(1:750, n, replace=T)
> grp2<-sample(1:750, n, replace=T)
> d<-data.frame(x=rnorm(n), y=rnorm(n), grp1=grp1, grp2=grp2)
>
> system.time({
> d$avx1 <- ave(d$x, list(d$grp1, d$grp2))
> d$avy1 <- ave(d$y, list(d$grp1, d$grp2))
> })
> # user system elapsed
> # 39.300 0.279 40.809
> system.time({
> d$avx2 <- ave(d$x, interaction(d$grp1, d$grp2, drop = T))
> d$avy2 <- ave(d$y, interaction(d$grp1, d$grp2, drop = T))
> })
> # user system elapsed
> # 6.735 0.209 7.064
>
> all.equal(d$avy1, d$avy2)
> # TRUE
> all.equal(d$avx1, d$avx2)
> # TRUE
>
> i.e. ave should use g <- interaction(..., drop = TRUE)
>
> Hadley
>
> --
> Assistant Professor / Dobelman Family Junior Chair
> Department of Statistics / Rice University
> http://had.co.nz/
>