I am calculating the mean of each column grouped by the variable 'id'. I do this using aggregate, data.table, and plyr. My aggregate results do not match the other two, and I am trying to figure out what is incorrect with my syntax. Any suggestions? Thanks. Here is the data. myData <- structure(list(var1 = c(31.59, 32.21, 31.78, 31.34, 31.61, 31.61, 30.59, 30.84, 30.98, 30.79, 30.79, 30.94, 31.08, 31.27, 31.11, 30.42, 30.37, 30.29, 30.06, 30.3, 30.43, 30.61, 30.64, 30.75, 30.39, 30.1, 30.25, 31.55, 31.96, 31.87, 30.29, 30.15, 30.37, 29.59, 29.52, 28.96, 29.69, 29.58, 29.52, 30.21, 30.3, 30.25, 30.23, 30.29, 30.39), var2 = c(33.78, 33.25, NA, 32.05, 32.59, NA, 32.24, NA, NA, 32.15, 32.39, NA, 32.4, 31.6, NA, 30.5, 30.66, NA, 30.6, 29.95, NA, 31.24, 30.73, NA, 30.51, 30.43, 31.17, 31.44, 31.17, 31.18, 31.01, 30.98, 31.25, 30.44, 30.47, NA, 30.47, 30.56, NA, 30.6, 30.57, NA, 31, 30.8, NA), id = c("0m4", "0m4", "0m4", "0m5", "0m5", "0m5", "0m6", "0m6", "0m6", "0m11", "0m11", "0m11", "0m12", "0m12", "0m12", "205m1", "205m1", "205m1", "205m4", "205m4", "205m4", "205m5", "205m5", "205m5", "205m6", "205m6", "205m6", "205m7", "205m7", "205m7", "600m1", "600m1", "600m1", "600m3", "600m3", "600m3", "600m4", "600m4", "600m4", "600m5", "600m5", "600m5", "600m7", "600m7", "600m7")), .Names = c("var1", "var2", "id"), row.names = c(NA, -45L), class = "data.frame")> head(myData)var1 var2 id 1 31.59 33.78 0m4 2 32.21 33.25 0m4 3 31.78 NA 0m4 4 31.34 32.05 0m5 5 31.61 32.59 0m5 6 31.61 NA 0m5 results1 <- aggregate(. 
~ id ,data=myData,FUN=mean,na.rm=T) head(results1,1) # id var1 var2 # 1 0m11 30.79 32.27 library(data.table) mydt <- data.table(myData) setkey(mydt,id) results2 <- mydt[,lapply(.SD,mean,na.rm=TRUE),by=id] head(results2,1) # id var1 var2 # [1,] 0m11 30.84 32.27 library(plyr) results3 <- ddply(myData,.(id),colwise(mean),na.rm=TRUE) head(results3,1) # id var1 var2 # 1 0m11 30.84 32.27> sessionInfo()R version 2.14.0 (2011-10-31) Platform: i386-pc-mingw32/i386 (32-bit) locale: [1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252 LC_MONETARY=English_United States.1252 LC_NUMERIC=C [5] LC_TIME=English_United States.1252 attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] plyr_1.6 data.table_1.7.3
The handling of missing values is different for aggregate than for the other options. The formula method of aggregate applies its default na.action = na.omit, which removes any row that contains an NA in any column before the computation is performed (so na.rm = TRUE has nothing left to remove), whereas the latter methods pass na.rm = TRUE to mean and therefore work column-wise. For example, aggregate(. ~ id, data = myData, FUN = mean, na.rm = TRUE, na.action = na.pass) gives the column-wise result. You have to decide which is correct for your purposes. --------------------------------------------------------------------------- Jeff Newmiller The ..... ..... Go Live... DCN:<jdnewmil at dcn.davis.ca.us> Basics: ##.#. ##.#. Live Go... Live: OO#.. Dead: OO#.. Playing Research Engineer (Solar/Batteries O.O#. #.O#. with /Software/Embedded Controllers) .OO#. .OO#. rocks...1k --------------------------------------------------------------------------- Sent from my phone. Please excuse my brevity. Juliet Hannah <juliet.hannah at gmail.com> wrote: >I am calculating the mean of each column grouped by the variable 'id'. >I do this using aggregate, data.table, and plyr. My aggregate results >do not match the other two, and I am trying to figure out what is >incorrect with my syntax. Any suggestions? Thanks. > >Here is the data. 
> >myData <- structure(list(var1 = c(31.59, 32.21, 31.78, 31.34, 31.61, >31.61, >30.59, 30.84, 30.98, 30.79, 30.79, 30.94, 31.08, 31.27, 31.11, >30.42, 30.37, 30.29, 30.06, 30.3, 30.43, 30.61, 30.64, 30.75, >30.39, 30.1, 30.25, 31.55, 31.96, 31.87, 30.29, 30.15, 30.37, >29.59, 29.52, 28.96, 29.69, 29.58, 29.52, 30.21, 30.3, 30.25, >30.23, 30.29, 30.39), var2 = c(33.78, 33.25, NA, 32.05, 32.59, >NA, 32.24, NA, NA, 32.15, 32.39, NA, 32.4, 31.6, NA, 30.5, 30.66, >NA, 30.6, 29.95, NA, 31.24, 30.73, NA, 30.51, 30.43, 31.17, 31.44, >31.17, 31.18, 31.01, 30.98, 31.25, 30.44, 30.47, NA, 30.47, 30.56, >NA, 30.6, 30.57, NA, 31, 30.8, NA), id = c("0m4", "0m4", "0m4", >"0m5", "0m5", "0m5", "0m6", "0m6", "0m6", "0m11", "0m11", "0m11", >"0m12", "0m12", "0m12", "205m1", "205m1", "205m1", "205m4", "205m4", >"205m4", "205m5", "205m5", "205m5", "205m6", "205m6", "205m6", >"205m7", "205m7", "205m7", "600m1", "600m1", "600m1", "600m3", >"600m3", "600m3", "600m4", "600m4", "600m4", "600m5", "600m5", >"600m5", "600m7", "600m7", "600m7")), .Names = c("var1", "var2", >"id"), row.names = c(NA, -45L), class = "data.frame") > >> head(myData) > var1 var2 id >1 31.59 33.78 0m4 >2 32.21 33.25 0m4 >3 31.78 NA 0m4 >4 31.34 32.05 0m5 >5 31.61 32.59 0m5 >6 31.61 NA 0m5 > > > >results1 <- aggregate(. 
~ id ,data=myData,FUN=mean,na.rm=T) > head(results1,1) ># id var1 var2 ># 1 0m11 30.79 32.27 > >library(data.table) >mydt <- data.table(myData) >setkey(mydt,id) >results2 <- mydt[,lapply(.SD,mean,na.rm=TRUE),by=id] > head(results2,1) ># id var1 var2 ># [1,] 0m11 30.84 32.27 > >library(plyr) >results3 <- ddply(myData,.(id),colwise(mean),na.rm=TRUE) > head(results3,1) ># id var1 var2 ># 1 0m11 30.84 32.27 > >> sessionInfo() >R version 2.14.0 (2011-10-31) >Platform: i386-pc-mingw32/i386 (32-bit) > >locale: >[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United >States.1252 LC_MONETARY=English_United States.1252 LC_NUMERIC=C >[5] LC_TIME=English_United States.1252 > >attached base packages: >[1] stats graphics grDevices utils datasets methods base > >other attached packages: >[1] plyr_1.6 data.table_1.7.3 > >______________________________________________ >R-help at r-project.org mailing list >https://stat.ethz.ch/mailman/listinfo/r-help >PLEASE do read the posting guide >http://www.R-project.org/posting-guide.html >and provide commented, minimal, self-contained, reproducible code.
Look at just the rows of your data that are in that first id category and I bet you can figure it out! > myData[myData$id=='0m11',] var1 var2 id 10 30.79 32.15 0m11 11 30.79 32.39 0m11 12 30.94 NA 0m11 aggregate drops the entire row 12 because its var2 is NA, so var1 is averaged over rows 10 and 11 only, thus a mean of 30.79. data.table and plyr perform the na.rm on each column separately, so var1 is averaged over all three rows, giving 30.84. Justin On Tue, Nov 29, 2011 at 12:21 PM, Juliet Hannah <juliet.hannah@gmail.com> wrote: > I am calculating the mean of each column grouped by the variable 'id'. > I do this using aggregate, data.table, and plyr. My aggregate results > do not match the other two, and I am trying to figure out what is > incorrect with my syntax. Any suggestions? Thanks. > > Here is the data. > > myData <- structure(list(var1 = c(31.59, 32.21, 31.78, 31.34, 31.61, 31.61, > 30.59, 30.84, 30.98, 30.79, 30.79, 30.94, 31.08, 31.27, 31.11, > 30.42, 30.37, 30.29, 30.06, 30.3, 30.43, 30.61, 30.64, 30.75, > 30.39, 30.1, 30.25, 31.55, 31.96, 31.87, 30.29, 30.15, 30.37, > 29.59, 29.52, 28.96, 29.69, 29.58, 29.52, 30.21, 30.3, 30.25, > 30.23, 30.29, 30.39), var2 = c(33.78, 33.25, NA, 32.05, 32.59, > NA, 32.24, NA, NA, 32.15, 32.39, NA, 32.4, 31.6, NA, 30.5, 30.66, > NA, 30.6, 29.95, NA, 31.24, 30.73, NA, 30.51, 30.43, 31.17, 31.44, > 31.17, 31.18, 31.01, 30.98, 31.25, 30.44, 30.47, NA, 30.47, 30.56, > NA, 30.6, 30.57, NA, 31, 30.8, NA), id = c("0m4", "0m4", "0m4", > "0m5", "0m5", "0m5", "0m6", "0m6", "0m6", "0m11", "0m11", "0m11", > "0m12", "0m12", "0m12", "205m1", "205m1", "205m1", "205m4", "205m4", > "205m4", "205m5", "205m5", "205m5", "205m6", "205m6", "205m6", > "205m7", "205m7", "205m7", "600m1", "600m1", "600m1", "600m3", > "600m3", "600m3", "600m4", "600m4", "600m4", "600m5", "600m5", > "600m5", "600m7", "600m7", "600m7")), .Names = c("var1", "var2", > "id"), row.names = c(NA, -45L), class = "data.frame") > > > head(myData) > var1 var2 id > 1 31.59 33.78 0m4 > 2 32.21 33.25 0m4 > 3 31.78 NA 0m4 > 4 31.34 32.05 0m5 > 5 31.61 32.59 0m5 > 6 31.61 NA 0m5 > > > > results1 <- 
aggregate(. ~ id ,data=myData,FUN=mean,na.rm=T) > head(results1,1) > # id var1 var2 > # 1 0m11 30.79 32.27 > > library(data.table) > mydt <- data.table(myData) > setkey(mydt,id) > results2 <- mydt[,lapply(.SD,mean,na.rm=TRUE),by=id] > head(results2,1) > # id var1 var2 > # [1,] 0m11 30.84 32.27 > > library(plyr) > results3 <- ddply(myData,.(id),colwise(mean),na.rm=TRUE) > head(results3,1) > # id var1 var2 > # 1 0m11 30.84 32.27 > > > sessionInfo() > R version 2.14.0 (2011-10-31) > Platform: i386-pc-mingw32/i386 (32-bit) > > locale: > [1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United > States.1252 LC_MONETARY=English_United States.1252 LC_NUMERIC=C > [5] LC_TIME=English_United States.1252 > > attached base packages: > [1] stats graphics grDevices utils datasets methods base > > other attached packages: > [1] plyr_1.6 data.table_1.7.3 > > ______________________________________________ > R-help@r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide > http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code. >[[alternative HTML version deleted]]