I need an analogue of "uniq -c" for a data frame. xtabs(), although dog slow, would have footed the bill nicely: --8<---------------cut here---------------start------------->8---> x <- data.frame(a=1:32,b=1:32,c=1:32,d=1:32,e=1:32) > system.time(subset(as.data.frame(xtabs( ~. , x )), Freq != 0 ))user system elapsed 12.788 4.288 17.224 --8<---------------cut here---------------end--------------->8--- but, alas, if fails on larger data: system.time(subset(as.data.frame(xtabs( ~. , x )), Freq != 0 )) Error in table(a = 1:32, b = 1:32, c = 1:32, d = 1:32, e = 1:32, f = 1:32, : attempt to make a table with >= 2^31 elements (apparently, because the product of the numbers of all the possible values of all the columns is too large). rle() seems to be what I really need, but I cannot figure out what it returns for a simple example: --8<---------------cut here---------------start------------->8---> x <- data.frame(a=1:32,b=1:32,c=1:32,d=1:32,e=1:32,f=1:32,g=1:32,h=1:32) > rle(x)Run Length Encoding lengths: int 8 values :'data.frame': 32 obs. of 1 variable: $ h: int 1 2 3 4 5 6 7 8 9 10 ... --8<---------------cut here---------------end--------------->8--- (where are all the other columns?) and it fails on my actual data (3 column of factors): Error in Ops.factor(left, right) : level sets of factors are different when I replace factors with strings, I get: Error in `[.data.frame`(x, i) : undefined columns selected dput: --8<---------------cut here---------------start------------->8--- structure(list(user = c("45ff768774777593", "45ff768774777593", "45ff768774777593", "45ff768774777593", "45ff768774777593", "4bbf9e94cbceb70c", "4bbf9e94cbceb70c", "4fbbf2c67e0fb867", "4fbbf2c67e0fb867", "5038d46739f9f516", "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4fe9e0496ecfc55e", "4fe9e0496ecfc55e", "4fe9e0496ecfc55e", "506b92707a3aa65f", "502c0a9ba9ce5019", "502c0a9ba9ce5019", "502c0a9ba9ce5019", "502c0a9ba9ce5019", "501b52fe24d88162", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4e717c219268b736", "4e717c219268b736", "4e717c219268b736", "506bb429eeab2af4", "506bb429eeab2af4", "506bb429eeab2af4", "4f6f91cb83e1a7ef", "506bb8b62bde3c48", "506bb8b62bde3c48", "506bb8b62bde3c48", "506bb8b62bde3c48", "4edff2aeb4df7613", "4edff2aeb4df7613", "506bba652fa6bf78", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", "506bb525ffce3add", "506bb6cf52819b5f", "506bb6cf52819b5f", "4fe02e08a6a2ce64", "4fe02e08a6a2ce64", "5056d247bc7dae0b", "5056d247bc7dae0b", "5056d247bc7dae0b", "506abfecf61bf4e6", "4ec00aa243745ee5", "4ec00aa243745ee5", "4ec00aa243745ee5", "4ec00aa243745ee5", "506c69d7fb598afa", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111"), country = c("AR", "AR", "AR", "AR", "AR", "BG", "BG", "SK", "SK", "US", "VE", "VE", "VE", "VE", "VE", "VE", "VE", "TH", "TH", "TH", "MY", "US", "US", "US", "US", "MX", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "US", "US", "US", "US", "US", "US", "NP", "US", "US", "US", "US", "MX", "MX", "US", "AR", "AR", "AR", "AR", "AR", "AR", "AR", "AR", "AR", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "US", "MM", "MM", "US", "US", "IN", "IN", "IN", "IN", "CA", "CA", "CA", "CA", "US", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE"), language = c("es", "es", "es", "es", "es", "bg", "bg", "sk", "sk", "en", "es", "es", "es", "es", "es", "es", "es", "th", "th", "th", "en", "en", "en", "en", "en", "es", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "es", "es", "en", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "es", "en", "en", "en", "en", "en", "en", "en", "en", "en", "fr", "fr", "fr", "fr", "en", "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", "de")), .Names = c("user", "country", "language"), row.names = c(1L, 2L, 3229330L, 3229331L, 3229332L, 3L, 9504492L, 4L, 9504493L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 4989444L, 4989445L, 4989446L, 18L, 19L, 20L, 21L, 22L, 95114L, 95115L, 95116L, 95117L, 9504509L, 9599604L, 23L, 3319263L, 3319264L, 24L, 25L, 9504513L, 26L, 27L, 28L, 7548719L, 7548720L, 29L, 30L, 31L, 32L, 2498862L, 2498863L, 2498864L, 2498865L, 4560918L, 4560919L, 5938642L, 14065408L, 33L, 4348151L, 4348152L, 5627634L, 5627635L, 9504522L, 13852641L, 15132124L, 34L, 35L, 6400711L, 36L, 2173763L, 37L, 38L, 39L, 40L, 41L, 10210641L, 10672811L, 16158334L, 42L, 43L, 44L, 45L, 1974646L, 4032952L, 4032953L, 4032954L, 4032955L, 4032956L, 4032957L, 4032958L, 4475376L, 4475377L, 4475378L, 4475379L, 5500564L, 7871329L, 7871330L, 8670694L), class = "data.frame") --8<---------------cut here---------------end--------------->8--- thanks! -- Sam Steingold (http://sds.podval.org/) on Ubuntu 12.04 (precise) X 11.0.11103000 http://www.childpsy.net/ http://thereligionofpeace.com http://dhimmi.com http://ffii.org http://truepeace.org http://mideasttruth.com Bus error -- please leave by the rear door.
Have you looked at using table() directly? If I understand what you want correctly something like: table(do.call(paste, x)) Also, if you take a look at the development version of R, changes are being put in place to allow much larger data sets. Cheers, Micael On Tue, Oct 16, 2012 at 4:03 PM, Sam Steingold <sds at gnu.org> wrote:> I need an analogue of "uniq -c" for a data frame. > > xtabs(), although dog slow, would have footed the bill nicely: > --8<---------------cut here---------------start------------->8--- >> x <- data.frame(a=1:32,b=1:32,c=1:32,d=1:32,e=1:32) >> system.time(subset(as.data.frame(xtabs( ~. , x )), Freq != 0 )) > user system elapsed > 12.788 4.288 17.224 > --8<---------------cut here---------------end--------------->8--- > but, alas, if fails on larger data: > system.time(subset(as.data.frame(xtabs( ~. , x )), Freq != 0 )) > Error in table(a = 1:32, b = 1:32, c = 1:32, d = 1:32, e = 1:32, f = 1:32, : > attempt to make a table with >= 2^31 elements > (apparently, because the product of the numbers of all the possible > values of all the columns is too large). > > rle() seems to be what I really need, but I cannot figure out what it > returns for a simple example: > --8<---------------cut here---------------start------------->8--- >> x <- data.frame(a=1:32,b=1:32,c=1:32,d=1:32,e=1:32,f=1:32,g=1:32,h=1:32) >> rle(x) > Run Length Encoding > lengths: int 8 > values :'data.frame': 32 obs. of 1 variable: > $ h: int 1 2 3 4 5 6 7 8 9 10 ... > --8<---------------cut here---------------end--------------->8--- > (where are all the other columns?) > > and it fails on my actual data (3 column of factors): > Error in Ops.factor(left, right) : level sets of factors are different > > when I replace factors with strings, I get: > Error in `[.data.frame`(x, i) : undefined columns selected > > dput: > --8<---------------cut here---------------start------------->8--- > structure(list(user = c("45ff768774777593", "45ff768774777593", > "45ff768774777593", "45ff768774777593", "45ff768774777593", "4bbf9e94cbceb70c", > "4bbf9e94cbceb70c", "4fbbf2c67e0fb867", "4fbbf2c67e0fb867", "5038d46739f9f516", > "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", > "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4fe9e0496ecfc55e", > "4fe9e0496ecfc55e", "4fe9e0496ecfc55e", "506b92707a3aa65f", "502c0a9ba9ce5019", > "502c0a9ba9ce5019", "502c0a9ba9ce5019", "502c0a9ba9ce5019", "501b52fe24d88162", > "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", > "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", > "4fd4852ed504b160", "4fd4852ed504b160", "4e717c219268b736", "4e717c219268b736", > "4e717c219268b736", "506bb429eeab2af4", "506bb429eeab2af4", "506bb429eeab2af4", > "4f6f91cb83e1a7ef", "506bb8b62bde3c48", "506bb8b62bde3c48", "506bb8b62bde3c48", > "506bb8b62bde3c48", "4edff2aeb4df7613", "4edff2aeb4df7613", "506bba652fa6bf78", > "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", > "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", > "506a4941b50ca422", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", > "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", > "5036993a16b323d1", "506bb525ffce3add", "506bb6cf52819b5f", "506bb6cf52819b5f", > "4fe02e08a6a2ce64", "4fe02e08a6a2ce64", "5056d247bc7dae0b", "5056d247bc7dae0b", > "5056d247bc7dae0b", "506abfecf61bf4e6", "4ec00aa243745ee5", "4ec00aa243745ee5", > "4ec00aa243745ee5", "4ec00aa243745ee5", "506c69d7fb598afa", "5065a0c59c59c111", > "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", > "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", > "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", > "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", > "5065a0c59c59c111", "5065a0c59c59c111"), country = c("AR", "AR", > "AR", "AR", "AR", "BG", "BG", "SK", "SK", "US", "VE", "VE", "VE", > "VE", "VE", "VE", "VE", "TH", "TH", "TH", "MY", "US", "US", "US", > "US", "MX", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", > "JP", "US", "US", "US", "US", "US", "US", "NP", "US", "US", "US", > "US", "MX", "MX", "US", "AR", "AR", "AR", "AR", "AR", "AR", "AR", > "AR", "AR", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "US", > "MM", "MM", "US", "US", "IN", "IN", "IN", "IN", "CA", "CA", "CA", > "CA", "US", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", > "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE"), > language = c("es", "es", "es", "es", "es", "bg", "bg", "sk", > "sk", "en", "es", "es", "es", "es", "es", "es", "es", "th", > "th", "th", "en", "en", "en", "en", "en", "es", "en", "en", > "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", > "en", "en", "en", "en", "en", "en", "en", "en", "en", "es", > "es", "en", "es", "es", "es", "es", "es", "es", "es", "es", > "es", "es", "es", "es", "es", "es", "es", "es", "es", "en", > "en", "en", "en", "en", "en", "en", "en", "en", "fr", "fr", > "fr", "fr", "en", "de", "de", "de", "de", "de", "de", "de", > "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", > "de", "de")), .Names = c("user", "country", "language"), row.names = c(1L, > 2L, 3229330L, 3229331L, 3229332L, 3L, 9504492L, 4L, 9504493L, > 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 4989444L, > 4989445L, 4989446L, 18L, 19L, 20L, 21L, 22L, 95114L, 95115L, > 95116L, 95117L, 9504509L, 9599604L, 23L, 3319263L, 3319264L, > 24L, 25L, 9504513L, 26L, 27L, 28L, 7548719L, 7548720L, 29L, 30L, > 31L, 32L, 2498862L, 2498863L, 2498864L, 2498865L, 4560918L, 4560919L, > 5938642L, 14065408L, 33L, 4348151L, 4348152L, 5627634L, 5627635L, > 9504522L, 13852641L, 15132124L, 34L, 35L, 6400711L, 36L, 2173763L, > 37L, 38L, 39L, 40L, 41L, 10210641L, 10672811L, 16158334L, 42L, > 43L, 44L, 45L, 1974646L, 4032952L, 4032953L, 4032954L, 4032955L, > 4032956L, 4032957L, 4032958L, 4475376L, 4475377L, 4475378L, 4475379L, > 5500564L, 7871329L, 7871330L, 8670694L), class = "data.frame") > --8<---------------cut here---------------end--------------->8--- > > thanks! > > -- > Sam Steingold (http://sds.podval.org/) on Ubuntu 12.04 (precise) X 11.0.11103000 > http://www.childpsy.net/ http://thereligionofpeace.com http://dhimmi.com > http://ffii.org http://truepeace.org http://mideasttruth.com > Bus error -- please leave by the rear door. > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.
Sam Steingold <sds at gnu.org> writes:> I need an analogue of "uniq -c" for a data frame.The count.rows() function is the R analogue. See http://orgmode.org/worg/org-contrib/babel/examples/Rpackage.html#sec-6-1 No need to install the package - just copy and paste the function into an R session. On cases I've tried that are big enough to matter, it is a good deal faster than the table( do.call( paste, x )) idiom. HTH, Chuck> > xtabs(), although dog slow, would have footed the bill nicely: > --8<---------------cut here---------------start------------->8--- >> x <- data.frame(a=1:32,b=1:32,c=1:32,d=1:32,e=1:32) >> system.time(subset(as.data.frame(xtabs( ~. , x )), Freq != 0 )) > user system elapsed > 12.788 4.288 17.224 > --8<---------------cut here---------------end--------------->8--- > but, alas, if fails on larger data: > system.time(subset(as.data.frame(xtabs( ~. , x )), Freq != 0 )) > Error in table(a = 1:32, b = 1:32, c = 1:32, d = 1:32, e = 1:32, f = 1:32, : > attempt to make a table with >= 2^31 elements > (apparently, because the product of the numbers of all the possible > values of all the columns is too large). > > rle() seems to be what I really need, but I cannot figure out what it > returns for a simple example: > --8<---------------cut here---------------start------------->8--- >> x <- data.frame(a=1:32,b=1:32,c=1:32,d=1:32,e=1:32,f=1:32,g=1:32,h=1:32) >> rle(x) > Run Length Encoding > lengths: int 8 > values :'data.frame': 32 obs. of 1 variable: > $ h: int 1 2 3 4 5 6 7 8 9 10 ... > --8<---------------cut here---------------end--------------->8--- > (where are all the other columns?) > > and it fails on my actual data (3 column of factors): > Error in Ops.factor(left, right) : level sets of factors are different > > when I replace factors with strings, I get: > Error in `[.data.frame`(x, i) : undefined columns selected > > dput: > --8<---------------cut here---------------start------------->8--- > structure(list(user = c("45ff768774777593", "45ff768774777593", > "45ff768774777593", "45ff768774777593", "45ff768774777593", "4bbf9e94cbceb70c", > "4bbf9e94cbceb70c", "4fbbf2c67e0fb867", "4fbbf2c67e0fb867", "5038d46739f9f516", > "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", > "4f39c65c2704e79e", "4f39c65c2704e79e", "4f39c65c2704e79e", "4fe9e0496ecfc55e", > "4fe9e0496ecfc55e", "4fe9e0496ecfc55e", "506b92707a3aa65f", "502c0a9ba9ce5019", > "502c0a9ba9ce5019", "502c0a9ba9ce5019", "502c0a9ba9ce5019", "501b52fe24d88162", > "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", > "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", "4fd4852ed504b160", > "4fd4852ed504b160", "4fd4852ed504b160", "4e717c219268b736", "4e717c219268b736", > "4e717c219268b736", "506bb429eeab2af4", "506bb429eeab2af4", "506bb429eeab2af4", > "4f6f91cb83e1a7ef", "506bb8b62bde3c48", "506bb8b62bde3c48", "506bb8b62bde3c48", > "506bb8b62bde3c48", "4edff2aeb4df7613", "4edff2aeb4df7613", "506bba652fa6bf78", > "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", > "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", "506a4941b50ca422", > "506a4941b50ca422", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", > "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", "5036993a16b323d1", > "5036993a16b323d1", "506bb525ffce3add", "506bb6cf52819b5f", "506bb6cf52819b5f", > "4fe02e08a6a2ce64", "4fe02e08a6a2ce64", "5056d247bc7dae0b", "5056d247bc7dae0b", > "5056d247bc7dae0b", "506abfecf61bf4e6", "4ec00aa243745ee5", "4ec00aa243745ee5", > "4ec00aa243745ee5", "4ec00aa243745ee5", "506c69d7fb598afa", "5065a0c59c59c111", > "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", > "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", > "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", > "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", "5065a0c59c59c111", > "5065a0c59c59c111", "5065a0c59c59c111"), country = c("AR", "AR", > "AR", "AR", "AR", "BG", "BG", "SK", "SK", "US", "VE", "VE", "VE", > "VE", "VE", "VE", "VE", "TH", "TH", "TH", "MY", "US", "US", "US", > "US", "MX", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", > "JP", "US", "US", "US", "US", "US", "US", "NP", "US", "US", "US", > "US", "MX", "MX", "US", "AR", "AR", "AR", "AR", "AR", "AR", "AR", > "AR", "AR", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "CO", "US", > "MM", "MM", "US", "US", "IN", "IN", "IN", "IN", "CA", "CA", "CA", > "CA", "US", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", > "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE", "DE"), > language = c("es", "es", "es", "es", "es", "bg", "bg", "sk", > "sk", "en", "es", "es", "es", "es", "es", "es", "es", "th", > "th", "th", "en", "en", "en", "en", "en", "es", "en", "en", > "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", > "en", "en", "en", "en", "en", "en", "en", "en", "en", "es", > "es", "en", "es", "es", "es", "es", "es", "es", "es", "es", > "es", "es", "es", "es", "es", "es", "es", "es", "es", "en", > "en", "en", "en", "en", "en", "en", "en", "en", "fr", "fr", > "fr", "fr", "en", "de", "de", "de", "de", "de", "de", "de", > "de", "de", "de", "de", "de", "de", "de", "de", "de", "de", > "de", "de")), .Names = c("user", "country", "language"), row.names = c(1L, > 2L, 3229330L, 3229331L, 3229332L, 3L, 9504492L, 4L, 9504493L, > 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 4989444L, > 4989445L, 4989446L, 18L, 19L, 20L, 21L, 22L, 95114L, 95115L, > 95116L, 95117L, 9504509L, 9599604L, 23L, 3319263L, 3319264L, > 24L, 25L, 9504513L, 26L, 27L, 28L, 7548719L, 7548720L, 29L, 30L, > 31L, 32L, 2498862L, 2498863L, 2498864L, 2498865L, 4560918L, 4560919L, > 5938642L, 14065408L, 33L, 4348151L, 4348152L, 5627634L, 5627635L, > 9504522L, 13852641L, 15132124L, 34L, 35L, 6400711L, 36L, 2173763L, > 37L, 38L, 39L, 40L, 41L, 10210641L, 10672811L, 16158334L, 42L, > 43L, 44L, 45L, 1974646L, 4032952L, 4032953L, 4032954L, 4032955L, > 4032956L, 4032957L, 4032958L, 4475376L, 4475377L, 4475378L, 4475379L, > 5500564L, 7871329L, 7871330L, 8670694L), class = "data.frame") > --8<---------------cut here---------------end--------------->8--- > > thanks!
> * Sam Steingold <fqf at tah.bet> [2012-10-16 11:03:27 -0400]: > > I need an analogue of "uniq -c" for a data frame.Summary of options: 1. William: isFirstInRun <- function(x) UseMethod("isFirstInRun") isFirstInRun.default <- function(x) c(TRUE, x[-1] != x[-length(x)]) isFirstInRun.data.frame <- function(x) { stopifnot(ncol(x)>0) retval <- isFirstInRun(x[[1]]) for(column in x) { retval <- retval | isFirstInRun(column) } retval } row.count.1 <- function (x) { i <- which(isFirstInRun(x)) data.frame(x[i,], count=diff(c(i, 1L+nrow(x)))) } 147 seconds 2. http://orgmode.org/worg/org-contrib/babel/examples/Rpackage.html#sec-6-1 row.count.2 <- function (x) { equal.to.previous <- rowSums( x[2:nrow(x),] != x[1:(nrow(x)-1),] )==0 tf.runs <- rle(equal.to.previous) counts <- c(1, unlist(mapply(function(x,y) if (y) x+1 else (rep(1,x)), tf.runs$length, tf.runs$value))) counts <- counts[ c( diff( counts ) <= 0, TRUE ) ] unique.rows <- which( c(TRUE, !equal.to.previous ) ) cbind(x[ unique.rows, ,drop=FALSE ], counts) } 136 seconds 3. Micael: paste/strsplit row.count.3 <- function (x) { pa <- do.call(paste,x) rl <- rle(p) sp <- strsplit(as.character(rl$values)," ") data.frame(user = sapply(sp,"[",1), country = sapply(sp,"[",2), language = sapply(sp,"[",3), count = rl$length) } here I know the columns and rely on absense of spaces in values. 27 seconds. Thanks to all who answered. -- Sam Steingold (http://sds.podval.org/) on Ubuntu 12.04 (precise) X 11.0.11103000 http://www.childpsy.net/ http://www.PetitionOnline.com/tap12009/ http://thereligionofpeace.com http://ffii.org http://camera.org A slave dreams not of Freedom, but of owning his own slaves.
Note that the relative speeds of these, which all use basically the same run-length-encoding algorithm, depend on the nature of the dataset. I made a million row data.frame with 10,000 unique users, 26 unique countries, and 6 unique languages with c. 3/4 million unique rows. Then the times for methods 1, 2, and 3 were 0.7, 6.2, and 10.5 seconds, respectively. With a million row data.frame with 100, 10, and 4 unique users, countries, and languages, with 4000 unique rows, the times were 0.3, 1.4, and 0.7. Bill Dunlap Spotfire, TIBCO Software wdunlap tibco.com> -----Original Message----- > From: r-help-bounces at r-project.org [mailto:r-help-bounces at r-project.org] On Behalf > Of Sam Steingold > Sent: Wednesday, October 17, 2012 12:58 PM > To: r-help at r-project.org > Subject: Re: [R] uniq -c > > > * Sam Steingold <fqf at tah.bet> [2012-10-16 11:03:27 -0400]: > > > > I need an analogue of "uniq -c" for a data frame. > > Summary of options: > > 1. William: > > isFirstInRun <- function(x) UseMethod("isFirstInRun") > isFirstInRun.default <- function(x) c(TRUE, x[-1] != x[-length(x)]) > isFirstInRun.data.frame <- function(x) { > stopifnot(ncol(x)>0) > retval <- isFirstInRun(x[[1]]) > for(column in x) { > retval <- retval | isFirstInRun(column) > } > retval > } > row.count.1 <- function (x) { > i <- which(isFirstInRun(x)) > data.frame(x[i,], count=diff(c(i, 1L+nrow(x)))) > } > > 147 seconds > > 2. http://orgmode.org/worg/org-contrib/babel/examples/Rpackage.html#sec-6-1 > row.count.2 <- function (x) { > equal.to.previous <- rowSums( x[2:nrow(x),] != x[1:(nrow(x)-1),] )==0 > tf.runs <- rle(equal.to.previous) > counts <- c(1, unlist(mapply(function(x,y) if (y) x+1 else (rep(1,x)), > tf.runs$length, tf.runs$value))) > counts <- counts[ c( diff( counts ) <= 0, TRUE ) ] > unique.rows <- which( c(TRUE, !equal.to.previous ) ) > cbind(x[ unique.rows, ,drop=FALSE ], counts) > } > > 136 seconds > > 3. Micael: paste/strsplit > > row.count.3 <- function (x) { > pa <- do.call(paste,x) > rl <- rle(p) > sp <- strsplit(as.character(rl$values)," ") > data.frame(user = sapply(sp,"[",1), > country = sapply(sp,"[",2), > language = sapply(sp,"[",3), > count = rl$length) > } > > here I know the columns and rely on absense of spaces in values. > > 27 seconds. > > Thanks to all who answered. > > -- > Sam Steingold (http://sds.podval.org/) on Ubuntu 12.04 (precise) X 11.0.11103000 > http://www.childpsy.net/ http://www.PetitionOnline.com/tap12009/ > http://thereligionofpeace.com http://ffii.org http://camera.org > A slave dreams not of Freedom, but of owning his own slaves. > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.
In addition, adding a factor method for isFirstInRun speeds it up on long factor variables by c. 60%. isFirstInRun.factor <- function(x)isFirstInRun(as.integer(x)) Bill Dunlap Spotfire, TIBCO Software wdunlap tibco.com> -----Original Message----- > From: r-help-bounces at r-project.org [mailto:r-help-bounces at r-project.org] On Behalf > Of William Dunlap > Sent: Wednesday, October 17, 2012 2:11 PM > To: sds at gnu.org; r-help at r-project.org > Subject: Re: [R] uniq -c > > Note that the relative speeds of these, which all use basically the same run-length- > encoding > algorithm, depend on the nature of the dataset. I made a million row data.frame with > 10,000 > unique users, 26 unique countries, and 6 unique languages with c. 3/4 million unique > rows. Then the times for methods 1, 2, and 3 were 0.7, 6.2, and 10.5 seconds, > respectively. With a million row data.frame with 100, 10, and 4 unique users, countries, > and languages, with 4000 unique rows, the times were 0.3, 1.4, and 0.7. > > Bill Dunlap > Spotfire, TIBCO Software > wdunlap tibco.com > > > > -----Original Message----- > > From: r-help-bounces at r-project.org [mailto:r-help-bounces at r-project.org] On Behalf > > Of Sam Steingold > > Sent: Wednesday, October 17, 2012 12:58 PM > > To: r-help at r-project.org > > Subject: Re: [R] uniq -c > > > > > * Sam Steingold <fqf at tah.bet> [2012-10-16 11:03:27 -0400]: > > > > > > I need an analogue of "uniq -c" for a data frame. > > > > Summary of options: > > > > 1. William: > > > > isFirstInRun <- function(x) UseMethod("isFirstInRun") > > isFirstInRun.default <- function(x) c(TRUE, x[-1] != x[-length(x)]) > > isFirstInRun.data.frame <- function(x) { > > stopifnot(ncol(x)>0) > > retval <- isFirstInRun(x[[1]]) > > for(column in x) { > > retval <- retval | isFirstInRun(column) > > } > > retval > > } > > row.count.1 <- function (x) { > > i <- which(isFirstInRun(x)) > > data.frame(x[i,], count=diff(c(i, 1L+nrow(x)))) > > } > > > > 147 seconds > > > > 2. http://orgmode.org/worg/org-contrib/babel/examples/Rpackage.html#sec-6-1 > > row.count.2 <- function (x) { > > equal.to.previous <- rowSums( x[2:nrow(x),] != x[1:(nrow(x)-1),] )==0 > > tf.runs <- rle(equal.to.previous) > > counts <- c(1, unlist(mapply(function(x,y) if (y) x+1 else (rep(1,x)), > > tf.runs$length, tf.runs$value))) > > counts <- counts[ c( diff( counts ) <= 0, TRUE ) ] > > unique.rows <- which( c(TRUE, !equal.to.previous ) ) > > cbind(x[ unique.rows, ,drop=FALSE ], counts) > > } > > > > 136 seconds > > > > 3. Micael: paste/strsplit > > > > row.count.3 <- function (x) { > > pa <- do.call(paste,x) > > rl <- rle(p) > > sp <- strsplit(as.character(rl$values)," ") > > data.frame(user = sapply(sp,"[",1), > > country = sapply(sp,"[",2), > > language = sapply(sp,"[",3), > > count = rl$length) > > } > > > > here I know the columns and rely on absense of spaces in values. > > > > 27 seconds. > > > > Thanks to all who answered. > > > > -- > > Sam Steingold (http://sds.podval.org/) on Ubuntu 12.04 (precise) X 11.0.11103000 > > http://www.childpsy.net/ http://www.PetitionOnline.com/tap12009/ > > http://thereligionofpeace.com http://ffii.org http://camera.org > > A slave dreams not of Freedom, but of owning his own slaves. > > > > ______________________________________________ > > R-help at r-project.org mailing list > > https://stat.ethz.ch/mailman/listinfo/r-help > > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > > and provide commented, minimal, self-contained, reproducible code. > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.