thr3ads.net - R devel - [Rd] a fast table() for the 1D case [Aug 2013]

If this information is useful, please help other people find it:
Share via:

Hervé Pagès

2013-Aug-09 08:19 UTC

[Rd] a fast table() for the 1D case

Hi,

table1D() below can be up to 60x faster than base::table() for the 1D
case. Here are the detailed speedups compared to base::table().

   o With a logical vector of length 5M:     11x faster
                                     (or more if
'useNA="always"')

   o With factor/integer/numeric/character of length 1M and 9 levels
     (or 9 distinct values for non-factors):
      - factor:                              60x faster
      - integer/numeric vector:              12x faster
      - character vector:                   2.4x faster

   o With factor/integer/numeric/character of length 1M and no
     duplicates:
       - factor:                              5x faster
       - integer vector:                      2x faster
       - numeric vector:                    1.7x faster
       - character vector:       no significant speedup

Would be great if this improvement could make it into base::table().

Thanks,
H.

   ## A fast table() implementation for the 1D case (replacing the '...'
   ## arg with 'x' and omitting the 'dnn' and
'deparse.level' arguments
   ## which are unrelated to performance).

   table1D <- function(x, exclude = if (useNA == "no") c(NA, NaN),
                       useNA = c("no", "ifany",
"always"))
   {
     if (!missing(exclude) && is.null(exclude)) {
         useNA <- "always"
     } else {
         useNA <- match.arg(useNA)
     }
     if (useNA == "always" && !missing(exclude))
         exclude <- setdiff(exclude, NA)
     if (is.factor(x)) {
         x2 <- levels(x)
         append_NA <- (useNA == "always" ||
                       useNA == "ifany" && any(is.na(x)))
&&
                      !any(is.na(x2))
         if (append_NA) {
             x2 <- c(x2, NA)
             x <- factor(x, levels=x2, exclude=NULL)
         }
         t2 <- tabulate(x, nbins=length(x2))
         if (!is.null(exclude)) {
             keep_idx <- which(!(x2 %in% exclude))
             x2 <- x2[keep_idx]
             t2 <- t2[keep_idx]
         }
     } else {
         xx <- match(x, x)
         t <- tabulate(xx, nbins=length(xx))
         keep_idx <- which(t != 0L)
         x2 <- x[keep_idx]
         t2 <- t[keep_idx]
         if (!is.null(exclude)) {
             exclude <- as.vector(exclude, typeof(x))
             keep_idx <- which(!(x2 %in% exclude))
             x2 <- x2[keep_idx]
             t2 <- t2[keep_idx]
         }
         oo <- order(x2)
         x2 <- x2[oo]
         t2 <- t2[oo]
         append_NA <- useNA == "always" && !any(is.na(x2))
         if (append_NA) {
             x2 <- c(x2, NA)
             t2 <- c(t2, 0L)
         }
     }
     ans <- array(t2)
     dimnames(ans) <- list(as.character(x2))
     names(dimnames(ans)) <- "x"  # always set to 'x'
     class(ans) <- "table"
     ans
   }

table1D() also fixes some issues with base::table() that can be exposed
by running the tests below.

   test_table <- function(FUN_NAME)
   {
     FUN <- match.fun(FUN_NAME)

     .make_target <- function(target_names, target_data)
     {
         ans <- array(target_data)
         dimnames(ans) <- list(as.character(target_names))
         names(dimnames(ans)) <- "x"
         class(ans) <- "table"
         ans
     }

     .check_identical <- function(target, current, varname, extra_args)
     {
         if (identical(target, current))
             return()
         if (extra_args != "")
             extra_args <- paste0(", ", extra_args)
         cat("unexpected result for '", FUN_NAME,
             "(x=", varname, extra_args, ")'\n",
sep="")
     }

     .test_exclude <- function(x, varname, target_names0, target_data0, 
exclude)
     {
         extra_args <- paste0("exclude=", deparse(exclude))
         current <- FUN(x=x, exclude=exclude)
         target_names <- target_names0
         target_data <- target_data0
         if (is.null(exclude)) {
             if (!any(is.na(target_names))) {
                 target_names <- c(target_names, NA)
                 target_data <- c(target_data, 0L)
             }
         } else {
             if (!is.factor(x)) {
                 exclude <- as.vector(exclude, typeof(x))
             } else if (!any(is.na(levels(x)))) {
                 exclude <- union(exclude, NA)
             }
             exclude_idx <- match(exclude, target_names, nomatch=0L)
             if (any(exclude_idx != 0L)) {
                 target_names <- target_names[-exclude_idx]
                 target_data <- target_data[-exclude_idx]
             }
         }
         target <- .make_target(target_names, target_data)
         .check_identical(target, current, varname, extra_args)
     }

     .do_exclude_tests <- function(x, varname, target_names0, target_data0,
                                   more_excludes=NULL)
     {
         .BASIC_EXCLUDES <- list(c(NA, NaN), NULL, numeric(0), NA, NaN)
         excludes <- c(.BASIC_EXCLUDES, more_excludes)
         for (exclude in excludes)
             .test_exclude(x, varname, target_names0, target_data0, exclude)
     }

     ## Test on a numeric vector.
     x0 <- numeric(0)
     .do_exclude_tests(x0, "x0", character(0), integer(0), list(5.3))

     x1_target_names0 <- c(-9, 4, 5.3, NaN, NA)
     x1_target_data0 <- c(1L, 2L, 1L, 2L, 3L)
     x1 <- c(5.3, 4, NaN, 4, NA, NA, NaN, -9, NA)
     excludes <- list(c(5.3, -9),
                      c(5.3, NA, -9),
                      c(5.3, NaN, -9),
                      c(5.3, 80, -9),
                      x1_target_names0)
     .do_exclude_tests(x1, "x1", x1_target_names0, x1_target_data0, 
excludes)

     x2_target_names0 <- c(-9, 4, 5.3, NA, NaN)
     x2_target_data0 <- c(1L, 2L, 1L, 3L, 2L)
     x2 <- rev(x1)
     .do_exclude_tests(x2, "x2", x2_target_names0, x2_target_data0, 
excludes)

     x3_target_names0 <- c(-9, 4, 5.3)
     x3_target_data0 <- c(1L, 2L, 1L)
     x3 <- c(5.3, 4, 4, -9)
     .do_exclude_tests(x3, "x3", x3_target_names0, x3_target_data0, 
excludes)

     ## Test on a factor.
     f0 <- factor()
     .do_exclude_tests(f0, "f0", character(0), integer(0), list(5.3))

     f1 <- factor(x1)
     .do_exclude_tests(f1, "f1", x1_target_names0, x1_target_data0, 
excludes)

     f2 <- factor(x1, exclude=NULL)
     .do_exclude_tests(f2, "f2", x1_target_names0, x1_target_data0, 
excludes)

     f3_target_names0 <- c(6.82, x1_target_names0, -7.66)
     f3_target_data0 <- c(0L, 1L, 2L, 1L, 0L, 0L, 0L)
     f3 <- factor(x3, levels=f3_target_names0, exclude=NULL)
     .do_exclude_tests(f3, "f3", f3_target_names0, f3_target_data0, 
excludes)

     x4_target_names0 <- c(6.82, -9, 5.3, 4, -7.66)
     x4_target_data0 <- c(0L, 1L, 1L, 2L, 0L)
     f4 <- factor(x3, levels=x4_target_names0, exclude=NULL)
     .do_exclude_tests(f4, "f4", x4_target_names0, x4_target_data0, 
excludes)

     ## Test on a character vector.
     c0 <- character(0)
     .do_exclude_tests(c0, "c0", character(0), integer(0),
list("Aa"))

     c1 <- c("b", "AA", "", "a",
"ab", "NaN", "4", "Aa", NA,
"NaN",
"ab", NA)
     c1_target_names0 <- sort(unique(c1), na.last=TRUE)
     c1_target_data0 <- c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L)
     excludes <- list(c("Aa", 4, ""),
                      c("Aa", NA, 4, "", "Z"),
                      c("Aa", NaN, 4, "", "Z"),
                      c("Aa", 4, "", "Z"))
     .do_exclude_tests(c1, "c1", c1_target_names0, c1_target_data0, 
excludes)

     c2 <- c("b", "AA", "", "a",
"ab", "", "", "4", "Aa",
"ab")
     c2_target_names0 <- sort(unique(c2), na.last=TRUE)
     c2_target_data0 <- c(3L, 1L, 1L, 1L, 1L, 2L, 1L)
     .do_exclude_tests(c2, "c2", c2_target_names0, c2_target_data0, 
excludes)

     ## Test on a logical vector.
     l0 <- logical(0)
     .do_exclude_tests(l0, "l0", character(0), integer(0),
list(c("Aa",
TRUE)))

     l1 <- c(FALSE, FALSE, NA, TRUE, FALSE, FALSE, NA, NA, TRUE)
     l1_target_names0 <- c(FALSE, TRUE, NA)
     l1_target_data0 <- c(4L, 2L, 3L)
     excludes <- list(c(TRUE, FALSE),
                      c("Aa", NA, TRUE),
                      c("Aa", NaN, TRUE),
                      l1_target_names0)
     .do_exclude_tests(l1, "l1", l1_target_names0, l1_target_data0, 
excludes)

     l2 <- c(FALSE, FALSE, TRUE, FALSE, FALSE, TRUE)
     l2_target_names0 <- c(FALSE, TRUE)
     l2_target_data0 <- c(4L, 2L)
     .do_exclude_tests(l2, "l2", l2_target_names0, l2_target_data0, 
excludes)
   }

   test_table("table")    # will display some issues
   test_table("table1D")  # should not display anything

> sessionInfo()R version 3.0.1 (2013-05-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
  [7] LC_PAPER=C                 LC_NAME=C
  [9] LC_ADDRESS=C               LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base

loaded via a namespace (and not attached):
[1] tools_3.0.1

-- 
Herv? Pag?s

Program in Computational Biology
Division of Public Health Sciences
Fred Hutchinson Cancer Research Center
1100 Fairview Ave. N, M1-B514
P.O. Box 19024
Seattle, WA 98109-1024

E-mail: hpages at fhcrc.org
Phone:  (206) 667-5791
Fax:    (206) 667-1319

Hervé Pagès

2013-Sep-16 19:55 UTC

head link

[Rd] a fast table() for the 1D case

Any chance some improvements can be made on table()?

table() is probably one of the most used R functions when working
interactively. Unfortunately it can be incredibly slow, especially
on a logical vector where a simple sum() is hundred times faster
(I actually got into the habit of using sum() instead of table()).
The table1D() proposal below doesn't go as far as using sum() on a
logical vector but it already provides significant speedups for most
use cases.

Thanks,
H.


On 08/09/2013 01:19 AM, Herv? Pag?s wrote:> Hi,
>
> table1D() below can be up to 60x faster than base::table() for the 1D
> case. Here are the detailed speedups compared to base::table().
>
>    o With a logical vector of length 5M:     11x faster
>                                      (or more if
'useNA="always"')
>
>    o With factor/integer/numeric/character of length 1M and 9 levels
>      (or 9 distinct values for non-factors):
>       - factor:                              60x faster
>       - integer/numeric vector:              12x faster
>       - character vector:                   2.4x faster
>
>    o With factor/integer/numeric/character of length 1M and no
>      duplicates:
>        - factor:                              5x faster
>        - integer vector:                      2x faster
>        - numeric vector:                    1.7x faster
>        - character vector:       no significant speedup
>
> Would be great if this improvement could make it into base::table().
>
> Thanks,
> H.
>
>    ## A fast table() implementation for the 1D case (replacing the
'...'
>    ## arg with 'x' and omitting the 'dnn' and
'deparse.level' arguments
>    ## which are unrelated to performance).
>
>    table1D <- function(x, exclude = if (useNA == "no") c(NA,
NaN),
>                        useNA = c("no", "ifany",
"always"))
>    {
>      if (!missing(exclude) && is.null(exclude)) {
>          useNA <- "always"
>      } else {
>          useNA <- match.arg(useNA)
>      }
>      if (useNA == "always" && !missing(exclude))
>          exclude <- setdiff(exclude, NA)
>      if (is.factor(x)) {
>          x2 <- levels(x)
>          append_NA <- (useNA == "always" ||
>                        useNA == "ifany" && any(is.na(x)))
&&
>                       !any(is.na(x2))
>          if (append_NA) {
>              x2 <- c(x2, NA)
>              x <- factor(x, levels=x2, exclude=NULL)
>          }
>          t2 <- tabulate(x, nbins=length(x2))
>          if (!is.null(exclude)) {
>              keep_idx <- which(!(x2 %in% exclude))
>              x2 <- x2[keep_idx]
>              t2 <- t2[keep_idx]
>          }
>      } else {
>          xx <- match(x, x)
>          t <- tabulate(xx, nbins=length(xx))
>          keep_idx <- which(t != 0L)
>          x2 <- x[keep_idx]
>          t2 <- t[keep_idx]
>          if (!is.null(exclude)) {
>              exclude <- as.vector(exclude, typeof(x))
>              keep_idx <- which(!(x2 %in% exclude))
>              x2 <- x2[keep_idx]
>              t2 <- t2[keep_idx]
>          }
>          oo <- order(x2)
>          x2 <- x2[oo]
>          t2 <- t2[oo]
>          append_NA <- useNA == "always" &&
!any(is.na(x2))
>          if (append_NA) {
>              x2 <- c(x2, NA)
>              t2 <- c(t2, 0L)
>          }
>      }
>      ans <- array(t2)
>      dimnames(ans) <- list(as.character(x2))
>      names(dimnames(ans)) <- "x"  # always set to 'x'
>      class(ans) <- "table"
>      ans
>    }
>
> table1D() also fixes some issues with base::table() that can be exposed
> by running the tests below.
>
>    test_table <- function(FUN_NAME)
>    {
>      FUN <- match.fun(FUN_NAME)
>
>      .make_target <- function(target_names, target_data)
>      {
>          ans <- array(target_data)
>          dimnames(ans) <- list(as.character(target_names))
>          names(dimnames(ans)) <- "x"
>          class(ans) <- "table"
>          ans
>      }
>
>      .check_identical <- function(target, current, varname, extra_args)
>      {
>          if (identical(target, current))
>              return()
>          if (extra_args != "")
>              extra_args <- paste0(", ", extra_args)
>          cat("unexpected result for '", FUN_NAME,
>              "(x=", varname, extra_args, ")'\n",
sep="")
>      }
>
>      .test_exclude <- function(x, varname, target_names0, target_data0,
> exclude)
>      {
>          extra_args <- paste0("exclude=", deparse(exclude))
>          current <- FUN(x=x, exclude=exclude)
>          target_names <- target_names0
>          target_data <- target_data0
>          if (is.null(exclude)) {
>              if (!any(is.na(target_names))) {
>                  target_names <- c(target_names, NA)
>                  target_data <- c(target_data, 0L)
>              }
>          } else {
>              if (!is.factor(x)) {
>                  exclude <- as.vector(exclude, typeof(x))
>              } else if (!any(is.na(levels(x)))) {
>                  exclude <- union(exclude, NA)
>              }
>              exclude_idx <- match(exclude, target_names, nomatch=0L)
>              if (any(exclude_idx != 0L)) {
>                  target_names <- target_names[-exclude_idx]
>                  target_data <- target_data[-exclude_idx]
>              }
>          }
>          target <- .make_target(target_names, target_data)
>          .check_identical(target, current, varname, extra_args)
>      }
>
>      .do_exclude_tests <- function(x, varname, target_names0,
target_data0,
>                                    more_excludes=NULL)
>      {
>          .BASIC_EXCLUDES <- list(c(NA, NaN), NULL, numeric(0), NA, NaN)
>          excludes <- c(.BASIC_EXCLUDES, more_excludes)
>          for (exclude in excludes)
>              .test_exclude(x, varname, target_names0, target_data0,
> exclude)
>      }
>
>      ## Test on a numeric vector.
>      x0 <- numeric(0)
>      .do_exclude_tests(x0, "x0", character(0), integer(0),
list(5.3))
>
>      x1_target_names0 <- c(-9, 4, 5.3, NaN, NA)
>      x1_target_data0 <- c(1L, 2L, 1L, 2L, 3L)
>      x1 <- c(5.3, 4, NaN, 4, NA, NA, NaN, -9, NA)
>      excludes <- list(c(5.3, -9),
>                       c(5.3, NA, -9),
>                       c(5.3, NaN, -9),
>                       c(5.3, 80, -9),
>                       x1_target_names0)
>      .do_exclude_tests(x1, "x1", x1_target_names0,
x1_target_data0,
> excludes)
>
>      x2_target_names0 <- c(-9, 4, 5.3, NA, NaN)
>      x2_target_data0 <- c(1L, 2L, 1L, 3L, 2L)
>      x2 <- rev(x1)
>      .do_exclude_tests(x2, "x2", x2_target_names0,
x2_target_data0,
> excludes)
>
>      x3_target_names0 <- c(-9, 4, 5.3)
>      x3_target_data0 <- c(1L, 2L, 1L)
>      x3 <- c(5.3, 4, 4, -9)
>      .do_exclude_tests(x3, "x3", x3_target_names0,
x3_target_data0,
> excludes)
>
>      ## Test on a factor.
>      f0 <- factor()
>      .do_exclude_tests(f0, "f0", character(0), integer(0),
list(5.3))
>
>      f1 <- factor(x1)
>      .do_exclude_tests(f1, "f1", x1_target_names0,
x1_target_data0,
> excludes)
>
>      f2 <- factor(x1, exclude=NULL)
>      .do_exclude_tests(f2, "f2", x1_target_names0,
x1_target_data0,
> excludes)
>
>      f3_target_names0 <- c(6.82, x1_target_names0, -7.66)
>      f3_target_data0 <- c(0L, 1L, 2L, 1L, 0L, 0L, 0L)
>      f3 <- factor(x3, levels=f3_target_names0, exclude=NULL)
>      .do_exclude_tests(f3, "f3", f3_target_names0,
f3_target_data0,
> excludes)
>
>      x4_target_names0 <- c(6.82, -9, 5.3, 4, -7.66)
>      x4_target_data0 <- c(0L, 1L, 1L, 2L, 0L)
>      f4 <- factor(x3, levels=x4_target_names0, exclude=NULL)
>      .do_exclude_tests(f4, "f4", x4_target_names0,
x4_target_data0,
> excludes)
>
>      ## Test on a character vector.
>      c0 <- character(0)
>      .do_exclude_tests(c0, "c0", character(0), integer(0),
list("Aa"))
>
>      c1 <- c("b", "AA", "", "a",
"ab", "NaN", "4", "Aa", NA,
"NaN",
> "ab", NA)
>      c1_target_names0 <- sort(unique(c1), na.last=TRUE)
>      c1_target_data0 <- c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L)
>      excludes <- list(c("Aa", 4, ""),
>                       c("Aa", NA, 4, "",
"Z"),
>                       c("Aa", NaN, 4, "",
"Z"),
>                       c("Aa", 4, "", "Z"))
>      .do_exclude_tests(c1, "c1", c1_target_names0,
c1_target_data0,
> excludes)
>
>      c2 <- c("b", "AA", "", "a",
"ab", "", "", "4", "Aa",
"ab")
>      c2_target_names0 <- sort(unique(c2), na.last=TRUE)
>      c2_target_data0 <- c(3L, 1L, 1L, 1L, 1L, 2L, 1L)
>      .do_exclude_tests(c2, "c2", c2_target_names0,
c2_target_data0,
> excludes)
>
>      ## Test on a logical vector.
>      l0 <- logical(0)
>      .do_exclude_tests(l0, "l0", character(0), integer(0),
list(c("Aa",
> TRUE)))
>
>      l1 <- c(FALSE, FALSE, NA, TRUE, FALSE, FALSE, NA, NA, TRUE)
>      l1_target_names0 <- c(FALSE, TRUE, NA)
>      l1_target_data0 <- c(4L, 2L, 3L)
>      excludes <- list(c(TRUE, FALSE),
>                       c("Aa", NA, TRUE),
>                       c("Aa", NaN, TRUE),
>                       l1_target_names0)
>      .do_exclude_tests(l1, "l1", l1_target_names0,
l1_target_data0,
> excludes)
>
>      l2 <- c(FALSE, FALSE, TRUE, FALSE, FALSE, TRUE)
>      l2_target_names0 <- c(FALSE, TRUE)
>      l2_target_data0 <- c(4L, 2L)
>      .do_exclude_tests(l2, "l2", l2_target_names0,
l2_target_data0,
> excludes)
>    }
>
>    test_table("table")    # will display some issues
>    test_table("table1D")  # should not display anything
>
>
>> sessionInfo()
> R version 3.0.1 (2013-05-16)
> Platform: x86_64-unknown-linux-gnu (64-bit)
>
> locale:
>   [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
>   [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
>   [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
>   [7] LC_PAPER=C                 LC_NAME=C
>   [9] LC_ADDRESS=C               LC_TELEPHONE=C
> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
>
> attached base packages:
> [1] stats     graphics  grDevices utils     datasets  methods   base
>
> loaded via a namespace (and not attached):
> [1] tools_3.0.1
>
-- 
Herv? Pag?s

Program in Computational Biology
Division of Public Health Sciences
Fred Hutchinson Cancer Research Center
1100 Fairview Ave. N, M1-B514
P.O. Box 19024
Seattle, WA 98109-1024

E-mail: hpages at fhcrc.org
Phone:  (206) 667-5791
Fax:    (206) 667-1319

Maybe Matching Threads

[LLVMdev] Improving performance with optimization passes

R devel - Aug 2013 - a fast table() for the 1D case

[Rd] a fast table() for the 1D case

[Rd] a fast table() for the 1D case

Maybe Matching Threads