Hi,
table1D() below can be up to 60x faster than base::table() for the 1D
case. Here are the detailed speedups compared to base::table().
o With a logical vector of length 5M: 11x faster
(or more if
'useNA="always"')
o With factor/integer/numeric/character of length 1M and 9 levels
(or 9 distinct values for non-factors):
- factor: 60x faster
- integer/numeric vector: 12x faster
- character vector: 2.4x faster
o With factor/integer/numeric/character of length 1M and no
duplicates:
- factor: 5x faster
- integer vector: 2x faster
- numeric vector: 1.7x faster
- character vector: no significant speedup
Would be great if this improvement could make it into base::table().
Thanks,
H.
## A fast table() implementation for the 1D case (replacing the '...'
## arg with 'x' and omitting the 'dnn' and
'deparse.level' arguments
## which are unrelated to performance).
table1D <- function(x, exclude = if (useNA == "no") c(NA, NaN),
useNA = c("no", "ifany",
"always"))
{
if (!missing(exclude) && is.null(exclude)) {
useNA <- "always"
} else {
useNA <- match.arg(useNA)
}
if (useNA == "always" && !missing(exclude))
exclude <- setdiff(exclude, NA)
if (is.factor(x)) {
x2 <- levels(x)
append_NA <- (useNA == "always" ||
useNA == "ifany" && any(is.na(x)))
&&
!any(is.na(x2))
if (append_NA) {
x2 <- c(x2, NA)
x <- factor(x, levels=x2, exclude=NULL)
}
t2 <- tabulate(x, nbins=length(x2))
if (!is.null(exclude)) {
keep_idx <- which(!(x2 %in% exclude))
x2 <- x2[keep_idx]
t2 <- t2[keep_idx]
}
} else {
xx <- match(x, x)
t <- tabulate(xx, nbins=length(xx))
keep_idx <- which(t != 0L)
x2 <- x[keep_idx]
t2 <- t[keep_idx]
if (!is.null(exclude)) {
exclude <- as.vector(exclude, typeof(x))
keep_idx <- which(!(x2 %in% exclude))
x2 <- x2[keep_idx]
t2 <- t2[keep_idx]
}
oo <- order(x2)
x2 <- x2[oo]
t2 <- t2[oo]
append_NA <- useNA == "always" && !any(is.na(x2))
if (append_NA) {
x2 <- c(x2, NA)
t2 <- c(t2, 0L)
}
}
ans <- array(t2)
dimnames(ans) <- list(as.character(x2))
names(dimnames(ans)) <- "x" # always set to 'x'
class(ans) <- "table"
ans
}
table1D() also fixes some issues with base::table() that can be exposed
by running the tests below.
test_table <- function(FUN_NAME)
{
FUN <- match.fun(FUN_NAME)
.make_target <- function(target_names, target_data)
{
ans <- array(target_data)
dimnames(ans) <- list(as.character(target_names))
names(dimnames(ans)) <- "x"
class(ans) <- "table"
ans
}
.check_identical <- function(target, current, varname, extra_args)
{
if (identical(target, current))
return()
if (extra_args != "")
extra_args <- paste0(", ", extra_args)
cat("unexpected result for '", FUN_NAME,
"(x=", varname, extra_args, ")'\n",
sep="")
}
.test_exclude <- function(x, varname, target_names0, target_data0,
exclude)
{
extra_args <- paste0("exclude=", deparse(exclude))
current <- FUN(x=x, exclude=exclude)
target_names <- target_names0
target_data <- target_data0
if (is.null(exclude)) {
if (!any(is.na(target_names))) {
target_names <- c(target_names, NA)
target_data <- c(target_data, 0L)
}
} else {
if (!is.factor(x)) {
exclude <- as.vector(exclude, typeof(x))
} else if (!any(is.na(levels(x)))) {
exclude <- union(exclude, NA)
}
exclude_idx <- match(exclude, target_names, nomatch=0L)
if (any(exclude_idx != 0L)) {
target_names <- target_names[-exclude_idx]
target_data <- target_data[-exclude_idx]
}
}
target <- .make_target(target_names, target_data)
.check_identical(target, current, varname, extra_args)
}
.do_exclude_tests <- function(x, varname, target_names0, target_data0,
more_excludes=NULL)
{
.BASIC_EXCLUDES <- list(c(NA, NaN), NULL, numeric(0), NA, NaN)
excludes <- c(.BASIC_EXCLUDES, more_excludes)
for (exclude in excludes)
.test_exclude(x, varname, target_names0, target_data0, exclude)
}
## Test on a numeric vector.
x0 <- numeric(0)
.do_exclude_tests(x0, "x0", character(0), integer(0), list(5.3))
x1_target_names0 <- c(-9, 4, 5.3, NaN, NA)
x1_target_data0 <- c(1L, 2L, 1L, 2L, 3L)
x1 <- c(5.3, 4, NaN, 4, NA, NA, NaN, -9, NA)
excludes <- list(c(5.3, -9),
c(5.3, NA, -9),
c(5.3, NaN, -9),
c(5.3, 80, -9),
x1_target_names0)
.do_exclude_tests(x1, "x1", x1_target_names0, x1_target_data0,
excludes)
x2_target_names0 <- c(-9, 4, 5.3, NA, NaN)
x2_target_data0 <- c(1L, 2L, 1L, 3L, 2L)
x2 <- rev(x1)
.do_exclude_tests(x2, "x2", x2_target_names0, x2_target_data0,
excludes)
x3_target_names0 <- c(-9, 4, 5.3)
x3_target_data0 <- c(1L, 2L, 1L)
x3 <- c(5.3, 4, 4, -9)
.do_exclude_tests(x3, "x3", x3_target_names0, x3_target_data0,
excludes)
## Test on a factor.
f0 <- factor()
.do_exclude_tests(f0, "f0", character(0), integer(0), list(5.3))
f1 <- factor(x1)
.do_exclude_tests(f1, "f1", x1_target_names0, x1_target_data0,
excludes)
f2 <- factor(x1, exclude=NULL)
.do_exclude_tests(f2, "f2", x1_target_names0, x1_target_data0,
excludes)
f3_target_names0 <- c(6.82, x1_target_names0, -7.66)
f3_target_data0 <- c(0L, 1L, 2L, 1L, 0L, 0L, 0L)
f3 <- factor(x3, levels=f3_target_names0, exclude=NULL)
.do_exclude_tests(f3, "f3", f3_target_names0, f3_target_data0,
excludes)
x4_target_names0 <- c(6.82, -9, 5.3, 4, -7.66)
x4_target_data0 <- c(0L, 1L, 1L, 2L, 0L)
f4 <- factor(x3, levels=x4_target_names0, exclude=NULL)
.do_exclude_tests(f4, "f4", x4_target_names0, x4_target_data0,
excludes)
## Test on a character vector.
c0 <- character(0)
.do_exclude_tests(c0, "c0", character(0), integer(0),
list("Aa"))
c1 <- c("b", "AA", "", "a",
"ab", "NaN", "4", "Aa", NA,
"NaN",
"ab", NA)
c1_target_names0 <- sort(unique(c1), na.last=TRUE)
c1_target_data0 <- c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L)
excludes <- list(c("Aa", 4, ""),
c("Aa", NA, 4, "", "Z"),
c("Aa", NaN, 4, "", "Z"),
c("Aa", 4, "", "Z"))
.do_exclude_tests(c1, "c1", c1_target_names0, c1_target_data0,
excludes)
c2 <- c("b", "AA", "", "a",
"ab", "", "", "4", "Aa",
"ab")
c2_target_names0 <- sort(unique(c2), na.last=TRUE)
c2_target_data0 <- c(3L, 1L, 1L, 1L, 1L, 2L, 1L)
.do_exclude_tests(c2, "c2", c2_target_names0, c2_target_data0,
excludes)
## Test on a logical vector.
l0 <- logical(0)
.do_exclude_tests(l0, "l0", character(0), integer(0),
list(c("Aa",
TRUE)))
l1 <- c(FALSE, FALSE, NA, TRUE, FALSE, FALSE, NA, NA, TRUE)
l1_target_names0 <- c(FALSE, TRUE, NA)
l1_target_data0 <- c(4L, 2L, 3L)
excludes <- list(c(TRUE, FALSE),
c("Aa", NA, TRUE),
c("Aa", NaN, TRUE),
l1_target_names0)
.do_exclude_tests(l1, "l1", l1_target_names0, l1_target_data0,
excludes)
l2 <- c(FALSE, FALSE, TRUE, FALSE, FALSE, TRUE)
l2_target_names0 <- c(FALSE, TRUE)
l2_target_data0 <- c(4L, 2L)
.do_exclude_tests(l2, "l2", l2_target_names0, l2_target_data0,
excludes)
}
test_table("table") # will display some issues
test_table("table1D") # should not display anything
> sessionInfo()
R version 3.0.1 (2013-05-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=C LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] tools_3.0.1
--
Herv? Pag?s
Program in Computational Biology
Division of Public Health Sciences
Fred Hutchinson Cancer Research Center
1100 Fairview Ave. N, M1-B514
P.O. Box 19024
Seattle, WA 98109-1024
E-mail: hpages at fhcrc.org
Phone: (206) 667-5791
Fax: (206) 667-1319
Any chance some improvements can be made on table()? table() is probably one of the most used R functions when working interactively. Unfortunately it can be incredibly slow, especially on a logical vector where a simple sum() is hundred times faster (I actually got into the habit of using sum() instead of table()). The table1D() proposal below doesn't go as far as using sum() on a logical vector but it already provides significant speedups for most use cases. Thanks, H. On 08/09/2013 01:19 AM, Herv? Pag?s wrote:> Hi, > > table1D() below can be up to 60x faster than base::table() for the 1D > case. Here are the detailed speedups compared to base::table(). > > o With a logical vector of length 5M: 11x faster > (or more if 'useNA="always"') > > o With factor/integer/numeric/character of length 1M and 9 levels > (or 9 distinct values for non-factors): > - factor: 60x faster > - integer/numeric vector: 12x faster > - character vector: 2.4x faster > > o With factor/integer/numeric/character of length 1M and no > duplicates: > - factor: 5x faster > - integer vector: 2x faster > - numeric vector: 1.7x faster > - character vector: no significant speedup > > Would be great if this improvement could make it into base::table(). > > Thanks, > H. > > ## A fast table() implementation for the 1D case (replacing the '...' > ## arg with 'x' and omitting the 'dnn' and 'deparse.level' arguments > ## which are unrelated to performance). > > table1D <- function(x, exclude = if (useNA == "no") c(NA, NaN), > useNA = c("no", "ifany", "always")) > { > if (!missing(exclude) && is.null(exclude)) { > useNA <- "always" > } else { > useNA <- match.arg(useNA) > } > if (useNA == "always" && !missing(exclude)) > exclude <- setdiff(exclude, NA) > if (is.factor(x)) { > x2 <- levels(x) > append_NA <- (useNA == "always" || > useNA == "ifany" && any(is.na(x))) && > !any(is.na(x2)) > if (append_NA) { > x2 <- c(x2, NA) > x <- factor(x, levels=x2, exclude=NULL) > } > t2 <- tabulate(x, nbins=length(x2)) > if (!is.null(exclude)) { > keep_idx <- which(!(x2 %in% exclude)) > x2 <- x2[keep_idx] > t2 <- t2[keep_idx] > } > } else { > xx <- match(x, x) > t <- tabulate(xx, nbins=length(xx)) > keep_idx <- which(t != 0L) > x2 <- x[keep_idx] > t2 <- t[keep_idx] > if (!is.null(exclude)) { > exclude <- as.vector(exclude, typeof(x)) > keep_idx <- which(!(x2 %in% exclude)) > x2 <- x2[keep_idx] > t2 <- t2[keep_idx] > } > oo <- order(x2) > x2 <- x2[oo] > t2 <- t2[oo] > append_NA <- useNA == "always" && !any(is.na(x2)) > if (append_NA) { > x2 <- c(x2, NA) > t2 <- c(t2, 0L) > } > } > ans <- array(t2) > dimnames(ans) <- list(as.character(x2)) > names(dimnames(ans)) <- "x" # always set to 'x' > class(ans) <- "table" > ans > } > > table1D() also fixes some issues with base::table() that can be exposed > by running the tests below. > > test_table <- function(FUN_NAME) > { > FUN <- match.fun(FUN_NAME) > > .make_target <- function(target_names, target_data) > { > ans <- array(target_data) > dimnames(ans) <- list(as.character(target_names)) > names(dimnames(ans)) <- "x" > class(ans) <- "table" > ans > } > > .check_identical <- function(target, current, varname, extra_args) > { > if (identical(target, current)) > return() > if (extra_args != "") > extra_args <- paste0(", ", extra_args) > cat("unexpected result for '", FUN_NAME, > "(x=", varname, extra_args, ")'\n", sep="") > } > > .test_exclude <- function(x, varname, target_names0, target_data0, > exclude) > { > extra_args <- paste0("exclude=", deparse(exclude)) > current <- FUN(x=x, exclude=exclude) > target_names <- target_names0 > target_data <- target_data0 > if (is.null(exclude)) { > if (!any(is.na(target_names))) { > target_names <- c(target_names, NA) > target_data <- c(target_data, 0L) > } > } else { > if (!is.factor(x)) { > exclude <- as.vector(exclude, typeof(x)) > } else if (!any(is.na(levels(x)))) { > exclude <- union(exclude, NA) > } > exclude_idx <- match(exclude, target_names, nomatch=0L) > if (any(exclude_idx != 0L)) { > target_names <- target_names[-exclude_idx] > target_data <- target_data[-exclude_idx] > } > } > target <- .make_target(target_names, target_data) > .check_identical(target, current, varname, extra_args) > } > > .do_exclude_tests <- function(x, varname, target_names0, target_data0, > more_excludes=NULL) > { > .BASIC_EXCLUDES <- list(c(NA, NaN), NULL, numeric(0), NA, NaN) > excludes <- c(.BASIC_EXCLUDES, more_excludes) > for (exclude in excludes) > .test_exclude(x, varname, target_names0, target_data0, > exclude) > } > > ## Test on a numeric vector. > x0 <- numeric(0) > .do_exclude_tests(x0, "x0", character(0), integer(0), list(5.3)) > > x1_target_names0 <- c(-9, 4, 5.3, NaN, NA) > x1_target_data0 <- c(1L, 2L, 1L, 2L, 3L) > x1 <- c(5.3, 4, NaN, 4, NA, NA, NaN, -9, NA) > excludes <- list(c(5.3, -9), > c(5.3, NA, -9), > c(5.3, NaN, -9), > c(5.3, 80, -9), > x1_target_names0) > .do_exclude_tests(x1, "x1", x1_target_names0, x1_target_data0, > excludes) > > x2_target_names0 <- c(-9, 4, 5.3, NA, NaN) > x2_target_data0 <- c(1L, 2L, 1L, 3L, 2L) > x2 <- rev(x1) > .do_exclude_tests(x2, "x2", x2_target_names0, x2_target_data0, > excludes) > > x3_target_names0 <- c(-9, 4, 5.3) > x3_target_data0 <- c(1L, 2L, 1L) > x3 <- c(5.3, 4, 4, -9) > .do_exclude_tests(x3, "x3", x3_target_names0, x3_target_data0, > excludes) > > ## Test on a factor. > f0 <- factor() > .do_exclude_tests(f0, "f0", character(0), integer(0), list(5.3)) > > f1 <- factor(x1) > .do_exclude_tests(f1, "f1", x1_target_names0, x1_target_data0, > excludes) > > f2 <- factor(x1, exclude=NULL) > .do_exclude_tests(f2, "f2", x1_target_names0, x1_target_data0, > excludes) > > f3_target_names0 <- c(6.82, x1_target_names0, -7.66) > f3_target_data0 <- c(0L, 1L, 2L, 1L, 0L, 0L, 0L) > f3 <- factor(x3, levels=f3_target_names0, exclude=NULL) > .do_exclude_tests(f3, "f3", f3_target_names0, f3_target_data0, > excludes) > > x4_target_names0 <- c(6.82, -9, 5.3, 4, -7.66) > x4_target_data0 <- c(0L, 1L, 1L, 2L, 0L) > f4 <- factor(x3, levels=x4_target_names0, exclude=NULL) > .do_exclude_tests(f4, "f4", x4_target_names0, x4_target_data0, > excludes) > > ## Test on a character vector. > c0 <- character(0) > .do_exclude_tests(c0, "c0", character(0), integer(0), list("Aa")) > > c1 <- c("b", "AA", "", "a", "ab", "NaN", "4", "Aa", NA, "NaN", > "ab", NA) > c1_target_names0 <- sort(unique(c1), na.last=TRUE) > c1_target_data0 <- c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L) > excludes <- list(c("Aa", 4, ""), > c("Aa", NA, 4, "", "Z"), > c("Aa", NaN, 4, "", "Z"), > c("Aa", 4, "", "Z")) > .do_exclude_tests(c1, "c1", c1_target_names0, c1_target_data0, > excludes) > > c2 <- c("b", "AA", "", "a", "ab", "", "", "4", "Aa", "ab") > c2_target_names0 <- sort(unique(c2), na.last=TRUE) > c2_target_data0 <- c(3L, 1L, 1L, 1L, 1L, 2L, 1L) > .do_exclude_tests(c2, "c2", c2_target_names0, c2_target_data0, > excludes) > > ## Test on a logical vector. > l0 <- logical(0) > .do_exclude_tests(l0, "l0", character(0), integer(0), list(c("Aa", > TRUE))) > > l1 <- c(FALSE, FALSE, NA, TRUE, FALSE, FALSE, NA, NA, TRUE) > l1_target_names0 <- c(FALSE, TRUE, NA) > l1_target_data0 <- c(4L, 2L, 3L) > excludes <- list(c(TRUE, FALSE), > c("Aa", NA, TRUE), > c("Aa", NaN, TRUE), > l1_target_names0) > .do_exclude_tests(l1, "l1", l1_target_names0, l1_target_data0, > excludes) > > l2 <- c(FALSE, FALSE, TRUE, FALSE, FALSE, TRUE) > l2_target_names0 <- c(FALSE, TRUE) > l2_target_data0 <- c(4L, 2L) > .do_exclude_tests(l2, "l2", l2_target_names0, l2_target_data0, > excludes) > } > > test_table("table") # will display some issues > test_table("table1D") # should not display anything > > >> sessionInfo() > R version 3.0.1 (2013-05-16) > Platform: x86_64-unknown-linux-gnu (64-bit) > > locale: > [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C > [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 > [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 > [7] LC_PAPER=C LC_NAME=C > [9] LC_ADDRESS=C LC_TELEPHONE=C > [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C > > attached base packages: > [1] stats graphics grDevices utils datasets methods base > > loaded via a namespace (and not attached): > [1] tools_3.0.1 >-- Herv? Pag?s Program in Computational Biology Division of Public Health Sciences Fred Hutchinson Cancer Research Center 1100 Fairview Ave. N, M1-B514 P.O. Box 19024 Seattle, WA 98109-1024 E-mail: hpages at fhcrc.org Phone: (206) 667-5791 Fax: (206) 667-1319