thr3ads.net - R devel - [Rd] [.data.frame speedup [Jul 2008]

If this information is useful, please help other people find it:
Share via:

Tim Hesterberg

2008-Jul-01 18:20 UTC

[Rd] [.data.frame speedup

Below is a version of [.data.frame that is faster
for subscripting rows of large data frames; it avoids calling
    duplicated(rows)
if there is no need to check for duplicate row names, when:
    i is logical
    attr(x, "dup.row.names") is not NULL (S+ compatibility)
    i is numeric and negative
    i is strictly increasing


"[.data.frame" <-
function (x, i, j,
          drop = if (missing(i)) TRUE else length(cols) == 1)
{
  # This version of [.data.frame avoid wasting time enforcing unique
  # row names.
  mdrop <- missing(drop)
  Narg <- nargs() - (!mdrop)
  if (Narg < 3) {
    if (!mdrop)
      warning("drop argument will be ignored")
    if (missing(i))
      return(x)
    if (is.matrix(i))
      return(as.matrix(x)[i])
    y <- NextMethod("[")
    cols <- names(y)
    if (!is.null(cols) && any(is.na(cols)))
      stop("undefined columns selected")
    if (any(duplicated(cols)))
      names(y) <- make.unique(cols)
    return(structure(y, class = oldClass(x),
                     row.names = .row_names_info(x, 0L)))
  }
  if (missing(i)) {
    if (missing(j) && drop && length(x) == 1L)
      return(.subset2(x, 1L))
    y <- if (missing(j))
      x
    else .subset(x, j)
    if (drop && length(y) == 1L)
      return(.subset2(y, 1L))
    cols <- names(y)
    if (any(is.na(cols)))
      stop("undefined columns selected")
    if (any(duplicated(cols)))
      names(y) <- make.unique(cols)
    nrow <- .row_names_info(x, 2L)
    if (drop && !mdrop && nrow == 1L)
      return(structure(y, class = NULL, row.names = NULL))
    else return(structure(y, class = oldClass(x),
                          row.names = .row_names_info(x, 0L)))
  }
  xx <- x
  cols <- names(xx)
  x <- vector("list", length(x))
  x <- .Call("R_copyDFattr", xx, x, PACKAGE = "base")
  oldClass(x) <- attr(x, "row.names") <- NULL
  # Do not want to check for duplicates if don't need to
  noDuplicateRowNames <- (is.logical(i) ||
                          (!is.null(attr(x, "dup.row.names"))) ||
                          (is.numeric(i) && min(i, 0, na.rm=TRUE) <
0) ||
                          (!notSorted(i, strict = TRUE)))
  if (!missing(j)) {
    x <- x[j]
    cols <- names(x)
    if (drop && length(x) == 1L) {
      if (is.character(i)) {
        rows <- attr(xx, "row.names")
        i <- pmatch(i, rows, duplicates.ok = TRUE)
      }
      xj <- .subset2(.subset(xx, j), 1L)
      return(if (length(dim(xj)) != 2L) xj[i] else xj[i,
                         , drop = FALSE])
    }
    if (any(is.na(cols)))
      stop("undefined columns selected")
    nxx <- structure(seq_along(xx), names = names(xx))
    sxx <- match(nxx[j], seq_along(xx))
  }
  else sxx <- seq_along(x)
  rows <- NULL
  if (is.character(i)) {
    rows <- attr(xx, "row.names")
    i <- pmatch(i, rows, duplicates.ok = TRUE)
  }
  for (j in seq_along(x)) {
    xj <- xx[[sxx[j]]]
    x[[j]] <- if (length(dim(xj)) != 2L)
      xj[i]
    else xj[i, , drop = FALSE]
  }
  if (drop) {
    n <- length(x)
    if (n == 1L)
      return(x[[1L]])
    if (n > 1L) {
      xj <- x[[1L]]
      nrow <- if (length(dim(xj)) == 2L)
        dim(xj)[1L]
      else length(xj)
      drop <- !mdrop && nrow == 1L
    }
    else drop <- FALSE
  }
  if (!drop) {
    if (is.null(rows))
      rows <- attr(xx, "row.names")
    rows <- rows[i]
    if ((ina <- any(is.na(rows))) | (dup <- !noDuplicateRowNames
&&
any(duplicated(rows)))) {
      if (ina)
        rows[is.na(rows)] <- "NA"
      if (dup)
        rows <- make.unique(as.character(rows))
    }
    if (any(duplicated(nm <- names(x))))
      names(x) <- make.unique(nm)
    if (is.null(rows))
      rows <- attr(xx, "row.names")[i]
    attr(x, "row.names") <- rows
    oldClass(x) <- oldClass(xx)
  }
  x
}

	[[alternative HTML version deleted]]

Tim Hesterberg

2008-Jul-01 22:23 UTC

head link

[Rd] [.data.frame speedup

There is a bug in the standard version of [.data.frame;
it mixes up handling duplicates and NAs when subscripting rows.
  x <- data.frame(x=1:3, y=2:4,
row.names=c("a","b","NA"))
  y <- x[c(2:3, NA),]
  y
It creates a data frame with duplicate rows, but won't print.

In the previous message I included a version of [.data.frame;
it fails for the same example, for a different reason.  Here
is a fix.


"subscript.data.frame" <-
function (x, i, j,
          drop = if (missing(i)) TRUE else length(cols) == 1)
{
  # This version of [.data.frame avoid wasting time enforcing unique
  # row names if possible.
  mdrop <- missing(drop)
  Narg <- nargs() - (!mdrop)
  if (Narg < 3) {
    if (!mdrop)
      warning("drop argument will be ignored")
    if (missing(i))
      return(x)
    if (is.matrix(i))
      return(as.matrix(x)[i])
    y <- NextMethod("[")
    cols <- names(y)
    if (!is.null(cols) && any(is.na(cols)))
      stop("undefined columns selected")
    if (any(duplicated(cols)))
      names(y) <- make.unique(cols)
    return(structure(y, class = oldClass(x),
                     row.names = .row_names_info(x, 0L)))
  }
  if (missing(i)) {
    if (missing(j) && drop && length(x) == 1L)
      return(.subset2(x, 1L))
    y <- if (missing(j))
      x
    else .subset(x, j)
    if (drop && length(y) == 1L)
      return(.subset2(y, 1L))
    cols <- names(y)
    if (any(is.na(cols)))
      stop("undefined columns selected")
    if (any(duplicated(cols)))
      names(y) <- make.unique(cols)
    nrow <- .row_names_info(x, 2L)
    if (drop && !mdrop && nrow == 1L)
      return(structure(y, class = NULL, row.names = NULL))
    else return(structure(y, class = oldClass(x),
                          row.names = .row_names_info(x, 0L)))
  }
  xx <- x
  cols <- names(xx)
  x <- vector("list", length(x))
  x <- .Call("R_copyDFattr", xx, x, PACKAGE = "base")
  oldClass(x) <- attr(x, "row.names") <- NULL
  # Do not want to check for duplicates if don't need to
  noDuplicateRowNames <- (is.logical(i) ||
                          (!is.null(attr(x, "dup.row.names"))) ||
                          (is.numeric(i) && min(i, 0, na.rm=TRUE) <
0) ||
                          (!anyMissing(i) && !notSorted(i, strict =
TRUE)))
  if (!missing(j)) {
    x <- x[j]
    cols <- names(x)
    if (drop && length(x) == 1L) {
      if (is.character(i)) {
        rows <- attr(xx, "row.names")
        i <- pmatch(i, rows, duplicates.ok = TRUE)
      }
      xj <- .subset2(.subset(xx, j), 1L)
      return(if (length(dim(xj)) != 2L) xj[i] else xj[i,
                         , drop = FALSE])
    }
    if (any(is.na(cols)))
      stop("undefined columns selected")
    nxx <- structure(seq_along(xx), names = names(xx))
    sxx <- match(nxx[j], seq_along(xx))
  }
  else sxx <- seq_along(x)
  rows <- NULL
  if (is.character(i)) {
    rows <- attr(xx, "row.names")
    i <- pmatch(i, rows, duplicates.ok = TRUE)
  }
  for (j in seq_along(x)) {
    xj <- xx[[sxx[j]]]
    x[[j]] <- if (length(dim(xj)) != 2L)
      xj[i]
    else xj[i, , drop = FALSE]
  }
  if (drop) {
    n <- length(x)
    if (n == 1L)
      return(x[[1L]])
    if (n > 1L) {
      xj <- x[[1L]]
      nrow <- if (length(dim(xj)) == 2L)
        dim(xj)[1L]
      else length(xj)
      drop <- !mdrop && nrow == 1L
    }
    else drop <- FALSE
  }
  if (!drop) {
    if (is.null(rows))
      rows <- attr(xx, "row.names")
    rows <- rows[i]
    if(any(is.na(rows)))
      rows[is.na(rows)] <- "NA"
    if(!noDuplicateRowNames && any(duplicated(rows)))
        rows <- make.unique(as.character(rows))
    if (any(duplicated(nm <- names(x))))
      names(x) <- make.unique(nm)
    if (is.null(rows))
      rows <- attr(xx, "row.names")[i]
    attr(x, "row.names") <- rows
    oldClass(x) <- oldClass(xx)
  }
  x
}

# That requires anyMissing from the splus2R package,
# plus notSorted (or a version of is.unsorted with argument 'strict'
added).

notSorted <- function(x, decreasing = FALSE, strict = FALSE, na.rm = FALSE){
  # return TRUE if x is not sorted
  # If decreasing=FALSE, check for sort in increasing order
  # If strict=TRUE, ties correspond to not being sorted
  n <- length(x)
  if(length(n) < 2)
    return(FALSE)
  if(!is.atomic(x) || (!na.rm && any(is.na(x))))
    return(NA)
  if(na.rm && any(ii <- is.na(x)))
    x <- x[!ii]
  if(decreasing){
    ifelse1(strict,
            any(x[-1] >= x[-n]),
            any(x[-1] >  x[-n]))
  } else { # check for sort in increasing order
    ifelse1(strict,
            any(x[-1] <= x[-n]),
            any(x[-1] <  x[-n]))
  }
}


On Tue, Jul 1, 2008 at 11:20 AM, Tim Hesterberg <timhesterberg@gmail.com>
wrote:
> Below is a version of [.data.frame that is faster
> for subscripting rows of large data frames; it avoids calling
>     duplicated(rows)
> if there is no need to check for duplicate row names, when:
>     i is logical
>     attr(x, "dup.row.names") is not NULL (S+ compatibility)
>     i is numeric and negative
>     i is strictly increasing
>
	[[alternative HTML version deleted]]

Maybe Matching Threads

Search for more apparently analagous threads

R devel - Jul 2008 - [.data.frame speedup

[Rd] [.data.frame speedup

[Rd] [.data.frame speedup

Maybe Matching Threads