There is a bug in the standard version of [.data.frame;
it mixes up handling duplicates and NAs when subscripting rows.
x <- data.frame(x=1:3, y=2:4,
row.names=c("a","b","NA"))
y <- x[c(2:3, NA),]
y
It creates a data frame with duplicate rows, but won't print.
In the previous message I included a version of [.data.frame;
it fails for the same example, for a different reason. Here
is a fix.
"subscript.data.frame" <-
function (x, i, j,
drop = if (missing(i)) TRUE else length(cols) == 1)
{
# This version of [.data.frame avoid wasting time enforcing unique
# row names if possible.
mdrop <- missing(drop)
Narg <- nargs() - (!mdrop)
if (Narg < 3) {
if (!mdrop)
warning("drop argument will be ignored")
if (missing(i))
return(x)
if (is.matrix(i))
return(as.matrix(x)[i])
y <- NextMethod("[")
cols <- names(y)
if (!is.null(cols) && any(is.na(cols)))
stop("undefined columns selected")
if (any(duplicated(cols)))
names(y) <- make.unique(cols)
return(structure(y, class = oldClass(x),
row.names = .row_names_info(x, 0L)))
}
if (missing(i)) {
if (missing(j) && drop && length(x) == 1L)
return(.subset2(x, 1L))
y <- if (missing(j))
x
else .subset(x, j)
if (drop && length(y) == 1L)
return(.subset2(y, 1L))
cols <- names(y)
if (any(is.na(cols)))
stop("undefined columns selected")
if (any(duplicated(cols)))
names(y) <- make.unique(cols)
nrow <- .row_names_info(x, 2L)
if (drop && !mdrop && nrow == 1L)
return(structure(y, class = NULL, row.names = NULL))
else return(structure(y, class = oldClass(x),
row.names = .row_names_info(x, 0L)))
}
xx <- x
cols <- names(xx)
x <- vector("list", length(x))
x <- .Call("R_copyDFattr", xx, x, PACKAGE = "base")
oldClass(x) <- attr(x, "row.names") <- NULL
# Do not want to check for duplicates if don't need to
noDuplicateRowNames <- (is.logical(i) ||
(!is.null(attr(x, "dup.row.names"))) ||
(is.numeric(i) && min(i, 0, na.rm=TRUE) <
0) ||
(!anyMissing(i) && !notSorted(i, strict =
TRUE)))
if (!missing(j)) {
x <- x[j]
cols <- names(x)
if (drop && length(x) == 1L) {
if (is.character(i)) {
rows <- attr(xx, "row.names")
i <- pmatch(i, rows, duplicates.ok = TRUE)
}
xj <- .subset2(.subset(xx, j), 1L)
return(if (length(dim(xj)) != 2L) xj[i] else xj[i,
, drop = FALSE])
}
if (any(is.na(cols)))
stop("undefined columns selected")
nxx <- structure(seq_along(xx), names = names(xx))
sxx <- match(nxx[j], seq_along(xx))
}
else sxx <- seq_along(x)
rows <- NULL
if (is.character(i)) {
rows <- attr(xx, "row.names")
i <- pmatch(i, rows, duplicates.ok = TRUE)
}
for (j in seq_along(x)) {
xj <- xx[[sxx[j]]]
x[[j]] <- if (length(dim(xj)) != 2L)
xj[i]
else xj[i, , drop = FALSE]
}
if (drop) {
n <- length(x)
if (n == 1L)
return(x[[1L]])
if (n > 1L) {
xj <- x[[1L]]
nrow <- if (length(dim(xj)) == 2L)
dim(xj)[1L]
else length(xj)
drop <- !mdrop && nrow == 1L
}
else drop <- FALSE
}
if (!drop) {
if (is.null(rows))
rows <- attr(xx, "row.names")
rows <- rows[i]
if(any(is.na(rows)))
rows[is.na(rows)] <- "NA"
if(!noDuplicateRowNames && any(duplicated(rows)))
rows <- make.unique(as.character(rows))
if (any(duplicated(nm <- names(x))))
names(x) <- make.unique(nm)
if (is.null(rows))
rows <- attr(xx, "row.names")[i]
attr(x, "row.names") <- rows
oldClass(x) <- oldClass(xx)
}
x
}
# That requires anyMissing from the splus2R package,
# plus notSorted (or a version of is.unsorted with argument 'strict'
added).
notSorted <- function(x, decreasing = FALSE, strict = FALSE, na.rm = FALSE){
# return TRUE if x is not sorted
# If decreasing=FALSE, check for sort in increasing order
# If strict=TRUE, ties correspond to not being sorted
n <- length(x)
if(length(n) < 2)
return(FALSE)
if(!is.atomic(x) || (!na.rm && any(is.na(x))))
return(NA)
if(na.rm && any(ii <- is.na(x)))
x <- x[!ii]
if(decreasing){
ifelse1(strict,
any(x[-1] >= x[-n]),
any(x[-1] > x[-n]))
} else { # check for sort in increasing order
ifelse1(strict,
any(x[-1] <= x[-n]),
any(x[-1] < x[-n]))
}
}
On Tue, Jul 1, 2008 at 11:20 AM, Tim Hesterberg <timhesterberg@gmail.com>
wrote:
> Below is a version of [.data.frame that is faster
> for subscripting rows of large data frames; it avoids calling
> duplicated(rows)
> if there is no need to check for duplicate row names, when:
> i is logical
> attr(x, "dup.row.names") is not NULL (S+ compatibility)
> i is numeric and negative
> i is strictly increasing
>
[[alternative HTML version deleted]]