"Jens Oehlschlägel"
2007-Nov-02 14:53 UTC
[Rd] applying duplicated, unique and match to lists?
Dear R developers, While improving duplicated.array() and friends and developing equivalents for the new ff package for large datasets I came across two questions: 1) is it safe to use duplicated.default(), unique.default() and match() on arbitrary lists? If so, we can speed up duplicated.array and friends considerably by using list() instead of paste(collapse="\r") 2) while duplicated.default() is very fast even on lists, match() is very slow on lists. Why is the internal conversion to character necessary? If the hashtable behind duplicated() in unique.c work for lists, why can't we use them for match()? If conversion to character is unavoidable, a better scaling alternative could be serializing and compressing to md5: even with final identity check agains unlikely collisions this is much faster in many cases (break even seems to be for quite small list elements like 2 doubles). 1) the new versions should also work for lists with a dim attribute (old versions has as.vector() which does not work for lists) Factor 10 speedup for row duplicates (here atomic matrices)> system.time(duplicated(x, hashFUN=function(x)paste(x, collapse="\r")))user system elapsed 2.37 0.02 2.45> system.time(duplicated(x, hashFUN=md5))user system elapsed 0.51 0.00 0.51> system.time(duplicated(x, hashFUN=list))user system elapsed 0.17 0.00 0.17 2) Speedup potential for list matching (md5 results below)> x <- as.list(runif(100000)) > system.time(duplicated(x))user system elapsed 0.01 0.00 0.02> system.time(match(x,x))user system elapsed 2.01 0.00 2.03 Please find below more comments and tests, new code for duplicated.array() and friends, suggestions for new classes 'hash' (requiring digest) and 'id' (and if you are curious: first code drafts for the respective ff methods). Best regards Jens Oehlschl?gel ---- # Hashing of large objects in ff # (c) 2007 Jens Oehlsch?gel # Licence: GPL2 # Created: 2007-10-30 # Last changed: 2007-10-30 require(digest) # digest maintainer: Dirk Eddelbuettel <edd at debian.org> # { --- available hash functions --- # perfect projection: list # NOTE that the 'easiest hash function' is 'list' # it is faster than everything else when calculating duplicated or unique, but it is extremely slow for 'match' (currently, R-2.6.0) # thus for matching list elements, it is faster converting the list elements with md5 # no projection for vectors only none <- function(x)x # concatenation of as.character as currently (R-2.6.1) in duplicated.array, match.array (pairs of projections may erroneously apear as identical when the vectors are very similar, RAM expensive) pasteid <- function(x)paste(x, collapse="\r") # perfectly identity preserving projection (but even more RAM expensive) id1 <- function(x)paste(.Call("R_serialize", x, NULL, FALSE, NULL, PACKAGE = "base")[-(1:14)], collapse="") # 32 byte projection md5 <- function(x).Call("digest", .Call("R_serialize", x, NULL, FALSE, NULL, PACKAGE = "base"), 1L, -1L, 14L, PACKAGE = "digest") # 40 byte projection sha1 <- function(x).Call("digest", .Call("R_serialize", x, NULL, FALSE, NULL, PACKAGE = "base"), 2L, -1L, 14L, PACKAGE = "digest") # 8 byte projection: more collisions crc32 <- function(x).Call("digest", .Call("R_serialize", x, NULL, FALSE, NULL, PACKAGE = "base"), 3L, -1L, 14L, PACKAGE = "digest") #! \name{md5} #! \alias{md5} #! \title{ faster shortcut functions for in-memory digest } #! \description{ #! These functions project (serialize or hash) their input object and return a string. Because they avoid any R overhead they are better suitable for sapply() than using the more general function 'digest' #! } #! \usage{ #! md5(x) #! sha1(x) #! crc32(x) #! id1(x) #! } #! %- maybe also 'usage' for other objects documented here. #! \arguments{ #! \item{x}{ a fully serializable R object } #! } #! \value{ #! character scalar #! } #! \seealso{ \code{\link{digest}}, \code{\link[base]{serialize}} } #! \examples{ #! md5(pi) #! sha1(pi) #! crc32(pi) #! id1(pi) #! #! dontshow{ #! if (!identical(paste(serialize(list(str="a string", double=pi), connection=NULL)[-(1:14)], collapse=""), id1(list(str="a string", double=pi)))) #! stop("something has changed in serialization, please fix the internal .Calls in function 'id1', 'md5, 'sha1', 'crc32'") #! #! if (!identical(digest(list(str="a string", double=pi), algo="md5"), md5(list(str="a string", double=pi)))) #! stop("something has changed in package 'digest' or in serialization, please fix the internal .Calls in function 'md5'") #! #! if (!identical(digest(list(str="a string", double=pi), algo="sha1"), sha1(list(str="a string", double=pi)))) #! stop("something has changed in package 'digest' or in serialization, please fix the internal .Calls in function 'sha1'") #! #! if (!identical(digest(list(str="a string", double=pi), algo="crc32"), crc32(list(str="a string", double=pi)))) #! stop("something has changed in package 'digest' or in serialization, please fix the internal .Calls in function 'crc32'") #! } #! } #! \keyword{misc} # } --- available hash functions --- if (FALSE){ # current (R-2.6.0) versions of duplicated.* and unique.* can fail for very similar rows x <- matrix(1, 2, 2) x[1,1] <- 1 + 1e-15 x[2,1] <- 1 + 2e-15 x[1,1]==x[2,1] duplicated(x) apply(x, 1, paste, collapse="\r") # is using md5 is safer? apply(x, 1, md5) # atomic data n <- 10000 x <- matrix(runif(n*20),n,20) x <- x[rep(1:nrow(x),rep(2,nrow(x))),] # using list or md5 is faster than pasteid or digest or even serialize via id1 system.time(apply(x, 1, pasteid)) system.time(apply(x, 1, digest)) system.time(apply(x, 1, id1)) system.time(apply(x, 1, md5)) system.time(apply(x, 1, list)) # using md5 takes less RAM for strings object.size(x) object.size(apply(x, 1, pasteid)) object.size(apply(x, 1, md5)) object.size(apply(x, 1, id1)) object.size(apply(x, 1, list)) # atomic matrix performance system.time(duplicated(x, hashFUN=pasteid)) system.time(duplicated(x, hashFUN=id1)) system.time(duplicated(x, hashFUN=md5)) system.time(duplicated(x, hashFUN=list)) # list data n <- 1000 x <- matrix(as.list(runif(n*20)),n,20) x <- x[rep(1:nrow(x),rep(2,nrow(x))),] # list matrix performance system.time(duplicated(x, hashFUN=pasteid)) system.time(duplicated(x, hashFUN=id1)) system.time(duplicated(x, hashFUN=md5)) system.time(duplicated(x, hashFUN=list)) n <- 100000 # match works fine for atomic and list character data x <- as.character(runif(n)) system.time(duplicated(x)) system.time(match(x,x)) y <- as.list(x) system.time(duplicated(y)) system.time(match(y,y)) # but is very slow for numeric (double and integer) lists (although duplicated on numeric lists is fast, doesn't use match the same hashtable?) x <- runif(n) system.time(duplicated(x)) system.time(match(x,x)) y <- as.list(x) system.time(duplicated(y)) system.time(match(y,y)) # try some alternatives system.time({z <- sapply(y, id1); zt <- sapply(y, id1); match(z,zt)}) system.time({z <- sapply(y, md5); zt <- sapply(y, md5); match(z,zt)}) system.time({z <- sapply(y, md5); zt <- sapply(y, md5); pos<-match(z,zt); all(sapply(seq(along=pos), function(i)identical(z[i],zt[pos[i]])))}) n <- 100000 m <- 50 # even worse: lists with vectors of numeric x <- matrix(runif(n), m) y <- lapply(1:ncol(x), function(i)x[,i]) system.time(duplicated(y)) system.time(match(y,y)) # is so slow that md5 converting can speed up match considerably, even with final identity check (break-even is at m=2, for longer vectors md5 is faster) system.time({z <- sapply(y, md5); zt <- sapply(y, md5); match(z,z)}) system.time({z <- sapply(y, md5); zt <- sapply(y, md5); pos<-match(z,zt); all(sapply(seq(along=pos), function(i)identical(z[i],zt[pos[i]])))}) # less impressive but still so for strings x <- matrix(as.character(runif(n)), m) y <- lapply(1:ncol(x), function(i)x[,i]) system.time(duplicated(y)) system.time(match(y,y)) # is so slow that md5 converting speeds up match system.time({z <- lapply(y, md5); match(z,zt)}) system.time({z <- sapply(y, md5); zt <- sapply(y, md5); pos<-match(z,zt); all(sapply(seq(along=pos), function(i)identical(z[i],zt[pos[i]])))}) rm(x) } duplicated.matrix <- duplicated.array <- function (x, incomparables = FALSE, fromLast = FALSE, MARGIN = 1, hashFUN=list, ...) { if (!is.logical(incomparables) || incomparables) .NotYetUsed("incomparables != FALSE") ndim <- length(dim(x)) if (length(MARGIN) > ndim || any(MARGIN > ndim)) stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x)) temp <- apply(x, MARGIN, hashFUN) d <- dim(temp) dn <- dimnames(temp) dim(temp) <- NULL # we had as.vector here, but that fails for lists with dim attributes res <- duplicated(temp, fromLast = fromLast) dim(res) <- d dimnames(res) <- dn res } duplicated.data.frame <- function (x, incomparables = FALSE, fromLast = FALSE, hashFUN=list, ...) { if (!is.logical(incomparables) || incomparables) .NotYetUsed("incomparables != FALSE") ndim <- length(dim(x)) temp <- apply(x, 1, hashFUN) d <- dim(temp) dn <- dimnames(temp) dim(temp) <- NULL # we had as.vector here, but that fails for lists with dim attributes res <- duplicated(temp, fromLast = fromLast) dim(res) <- d dimnames(res) <- dn res } unique.matrix <- unique.array <- function (x, incomparables = FALSE, fromLast = FALSE, MARGIN = 1, hashFUN=list, ...) { if (!is.logical(incomparables) || incomparables) .NotYetUsed("incomparables != FALSE") ndim <- length(dim(x)) if (length(MARGIN) > 1 || any(MARGIN > ndim)) stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x)) temp <- apply(x, MARGIN, hashFUN) args <- rep(alist(a = ), ndim) names(args) <- NULL dim(temp) <- NULL # we had as.vector here, but that fails for lists with dim attributes args[[MARGIN]] <- !duplicated(temp, fromLast = fromLast) do.call("[", c(list(x = x), args, list(drop = FALSE))) } unique.data.frame <- function (x, incomparables = FALSE, fromLast = FALSE, hashFUN=list, ...) { if (!is.logical(incomparables) || incomparables) .NotYetUsed("incomparables != FALSE") x[!duplicated(x, fromLast = fromLast, hashFUN = hashFUN), , drop = FALSE] } # like duplicated but return hash value instead of logical hash <- function(x, ...) UseMethod("hash") hash.default <- function(x, hashFUN=md5, ...) sapply(x, hashFUN) hash.matrix <- hash.array <- function (x, MARGIN = 1, hashFUN=md5, ...) { ndim <- length(dim(x)) if (length(MARGIN) > ndim || any(MARGIN > ndim)) stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x)) apply(x, MARGIN, hashFUN) } hash.data.frame <- function (x, hashFUN=md5, ...) { ndim <- length(dim(x)) apply(x, 1, hashFUN) } # like duplicated but return the position of first occurence instead of logical id <- function(x, ...) UseMethod("id") id.default <- function(x, fromLast = FALSE, hashFUN=NULL, ...){ if (is.null(hashFUN)){ if (is.list(x)) x <- sapply(x, md5) # fix the slow performance of match on lists (R-2.6.0) by converting to md5 d <- (1:length(x))[!duplicated(x, fromLast = fromLast)] d[match(x,x[d])] }else{ x <- lapply(x, hashFUN) d <- (1:length(x))[!duplicated(x, fromLast = fromLast)] d[match(x,x[d])] } } id.matrix <- id.array <- function (x, fromLast = FALSE, MARGIN = 1, hashFUN=md5, ...) { ndim <- length(dim(x)) nmarg <- length(MARGIN) if (nmarg > ndim || any(MARGIN > ndim)) stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x)) h <- apply(x, MARGIN, hashFUN) d <- (1:length(h))[!duplicated(h, fromLast = fromLast)] i <- d[match(h,h[d])] dim(i) <- dim(h) dimnames(i) <- dimnames(h) i } id.data.frame <- function (x, fromLast = FALSE, hashFUN=md5, ...) { h <- apply(x, 1, hashFUN) d <- (1:length(h))[!duplicated(h, fromLast = fromLast)] d[match(h,h[d])] } if (FALSE){ n <- 10000 # test duplicated x <- matrix(runif(n*20),n,20) x <- x[rep(1:nrow(x),rep(2,nrow(x))),] d1 <- duplicated.matrix(x, hashFUN=list) d2 <- duplicated.matrix(x, hashFUN=md5) all.equal(d1,d2) table(d1) rm(d1,d2) system.time(duplicated.matrix(x, hashFUN=list)) system.time(duplicated.matrix(x, hashFUN=md5)) x <- matrix(as.list(runif(n*20)),n,20) x <- x[rep(1:nrow(x),rep(2,nrow(x))),] d1 <- duplicated.matrix(x, hashFUN=list) d2 <- duplicated.matrix(x, hashFUN=md5) all.equal(d1,d2) table(d1) rm(d1,d2) system.time(duplicated.matrix(x, hashFUN=list)) system.time(duplicated.matrix(x, hashFUN=md5)) # test unique x <- matrix(runif(n*20),n,20) x <- x[rep(1:nrow(x),rep(2,nrow(x))),] d1 <- unique.matrix(x, hashFUN=list) d2 <- unique.matrix(x, hashFUN=md5) all.equal(d1,d2) dim(d1) rm(d1,d2) system.time(unique.matrix(x, hashFUN=list)) system.time(unique.matrix(x, hashFUN=md5)) x <- matrix(as.list(runif(n*20)),n,20) x <- x[rep(1:nrow(x),rep(2,nrow(x))),] d1 <- unique.matrix(x, hashFUN=list) d2 <- unique.matrix(x, hashFUN=md5) all.equal(d1,d2) all.equal(md5(d1),md5(d2)) dim(d1) rm(d1,d2) system.time(unique.matrix(x, hashFUN=list)) system.time(unique.matrix(x, hashFUN=md5)) # test id x <- matrix(runif(n*20),n,20) x <- x[rep(1:nrow(x),rep(2,nrow(x))),] d1 <- id.matrix(x, hashFUN=md5) d2 <- id.matrix(x, hashFUN=list) all.equal(d1,d2) length(unique(d1)) rm(d1,d2) system.time(id.matrix(x, hashFUN=md5)) system.time(id.matrix(x, hashFUN=list)) x <- matrix(as.list(runif(n*20)),n,20) x <- x[rep(1:nrow(x),rep(2,nrow(x))),] d1 <- id.matrix(x, hashFUN=md5) d2 <- id.matrix(x, hashFUN=list) all.equal(d1,d2) length(unique(d1)) rm(d1,d2) system.time(id.matrix(x, hashFUN=md5)) system.time(id.matrix(x, hashFUN=list)) } # BTW, the following are methods for the upcoming package/class 'ff' # they limit the chunk size of RAM needed for reading the data from disk # yet they assume that the result fits into RAM (and is returned as such) # they rely on ffapply which helps with chunked indexing hash.ff <- function (x, MARGIN = NULL, hashFUN=md5 , return.ff = FALSE , ... # passed to ffapply ) { if (!is.logical(return.ff) || return.ff) .NotYetUsed("return.ff = TRUE") d <- dim(x) if (is.null(MARGIN)) MARGIN <- if (is.null(d)) integer() else 1L nmarg <- length(MARGIN) if (nmarg){ ndim <- length(d) if (nmarg > ndim || any(MARGIN > ndim)) stop("MARGIN = ", MARGIN, " is invalid for dim = ", d) if (nmarg==ndim){ ret <- apply(x[], MARGIN, hashFUN) }else{ args <- rep(alist(a = ), ndim) names(args) <- NULL ret <- ffapply(x, { args[MARGIN] <- lapply(seq(along=MARGIN), function(i)i1[i]:i2[i]) temp <- do.call("[", c(list(x = x), args, list(drop = FALSE))) apply(temp, MARGIN, hashFUN) }, margin=MARGIN, return="unlist", ...) if (nmarg>1){ dim(ret) <- d[MARGIN] dimnames(ret) <- dimnames(x)[MARGIN] } } }else{ ret <- ffapply(x, sapply(x[i1:i2], hashFUN), return="unlist", ...) } ret } # xx this is yet without a final identity check agains md5 collisions duplicated.ff <- function (x, incomparables = FALSE, fromLast = FALSE, MARGIN = 1, hashFUN=md5 , return.ff = FALSE , ... # passed to ffapply ) { if (!is.logical(incomparables) || incomparables) .NotYetUsed("incomparables != FALSE") if (!is.logical(return.ff) || return.ff) .NotYetUsed("return.ff = TRUE") d <- dim(x) if (is.null(MARGIN)) MARGIN <- if (is.null(d)) integer() else 1L ndim <- length(d) nmarg <- length(MARGIN) if (nmarg > ndim || any(MARGIN > ndim)) stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x)) if (nmarg){ if (nmarg==ndim){ # no hashFUN needed h <- aperm(x[], MARGIN) }else{ h <- hash(x, MARGIN=MARGIN, return.ff=return.ff, hashFUN=hashFUN, ...) } d <- dim(h) dn <- dimnames(h) dim(h) <- NULL dup <- duplicated(h, fromLast = fromLast) dim(dup) <- d dimnames(dup) <- dn }else{ # yet no RAM savings in this case n <- length(x) # 1:length(n) WOULD be expanded dup <- duplicated(x[1:n], fromLast = fromLast) # 1:n is NOT expanded and returns a simple vector (faster than as.vector(x[])) } dup } # xx this is yet without a final identity check agains md5 collisions unique.ff <- function (x, incomparables = FALSE, fromLast = FALSE, MARGIN = NULL, hashFUN=md5 , return.ff = FALSE , ... # passed to ffapply ) { if (!is.logical(incomparables) || incomparables) .NotYetUsed("incomparables != FALSE") if (!is.logical(return.ff) || return.ff) .NotYetUsed("return.ff = TRUE") d <- dim(x) if (is.null(MARGIN)) MARGIN <- if (is.null(d)) integer() else 1L ndim <- length(d) nmarg <- length(MARGIN) if (nmarg > 1 || any(MARGIN > ndim)) stop("MARGIN = ", MARGIN, " is invalid for dim = ", dim(x)) if (nmarg){ if (nmarg==ndim){ # no hashFUN needed h <- aperm(x[], MARGIN) }else{ h <- hash(x, MARGIN=MARGIN, return.ff=return.ff, hashFUN=hashFUN, ...) } d <- dim(h) dn <- dimnames(h) dim(h) <- NULL args <- rep(alist(a = ), ndim) names(args) <- NULL args[[MARGIN]] <- !duplicated(h, fromLast = fromLast) do.call("[", c(list(x = x), args, list(drop = FALSE))) }else{ # yet no RAM savings in this case n <- length(x) unique(x[1:n], fromLast = fromLast) # 1:n is NOT expanded and returns a simple vector (faster than as.vector(x[])) } } # xx this is yet without a final identity check agains md5 collisions id.ff <- function (x, fromLast = FALSE, MARGIN = NULL, hashFUN=NULL , return.ff = FALSE , ... # passed to ffapply ) { if (!is.logical(return.ff) || return.ff) .NotYetUsed("return.ff = TRUE") d <- dim(x) if (is.null(MARGIN)) MARGIN <- if (is.null(d)) integer() else 1L nmarg <- length(MARGIN) if (nmarg){ if (is.null(hashFUN)) hashFUN <- md5 ndim <- length(d) if (nmarg > ndim || any(MARGIN > ndim)) stop("MARGIN = ", MARGIN, " is invalid for dim = ", d) if (nmarg==ndim){ if (is.list(x[1])) h <- apply(x[], MARGIN, hashFUN) else h <- aperm(x[], MARGIN) dim(h) <- NULL }else{ args <- rep(alist(a = ), ndim) names(args) <- NULL h <- ffapply(x, { args[MARGIN] <- lapply(seq(along=MARGIN), function(i)i1[i]:i2[i]) temp <- do.call("[", c(list(x = x), args, list(drop = FALSE))) apply(temp, MARGIN, hashFUN) }, margin=MARGIN, return="unlist", ...) } nd <- (1:length(h))[!duplicated(h, fromLast = fromLast)] i <- nd[match(h,h[nd])] if (nmarg>1){ # yet no RAM savings in this case dim(i) <- d[MARGIN] dimnames(i) <- dimnames(x)[MARGIN] } }else{ i <- id(x, fromLast = fromLast, hashFUN=hashFUN) } i } # xx this is yet without a final identity check agains md5 collisions # row identity for ff and R matrices (less overhead compared to id.ff via ffapply ) ffrowid <- function(x, ...){ id(ffrowapply(x, apply(x[i1:i2,,drop=FALSE], 1, md5), return="unlist", use.names=FALSE, ...)) } if (FALSE){ a <- ff(0, dim=c(100000,10),dimorder=2:1) ffapply(a, a[i1:i2]<-runif(i2-i1+1)) r <- ffrowhash(a) } if (FALSE){ n <- 100000 m <- 10 x <- ff(0, dim=c(n,m)) x[,dimorder=2:1] <- 1:(m*n/2) hash(x, MARGIN=integer()) # hash single cells, no RAM optimization hash(x) # hash rows, RAM savings because rows are read and md5ed i chunks duplicated(x) # id(x) # positions of first occurences ffrowid(x) # faster positions unique(x) # unique rows } --