Hello, `colnames` does not seem to be well optimized for data.frame. It skips the expensive processing for a data.frame in if (is.data.frame(x) && do.NULL) return(names(x)) but only when do.NULL is TRUE. This makes a huge difference when do.NULL is FALSE. Minimal edit to `colnames`: if (is.data.frame(x)) { nm <- names(x) if (do.NULL || !is.null(nm)) return(nm) else return(paste0(prefix, seq_along(x))) } Script and timings: N=1e7; K=100 set.seed(1) DF <- data.frame( id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char) id4 = sample(K, N, TRUE), # large groups (int) id5 = sample(K, N, TRUE), # large groups (int) id6 = sample(N/K, N, TRUE), # small groups (int) v1 = sample(5, N, TRUE), # int in range [1,5] v2 = sample(5, N, TRUE), # int in range [1,5] v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749 ) cat("GB =", round(sum(gc()[,2])/1024, 3), "\n") #GB = 0.397 colnames(DF) = NULL system.time(nm1<-colnames(DF, FALSE)) # user system elapsed # 22.158 0.299 22.498 print(nm1) #[1] "col1" "col2" "col3" "col4" "col5" "col6" "col7" "col8" "col9" ### restart R colnames <- function (x, do.NULL = TRUE, prefix = "col") { if (is.data.frame(x)) { nm <- names(x) if (do.NULL || !is.null(nm)) return(nm) else return(paste0(prefix, seq_along(x))) } dn <- dimnames(x) if (!is.null(dn[[2L]])) dn[[2L]] else { nc <- NCOL(x) if (do.NULL) NULL else if (nc > 0L) paste0(prefix, seq_len(nc)) else character() } } N=1e7; K=100 set.seed(1) DF <- data.frame( id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char) id4 = sample(K, N, TRUE), # large groups (int) id5 = sample(K, N, TRUE), # large groups (int) id6 = sample(N/K, N, TRUE), # small groups (int) v1 = sample(5, N, TRUE), # int in range [1,5] v2 
= sample(5, N, TRUE), # int in range [1,5] v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749 ) cat("GB =", round(sum(gc()[,2])/1024, 3), "\n") #GB = 0.397 colnames(DF) = NULL system.time(nm1<-colnames(DF, FALSE)) # user system elapsed # 0.001 0.000 0.000 print(nm1) #[1] "col1" "col2" "col3" "col4" "col5" "col6" "col7" "col8" "col9" sessionInfo() #R Under development (unstable) (2016-12-19 r71815) #Platform: x86_64-pc-linux-gnu (64-bit) #Running under: Debian GNU/Linux stretch/sid # #locale: # [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C # [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 # [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 # [7] LC_PAPER=en_US.UTF-8 LC_NAME=C # [9] LC_ADDRESS=C LC_TELEPHONE=C #[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C # #attached base packages: #[1] stats graphics grDevices utils datasets methods base # # #loaded via a namespace (and not attached): #[1] compiler_3.4.0
Hi there, Any update on this? Should I create a Bugzilla ticket and submit a patch? Regards, Jan Gorecki On 20 December 2016 at 01:27, Jan Gorecki <J.Gorecki at wit.edu.pl> wrote: > Hello, > > colnames seems to be not optimized well for data.frame. It escapes > processing for data.frame in > > if (is.data.frame(x) && do.NULL) > return(names(x)) > > but only when do.NULL true. This makes huge difference when do.NULL > false. Minimal edit to `colnames`: > > if (is.data.frame(x)) { > nm <- names(x) > if (do.NULL || !is.null(nm)) > return(nm) > else > return(paste0(prefix, seq_along(x))) > } > > Script and timings: > > N=1e7; K=100 > set.seed(1) > DF <- data.frame( > id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) > id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) > id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char) > id4 = sample(K, N, TRUE), # large groups (int) > id5 = sample(K, N, TRUE), # large groups (int) > id6 = sample(N/K, N, TRUE), # small groups (int) > v1 = sample(5, N, TRUE), # int in range [1,5] > v2 = sample(5, N, TRUE), # int in range [1,5] > v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 
23.5749 > ) > cat("GB =", round(sum(gc()[,2])/1024, 3), "\n") > #GB = 0.397 > colnames(DF) = NULL > system.time(nm1<-colnames(DF, FALSE)) > # user system elapsed > # 22.158 0.299 22.498 > print(nm1) > #[1] "col1" "col2" "col3" "col4" "col5" "col6" "col7" "col8" "col9" > > ### restart R > > colnames <- function (x, do.NULL = TRUE, prefix = "col") > { > if (is.data.frame(x)) { > nm <- names(x) > if (do.NULL || !is.null(nm)) > return(nm) > else > return(paste0(prefix, seq_along(x))) > } > dn <- dimnames(x) > if (!is.null(dn[[2L]])) > dn[[2L]] > else { > nc <- NCOL(x) > if (do.NULL) > NULL > else if (nc > 0L) > paste0(prefix, seq_len(nc)) > else character() > } > } > N=1e7; K=100 > set.seed(1) > DF <- data.frame( > id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) > id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) > id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char) > id4 = sample(K, N, TRUE), # large groups (int) > id5 = sample(K, N, TRUE), # large groups (int) > id6 = sample(N/K, N, TRUE), # small groups (int) > v1 = sample(5, N, TRUE), # int in range [1,5] > v2 = sample(5, N, TRUE), # int in range [1,5] > v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 
23.5749 > ) > cat("GB =", round(sum(gc()[,2])/1024, 3), "\n") > #GB = 0.397 > colnames(DF) = NULL > system.time(nm1<-colnames(DF, FALSE)) > # user system elapsed > # 0.001 0.000 0.000 > print(nm1) > #[1] "col1" "col2" "col3" "col4" "col5" "col6" "col7" "col8" "col9" > > sessionInfo() > #R Under development (unstable) (2016-12-19 r71815) > #Platform: x86_64-pc-linux-gnu (64-bit) > #Running under: Debian GNU/Linux stretch/sid > # > #locale: > # [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C > # [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 > # [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 > # [7] LC_PAPER=en_US.UTF-8 LC_NAME=C > # [9] LC_ADDRESS=C LC_TELEPHONE=C > #[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C > # > #attached base packages: > #[1] stats graphics grDevices utils datasets methods base # > # > #loaded via a namespace (and not attached): > #[1] compiler_3.4.0
Martin Maechler
2016-Dec-29 18:13 UTC
[Rd] colnames for data.frame could be greatly improved
> Hi there, > Any update on this? > Should I create bugzilla ticket and submit patch? > Regards > Jan Gorecki Hi Jan, Why should we care that the do.NULL = FALSE case is slower? After all, do.NULL = TRUE is the default. In other words, what are the use cases where it is problematic that do.NULL = FALSE is relatively slow? Shorter code *is* nicer than longer code, so I need a bit more convincing as to why we should add more code for that special case... Martin Maechler, ETH Zurich > On 20 December 2016 at 01:27, Jan Gorecki <J.Gorecki at wit.edu.pl> wrote: > > Hello, > > > > colnames seems to be not optimized well for data.frame. It escapes > > processing for data.frame in > > > > if (is.data.frame(x) && do.NULL) > > return(names(x)) > > > > but only when do.NULL true. This makes huge difference when do.NULL > > false. Minimal edit to `colnames`: > > > > if (is.data.frame(x)) { > > nm <- names(x) > > if (do.NULL || !is.null(nm)) > > return(nm) > > else > > return(paste0(prefix, seq_along(x))) > > } > > > > Script and timings: > > > > N=1e7; K=100 > > set.seed(1) > > DF <- data.frame( > > id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) > > id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) > > id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char) > > id4 = sample(K, N, TRUE), # large groups (int) > > id5 = sample(K, N, TRUE), # large groups (int) > > id6 = sample(N/K, N, TRUE), # small groups (int) > > v1 = sample(5, N, TRUE), # int in range [1,5] > > v2 = sample(5, N, TRUE), # int in range [1,5] > > v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 
23.5749 > > ) > > cat("GB =", round(sum(gc()[,2])/1024, 3), "\n") > > #GB = 0.397 > > colnames(DF) = NULL > > system.time(nm1<-colnames(DF, FALSE)) > > # user system elapsed > > # 22.158 0.299 22.498 > > print(nm1) > > #[1] "col1" "col2" "col3" "col4" "col5" "col6" "col7" "col8" "col9" > > > > ### restart R > > > > colnames <- function (x, do.NULL = TRUE, prefix = "col") > > { > > if (is.data.frame(x)) { > > nm <- names(x) > > if (do.NULL || !is.null(nm)) > > return(nm) > > else > > return(paste0(prefix, seq_along(x))) > > } > > dn <- dimnames(x) > > if (!is.null(dn[[2L]])) > > dn[[2L]] > > else { > > nc <- NCOL(x) > > if (do.NULL) > > NULL > > else if (nc > 0L) > > paste0(prefix, seq_len(nc)) > > else character() > > } > > } > > N=1e7; K=100 > > set.seed(1) > > DF <- data.frame( > > id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) > > id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char) > > id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char) > > id4 = sample(K, N, TRUE), # large groups (int) > > id5 = sample(K, N, TRUE), # large groups (int) > > id6 = sample(N/K, N, TRUE), # small groups (int) > > v1 = sample(5, N, TRUE), # int in range [1,5] > > v2 = sample(5, N, TRUE), # int in range [1,5] > > v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 
23.5749 > > ) > > cat("GB =", round(sum(gc()[,2])/1024, 3), "\n") > > #GB = 0.397 > > colnames(DF) = NULL > > system.time(nm1<-colnames(DF, FALSE)) > > # user system elapsed > > # 0.001 0.000 0.000 > > print(nm1) > > #[1] "col1" "col2" "col3" "col4" "col5" "col6" "col7" "col8" "col9" > > > > sessionInfo() > > #R Under development (unstable) (2016-12-19 r71815) > > #Platform: x86_64-pc-linux-gnu (64-bit) > > #Running under: Debian GNU/Linux stretch/sid > > # > > #locale: > > # [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C > > # [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 > > # [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 > > # [7] LC_PAPER=en_US.UTF-8 LC_NAME=C > > # [9] LC_ADDRESS=C LC_TELEPHONE=C > > #[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C > > # > > #attached base packages: > > #[1] stats graphics grDevices utils datasets methods base # > > # > > #loaded via a namespace (and not attached): > > #[1] compiler_3.4.0 > > ______________________________________________ > R-devel at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-devel