thr3ads.net - R devel - [Rd] colnames for data.frame could be greatly improved [Dec 2016]

If this information is useful, please help other people find it:
Share via:

Jan Gorecki

2016-Dec-20 01:27 UTC

[Rd] colnames for data.frame could be greatly improved

Hello,

`colnames` does not seem to be well optimized for data.frame input. It
returns early for a data.frame in

  if (is.data.frame(x) && do.NULL)
    return(names(x))

but only when do.NULL is TRUE. This makes a huge difference when do.NULL
is FALSE. Minimal edit to `colnames`:

    if (is.data.frame(x)) {
        nm <- names(x)
        if (do.NULL || !is.null(nm))
            return(nm)
        else
            return(paste0(prefix, seq_along(x)))
    }

Script and timings:

N=1e7; K=100
set.seed(1)
DF <- data.frame(
    id1 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
    id2 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
    id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char)
    id4 = sample(K, N, TRUE),                          # large groups (int)
    id5 = sample(K, N, TRUE),                          # large groups (int)
    id6 = sample(N/K, N, TRUE),                        # small groups (int)
    v1 =  sample(5, N, TRUE),                          # int in range [1,5]
    v2 =  sample(5, N, TRUE),                          # int in range [1,5]
    v3 =  sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749
)
cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
#GB = 0.397
colnames(DF) = NULL
system.time(nm1<-colnames(DF, FALSE))
#   user  system elapsed
# 22.158   0.299  22.498
print(nm1)
#[1] "col1" "col2" "col3" "col4" "col5" "col6" "col7" "col8" "col9"

### restart R

# Return the column names of `x`.
#
# x        a data frame or a matrix-like object.
# do.NULL  if TRUE, NULL is returned when `x` carries no column names;
#          if FALSE, names are generated as `prefix` followed by the
#          column index.
# prefix   string prepended to the column index for generated names.
colnames <- function (x, do.NULL = TRUE, prefix = "col")
{
    # Data frames are answered from names(x) alone, so dimnames(x) is
    # never called for them, whatever the value of do.NULL.
    if (is.data.frame(x)) {
        col_names <- names(x)
        if (!do.NULL && is.null(col_names))
            col_names <- paste0(prefix, seq_along(x))
        return(col_names)
    }

    # Everything else: use the second dimnames component when present.
    existing <- dimnames(x)[[2L]]
    if (!is.null(existing))
        return(existing)

    # No stored names: either report NULL or generate them.
    n_col <- NCOL(x)
    if (do.NULL)
        return(NULL)
    if (n_col > 0L)
        paste0(prefix, seq_len(n_col))
    else character()
}
N=1e7; K=100
set.seed(1)
DF <- data.frame(
    id1 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
    id2 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
    id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char)
    id4 = sample(K, N, TRUE),                          # large groups (int)
    id5 = sample(K, N, TRUE),                          # large groups (int)
    id6 = sample(N/K, N, TRUE),                        # small groups (int)
    v1 =  sample(5, N, TRUE),                          # int in range [1,5]
    v2 =  sample(5, N, TRUE),                          # int in range [1,5]
    v3 =  sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749
)
cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
#GB = 0.397
colnames(DF) = NULL
system.time(nm1<-colnames(DF, FALSE))
#   user  system elapsed
#  0.001   0.000   0.000
print(nm1)
#[1] "col1" "col2" "col3" "col4" "col5" "col6" "col7" "col8" "col9"

sessionInfo()
#R Under development (unstable) (2016-12-19 r71815)
#Platform: x86_64-pc-linux-gnu (64-bit)
#Running under: Debian GNU/Linux stretch/sid
#
#locale:
# [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
# [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
# [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
# [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
# [9] LC_ADDRESS=C               LC_TELEPHONE=C
#[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#
#attached base packages:
#[1] stats     graphics  grDevices utils     datasets  methods   base  #
#
#loaded via a namespace (and not attached):
#[1] compiler_3.4.0

Jan Gorecki

2016-Dec-27 17:48 UTC

head link

[Rd] colnames for data.frame could be greatly improved

Hi there,
Any update on this?
Should I create bugzilla ticket and submit patch?
Regards
Jan Gorecki

On 20 December 2016 at 01:27, Jan Gorecki <J.Gorecki at wit.edu.pl> wrote:
> Hello,
>
> colnames seems to be not optimized well for data.frame. It escapes
> processing for data.frame in
>
>   if (is.data.frame(x) && do.NULL)
>     return(names(x))
>
> but only when do.NULL true. This makes huge difference when do.NULL
> false. Minimal edit to `colnames`:
>
>     if (is.data.frame(x)) {
>         nm <- names(x)
>         if (do.NULL || !is.null(nm))
>             return(nm)
>         else
>             return(paste0(prefix, seq_along(x)))
>     }
>
> Script and timings:
>
> N=1e7; K=100
> set.seed(1)
> DF <- data.frame(
>     id1 = sample(sprintf("id%03d",1:K), N, TRUE),      # large
groups (char)
>     id2 = sample(sprintf("id%03d",1:K), N, TRUE),      # large
groups (char)
>     id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small
groups (char)
>     id4 = sample(K, N, TRUE),                          # large groups (int)
>     id5 = sample(K, N, TRUE),                          # large groups (int)
>     id6 = sample(N/K, N, TRUE),                        # small groups (int)
>     v1 =  sample(5, N, TRUE),                          # int in range [1,5]
>     v2 =  sample(5, N, TRUE),                          # int in range [1,5]
>     v3 =  sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g.
23.5749
> )
> cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
> #GB = 0.397
> colnames(DF) = NULL
> system.time(nm1<-colnames(DF, FALSE))
> #   user  system elapsed
> # 22.158   0.299  22.498
> print(nm1)
> #[1] "col1" "col2" "col3" "col4"
"col5" "col6" "col7" "col8"
"col9"
>
> ### restart R
>
> colnames <- function (x, do.NULL = TRUE, prefix = "col")
> {
>     if (is.data.frame(x)) {
>         nm <- names(x)
>         if (do.NULL || !is.null(nm))
>             return(nm)
>         else
>             return(paste0(prefix, seq_along(x)))
>     }
>     dn <- dimnames(x)
>     if (!is.null(dn[[2L]]))
>         dn[[2L]]
>     else {
>         nc <- NCOL(x)
>         if (do.NULL)
>             NULL
>         else if (nc > 0L)
>             paste0(prefix, seq_len(nc))
>         else character()
>     }
> }
> N=1e7; K=100
> set.seed(1)
> DF <- data.frame(
>     id1 = sample(sprintf("id%03d",1:K), N, TRUE),      # large
groups (char)
>     id2 = sample(sprintf("id%03d",1:K), N, TRUE),      # large
groups (char)
>     id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small
groups (char)
>     id4 = sample(K, N, TRUE),                          # large groups (int)
>     id5 = sample(K, N, TRUE),                          # large groups (int)
>     id6 = sample(N/K, N, TRUE),                        # small groups (int)
>     v1 =  sample(5, N, TRUE),                          # int in range [1,5]
>     v2 =  sample(5, N, TRUE),                          # int in range [1,5]
>     v3 =  sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g.
23.5749
> )
> cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
> #GB = 0.397
> colnames(DF) = NULL
> system.time(nm1<-colnames(DF, FALSE))
> #   user  system elapsed
> #  0.001   0.000   0.000
> print(nm1)
> #[1] "col1" "col2" "col3" "col4"
"col5" "col6" "col7" "col8"
"col9"
>
> sessionInfo()
> #R Under development (unstable) (2016-12-19 r71815)
> #Platform: x86_64-pc-linux-gnu (64-bit)
> #Running under: Debian GNU/Linux stretch/sid
> #
> #locale:
> # [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
> # [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
> # [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
> # [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
> # [9] LC_ADDRESS=C               LC_TELEPHONE=C
> #[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
> #
> #attached base packages:
> #[1] stats     graphics  grDevices utils     datasets  methods   base  #
> #
> #loaded via a namespace (and not attached):
> #[1] compiler_3.4.0

Martin Maechler

2016-Dec-29 18:13 UTC

head link

[Rd] colnames for data.frame could be greatly improved

> Hi there,
> Any update on this?
> Should I create bugzilla ticket and submit patch?
> Regards
> Jan Gorecki
Hi Jan,

Why should we care that the  do.NULL = FALSE case is slower?
After all do.NULL = TRUE is the default.

In other words, where are use cases where it is problematic that
do.NULL = FALSE is relatively slow?

Shorter code  *is* nicer than longer code,  so I need a bit more
conviction why we should add more code for that special case ..

Martin Maechler, ETH Zurich
> On 20 December 2016 at 01:27, Jan Gorecki <J.Gorecki at wit.edu.pl>
wrote:
> > Hello,
> >
> > colnames seems to be not optimized well for data.frame. It escapes
> > processing for data.frame in
> >
> >   if (is.data.frame(x) && do.NULL)
> >     return(names(x))
> >
> > but only when do.NULL true. This makes huge difference when do.NULL
> > false. Minimal edit to `colnames`:
> >
> >     if (is.data.frame(x)) {
> >         nm <- names(x)
> >         if (do.NULL || !is.null(nm))
> >             return(nm)
> >         else
> >             return(paste0(prefix, seq_along(x)))
> >     }
> >
> > Script and timings:
> >
> > N=1e7; K=100
> > set.seed(1)
> > DF <- data.frame(
> >     id1 = sample(sprintf("id%03d",1:K), N, TRUE),      #
large groups (char)
> >     id2 = sample(sprintf("id%03d",1:K), N, TRUE),      #
large groups (char)
> >     id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), #
small groups (char)
> >     id4 = sample(K, N, TRUE),                          # large groups
(int)
> >     id5 = sample(K, N, TRUE),                          # large groups
(int)
> >     id6 = sample(N/K, N, TRUE),                        # small groups
(int)
> >     v1 =  sample(5, N, TRUE),                          # int in range
[1,5]
> >     v2 =  sample(5, N, TRUE),                          # int in range
[1,5]
> >     v3 =  sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g.
23.5749
> > )
> > cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
> > #GB = 0.397
> > colnames(DF) = NULL
> > system.time(nm1<-colnames(DF, FALSE))
> > #   user  system elapsed
> > # 22.158   0.299  22.498
> > print(nm1)
> > #[1] "col1" "col2" "col3"
"col4" "col5" "col6" "col7"
"col8" "col9"
> >
> > ### restart R
> >
> > colnames <- function (x, do.NULL = TRUE, prefix = "col")
> > {
> >     if (is.data.frame(x)) {
> >         nm <- names(x)
> >         if (do.NULL || !is.null(nm))
> >             return(nm)
> >         else
> >             return(paste0(prefix, seq_along(x)))
> >     }
> >     dn <- dimnames(x)
> >     if (!is.null(dn[[2L]]))
> >         dn[[2L]]
> >     else {
> >         nc <- NCOL(x)
> >         if (do.NULL)
> >             NULL
> >         else if (nc > 0L)
> >             paste0(prefix, seq_len(nc))
> >         else character()
> >     }
> > }
> > N=1e7; K=100
> > set.seed(1)
> > DF <- data.frame(
> >     id1 = sample(sprintf("id%03d",1:K), N, TRUE),      #
large groups (char)
> >     id2 = sample(sprintf("id%03d",1:K), N, TRUE),      #
large groups (char)
> >     id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), #
small groups (char)
> >     id4 = sample(K, N, TRUE),                          # large groups
(int)
> >     id5 = sample(K, N, TRUE),                          # large groups
(int)
> >     id6 = sample(N/K, N, TRUE),                        # small groups
(int)
> >     v1 =  sample(5, N, TRUE),                          # int in range
[1,5]
> >     v2 =  sample(5, N, TRUE),                          # int in range
[1,5]
> >     v3 =  sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g.
23.5749
> > )
> > cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
> > #GB = 0.397
> > colnames(DF) = NULL
> > system.time(nm1<-colnames(DF, FALSE))
> > #   user  system elapsed
> > #  0.001   0.000   0.000
> > print(nm1)
> > #[1] "col1" "col2" "col3"
"col4" "col5" "col6" "col7"
"col8" "col9"
> >
> > sessionInfo()
> > #R Under development (unstable) (2016-12-19 r71815)
> > #Platform: x86_64-pc-linux-gnu (64-bit)
> > #Running under: Debian GNU/Linux stretch/sid
> > #
> > #locale:
> > # [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
> > # [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
> > # [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
> > # [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
> > # [9] LC_ADDRESS=C               LC_TELEPHONE=C
> > #[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
> > #
> > #attached base packages:
> > #[1] stats     graphics  grDevices utils     datasets  methods   base 
#
> > #
> > #loaded via a namespace (and not attached):
> > #[1] compiler_3.4.0
> 
> ______________________________________________
> R-devel at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-devel

Reasonably Related Threads

Search for more seemingly similar threads

R devel - Dec 2016 - colnames for data.frame could be greatly improved

[Rd] colnames for data.frame could be greatly improved

[Rd] colnames for data.frame could be greatly improved

[Rd] colnames for data.frame could be greatly improved

Reasonably Related Threads