thr3ads.net - R help es - [R-es] Encontrar la primera columna no NA [Oct 2016]

If this information is useful, please help other people find it:
Share via:

Olivier Nuñez

2016-Oct-27 16:11 UTC

[R-es] Encontrar la primera columna no NA

Por último, utilizando la indexación lineal de matriz que propusó luisfo en su
momento:
> t <- Sys.time()
> M=as.matrix(dat)
> index <- which(!is.na(M)) - 1
> meses<-colnames(M)
> M2<- data.table(columna=index %/% nrow(M) +1L, jugador=index %% nrow(M)
+1L , valor=M[index+1L])
> setkey(M2,jugador,columna)
> M2[,.(First_month=meses[columna[1]],Value_First_month=valor[1]),by=jugador]        jugador First_month Value_First_month
     1:       1         Uno        0.93520715
     2:       2         Uno        0.85930634
     3:       3         dos        0.13521503
     4:       4         Uno        0.86996341
     5:       5         dos        0.65879889
    ---                                      
 99996:   99996         Uno        0.94728423
 99997:   99997         Uno        0.24088571
 99998:   99998         Uno        0.07458581
 99999:   99999         Uno        0.30535050
100000:  100000         Uno        0.54640585> difftime( Sys.time(), t)
Time difference of 0.3299999 secs> ----- Mensaje original -----
De: Javier Villacampa González <javier.villacampa.gonzalez en gmail.com>
Para: Olivier Nuñez <onunez en unex.es>
CC: R ayuda <r-help-es en r-project.org>
Enviado: Thu, 27 Oct 2016 17:17:12 +0200 (CEST)
Asunto: Re: [R-es] Encontrar la primera columna no NA

Hemos mejorado bastante desde el inicio. Pero aun andamos lejos. Igual es
por el merge que hago. Seguire mirando
library(microbenchmark)
N <- 1e1
tabla <-
  microbenchmark(
    # JVG_dplyr ={
    #   dat %>%
    #     apply( MARGIN = 1, FUN     #              function(x){
    #                which( !is.na(x)  ) %>%  min( na.rm = TRUE ) %>%
return()
    #                }
    #     )
    #   dat[ , First_month := First_month]
    #   N_for <- length( unique(First_month ))
    #   for( j in 1:N_for){
    #     x <- dat[  First_month == j,  j,  with = FALSE]
    #     dat[ First_month == j , Value_First_month := x ]
    #   }
    # },
    JVG ={
      dat <-
        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
)) , size = numero ) ,
                    dos    = sample( c(runif(numero) , rep(NA , numero /1e1
)) , size = numero ) ,
                    tres   = sample( c(runif(numero) , rep(NA , numero /2e1
)) , size = numero ) ,
                    cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
)) , size = numero ) ,
                    cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
)) , size = numero ) ,
                    seis   = sample( c(runif(numero) , rep(NA , numero /1e3
)) , size = numero )
        )

      apply(X = dat,  MARGIN = 1, FUN               function(x){
                return(   min(  which( !is.na(x)  ),  na.rm = TRUE ) )
              }
      )
      dat[ , First_month := First_month]
      N_for <- length( unique(First_month ))
      for( j in 1:N_for){
        x <- dat[  First_month == j,  j,  with = FALSE]
        dat[ First_month == j , Value_First_month := x ]
      }
    },
    Olivier ={
    dat <-
      data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
)) , size = numero ) ,
                  dos    = sample( c(runif(numero) , rep(NA , numero /1e1
)) , size = numero ) ,
                  tres   = sample( c(runif(numero) , rep(NA , numero /2e1
)) , size = numero ) ,
                  cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
)) , size = numero ) ,
                  cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
)) , size = numero ) ,
                  seis   = sample( c(runif(numero) , rep(NA , numero /1e3
)) , size = numero )
      )
      dat[,First_month       := apply(X = .SD,MARGIN = 1,FUN = function(x)
colnames(.SD)[min(which(!is.na(x)))])]
      dat[,Value_First_month := apply(X = .SD,MARGIN = 1,FUN = function(x)
x[min(which(!is.na(x)))])]
    },
    Olivier2={
      dat <-
        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
)) , size = numero ) ,
                    dos    = sample( c(runif(numero) , rep(NA , numero /1e1
)) , size = numero ) ,
                    tres   = sample( c(runif(numero) , rep(NA , numero /2e1
)) , size = numero ) ,
                    cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
)) , size = numero ) ,
                    cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
)) , size = numero ) ,
                    seis   = sample( c(runif(numero) , rep(NA , numero /1e3
)) , size = numero )
        )

      dat[,jugador:=1:.N]
      dat2=melt(dat,id.vars="jugador")
      setkey(dat2,jugador)
      dat2[,index:=min(which(!is.na(value))),by=jugador]
      dat3 <- dat2[,list(First_month_Olivier
=variable[index[1]],Value_First_month_Olivier =value[index[1]]),by=jugador]
      setkey(x = dat, jugador)
      dat0 <- merge( x = dat, y = dat3, all.x = TRUE, all.y = FALSE)

    },
    times = N, unit = "s")

tabla %>%  print
beepr::beep(3)

# Unit: seconds
#       expr       min        lq      mean    median        uq      max
neval
# JVG       0.6479002 0.7518933 0.8673007 0.8216553 0.9137195 1.251891    10
# Olivier  2.9568197 3.6663586 3.9364770 3.9069826 4.5619519 4.685842    10
# Olivier2 1.1316970 1.4463621 1.4735507 1.4874366 1.5681243 1.631713    10
# E comparativa -----------------------------------------------------------

El 27 de octubre de 2016, 15:39, Olivier Nuñez <onunez en unex.es>
escribió:
> Otra solución algo más rapida:
> > t <- Sys.time()
> > dat[,jugador:=1:.N]
> > dat2=melt(dat,id.vars="jugador")
> > setkey(dat2,jugador)
> > dat2[,index:=min(which(!is.na(value))),by=jugador]
> > dat2[,.(First_month=variable[index[1]],Value_First_month>
value[index[1]]),by=jugador]
>         jugador First_month Value_First_month
>      1:       1         Uno        0.93520715
>      2:       2         Uno        0.85930634
>      3:       3         dos        0.13521503
>      4:       4         Uno        0.86996341
>      5:       5         dos        0.65879889
>     ---
>  99996:   99996         Uno        0.94728423
>  99997:   99997         Uno        0.24088571
>  99998:   99998         Uno        0.07458581
>  99999:   99999         Uno        0.30535050
> 100000:  100000         Uno        0.54640585
> > difftime( Sys.time(), t)
> Time difference of 1.060787 secs
>
>
> ----- Mensaje original -----
> De: "Olivier Nuñez" <onunez en unex.es>
> Para: "Javier Villacampa González" <javier.villacampa.gonzalez
en gmail.com>
> CC: "R ayuda" <r-help-es en r-project.org>
> Enviados: Jueves, 27 de Octubre 2016 15:10:07
> Asunto: Re: [R-es] Encontrar la primera columna no NA
>
> Prueba lo siguiente, no es óptimo, pero creo va bastnate más rapido que
> los que mencionaste:
>
> t <- Sys.time()
> dat[,First_month := apply(.SD,1,function(x) colnames(.SD)[min(which(!is.na
> (x)))])]
> dat[,Value_First_month := apply(.SD,1,function(x) x[min(which(!is.na
> (x)))])]
> difftime( Sys.time(), t)
>
> Time difference of 3.478778 secs
>
>
> ----- Mensaje original -----
> De: "Javier Villacampa González" <javier.villacampa.gonzalez
en gmail.com>
> Para: "R ayuda" <r-help-es en r-project.org>
> Enviados: Jueves, 27 de Octubre 2016 13:43:19
> Asunto: [R-es] Encontrar la primera columna no NA
>
> Imaginemos que tenemos una matriz con datos temporales por sujetos.
> Pongamos que numero de veces que ha jugado una carta en un juego online. Y
> que quiero saber cuantas veces jugo la carta el primer mes que estuvo en el
> juego.
>
> Pero claro mi matriz guarda los datos temporalmente de tal manera que:
>
> # data.table( Enero = c( 1, 4, NA , NA , NA) , Febrero = c( 2, 6, 1, NA, NA
> ) , Marzo = c( 8,6,7,3, NA) ,  Abril = c( NA, 15, 5, 6,6 ))
> #    Enero Febrero Marzo Abril
> # 1:     1       2     8    NA
> # 2:     4       6     6    15
> # 3:    NA       1     7     5
> # 4:    NA      NA     3     6
> # 5:    NA      NA    NA     6
> # Suponiendo que cada fila es un jugador
> # En este caso la solucion debería ser
> # 1 para el primero que empezó en Enero
> # 4 para el segundo jugador  que empezó en Enero
> # 1 para el tercero  que empezó en Febrero
> # 3 Para el cuarto  que empezó en Marzo
> # 6 para el quinto  que empezó en Abril
>
>
> A alguno se os ocurre una solucion más eficiente que la siguiente. Esto
> seguro que con data table o dplyr se puede. Ya he quitados los pipes que
> facilitan la lectura pero que no se llevan bien con data.table. Pero estoy
> seguro que se puede mejorar más.
>
> #======================================================> # Como ejemplo
de codigo
> #======================================================> # S Primera
solucion ------------------------------
> ------------------------
> # First not NA colum per subject
> library(data.table)
> library(dplyr)
> set.seed(123456)
> numero <- 1e5
> dat <-
>   data.table( Uno  = sample( c(runif(numero) , rep(NA , numero /2e0  )),
> size = numero ) ,
>             dos    = sample( c(runif(numero) , rep(NA , numero /1e1  )),
> size = numero ) ,
>             tres   = sample( c(runif(numero) , rep(NA , numero /2e1 )) ,
> size = numero ) ,
>             cuatro = sample( c(runif(numero) , rep(NA , numero /1e2 )) ,
> size = numero ) ,
>             cinco  = sample( c(runif(numero) , rep(NA , numero /2e2 )) ,
> size = numero ) ,
>             seis   = sample( c(runif(numero) , rep(NA , numero /1e3 )) ,
> size = numero )
>             )
>
>
> t <- Sys.time()
> First_month <-
>   dat %>%
>   apply( MARGIN = 1, FUN >            function(x){
>              which( !is.na(x)  ) %>%  min( na.rm = TRUE ) %>% 
return()
>            }
>   )
>
>
>
> First_month %>%  table %>%  prop.table
> dat[ , First_month := First_month]
> N_for <- length( unique(First_month ))
> for( j in 1:N_for){
>   x <- dat[  First_month == j,  j,  with = FALSE]
>   dat[ First_month == j , Value_First_month := x ]
> }
>
> dat %>%  print
> # dat %>%  summary
>
> cat( "===============================\n", difftime( Sys.time(),
t, units > "min") , " minutos que cuesta
\n===============================\n" )
> beepr::beep(3)
> # E Primera solucion ------------------------------
> ------------------------
>
>
>
>
> # S comparativa ------------------------------
> -----------------------------
> library(microbenchmark)
> N <- 1e2
> tabla <-
>   microbenchmark(
>     JVG_dplyr ={  dat %>%
>         apply( MARGIN = 1, FUN >                  function(x){
>                    which( !is.na(x)  ) %>%  min( na.rm = TRUE ) %>%
> return()
>                    }
>         )
>     },
>     JVG ={
>         apply(X = dat,  MARGIN = 1, FUN >                  function(x){
>                    return(   min(  which( !is.na(x)  ),  na.rm = TRUE ) )
>                  }
>         )
>     },
>     times = N, unit = "s")
>
> tabla %>%  print
> beepr::beep(3)
>
> # Unit: seconds
> #       expr        min        lq       mean     median        uq       max
> neval
> # JVG_dplyr 21.2321152 22.233428 22.9575357 22.5701781 23.444432
> 26.642730    10
> # JVG        0.7628928  0.843067  0.9260389  0.8495834  1.027036
> 1.295868    10
> # E comparativa ------------------------------
> -----------------------------
>
> --
>
>         [[alternative HTML version deleted]]
>
> _______________________________________________
> R-help-es mailing list
> R-help-es en r-project.org
> https://stat.ethz.ch/mailman/listinfo/r-help-es
>
> _______________________________________________
> R-help-es mailing list
> R-help-es en r-project.org
> https://stat.ethz.ch/mailman/listinfo/r-help-es
>


--

Carlos J. Gil Bellosta

2016-Oct-27 16:42 UTC

head link

[R-es] Encontrar la primera columna no NA

Las operaciones con columnas de data.frames (y sus variantes modernas) son
muy caras. Así que:

t <- Sys.time()

tmp <- (as.matrix(dat))
cols <- col(tmp)
cols[is.na(tmp)] <- Inf
my.cols <- apply(cols, 1, min)
my.values <- tmp[cbind(1:nrow(tmp), my.cols)]

difftime(Sys.time(), t)

Y una pregunta: ¿alguien programa en R base todavía?

Un saludo,

Carlos J. Gil Bellosta
http://www.datanalytics.com

El 27 de octubre de 2016, 18:11, Olivier Nuñez <onunez en unex.es>
escribió:
>
> Por último, utilizando la indexación lineal de matriz que propusó luisfo
> en su momento:
>
> > t <- Sys.time()
> > M=as.matrix(dat)
> > index <- which(!is.na(M)) - 1
> > meses<-colnames(M)
> > M2<- data.table(columna=index %/% nrow(M) +1L, jugador=index %%
nrow(M)
> +1L , valor=M[index+1L])
> > setkey(M2,jugador,columna)
> > M2[,.(First_month=meses[columna[1]],Value_First_month>
valor[1]),by=jugador]
>         jugador First_month Value_First_month
>      1:       1         Uno        0.93520715
>      2:       2         Uno        0.85930634
>      3:       3         dos        0.13521503
>      4:       4         Uno        0.86996341
>      5:       5         dos        0.65879889
>     ---
>  99996:   99996         Uno        0.94728423
>  99997:   99997         Uno        0.24088571
>  99998:   99998         Uno        0.07458581
>  99999:   99999         Uno        0.30535050
> 100000:  100000         Uno        0.54640585
> > difftime( Sys.time(), t)
> Time difference of 0.3299999 secs
> >
> ----- Mensaje original -----
> De: Javier Villacampa González <javier.villacampa.gonzalez en
gmail.com>
> Para: Olivier Nuñez <onunez en unex.es>
> CC: R ayuda <r-help-es en r-project.org>
> Enviado: Thu, 27 Oct 2016 17:17:12 +0200 (CEST)
> Asunto: Re: [R-es] Encontrar la primera columna no NA
>
> Hemos mejorado bastante desde el inicio. Pero aun andamos lejos. Igual es
> por el merge que hago. Seguire mirando
> library(microbenchmark)
> N <- 1e1
> tabla <-
>   microbenchmark(
>     # JVG_dplyr ={
>     #   dat %>%
>     #     apply( MARGIN = 1, FUN >     #              function(x){
>     #                which( !is.na(x)  ) %>%  min( na.rm = TRUE ) %>%
> return()
>     #                }
>     #     )
>     #   dat[ , First_month := First_month]
>     #   N_for <- length( unique(First_month ))
>     #   for( j in 1:N_for){
>     #     x <- dat[  First_month == j,  j,  with = FALSE]
>     #     dat[ First_month == j , Value_First_month := x ]
>     #   }
>     # },
>     JVG ={
>       dat <-
>         data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
> )) , size = numero ) ,
>                     dos    = sample( c(runif(numero) , rep(NA , numero /1e1
> )) , size = numero ) ,
>                     tres   = sample( c(runif(numero) , rep(NA , numero /2e1
> )) , size = numero ) ,
>                     cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
> )) , size = numero ) ,
>                     cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
> )) , size = numero ) ,
>                     seis   = sample( c(runif(numero) , rep(NA , numero /1e3
> )) , size = numero )
>         )
>
>       apply(X = dat,  MARGIN = 1, FUN >               function(x){
>                 return(   min(  which( !is.na(x)  ),  na.rm = TRUE ) )
>               }
>       )
>       dat[ , First_month := First_month]
>       N_for <- length( unique(First_month ))
>       for( j in 1:N_for){
>         x <- dat[  First_month == j,  j,  with = FALSE]
>         dat[ First_month == j , Value_First_month := x ]
>       }
>     },
>     Olivier ={
>     dat <-
>       data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
> )) , size = numero ) ,
>                   dos    = sample( c(runif(numero) , rep(NA , numero /1e1
> )) , size = numero ) ,
>                   tres   = sample( c(runif(numero) , rep(NA , numero /2e1
> )) , size = numero ) ,
>                   cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
> )) , size = numero ) ,
>                   cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
> )) , size = numero ) ,
>                   seis   = sample( c(runif(numero) , rep(NA , numero /1e3
> )) , size = numero )
>       )
>       dat[,First_month       := apply(X = .SD,MARGIN = 1,FUN = function(x)
> colnames(.SD)[min(which(!is.na(x)))])]
>       dat[,Value_First_month := apply(X = .SD,MARGIN = 1,FUN = function(x)
> x[min(which(!is.na(x)))])]
>     },
>     Olivier2={
>       dat <-
>         data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
> )) , size = numero ) ,
>                     dos    = sample( c(runif(numero) , rep(NA , numero /1e1
> )) , size = numero ) ,
>                     tres   = sample( c(runif(numero) , rep(NA , numero /2e1
> )) , size = numero ) ,
>                     cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
> )) , size = numero ) ,
>                     cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
> )) , size = numero ) ,
>                     seis   = sample( c(runif(numero) , rep(NA , numero /1e3
> )) , size = numero )
>         )
>
>       dat[,jugador:=1:.N]
>       dat2=melt(dat,id.vars="jugador")
>       setkey(dat2,jugador)
>       dat2[,index:=min(which(!is.na(value))),by=jugador]
>       dat3 <- dat2[,list(First_month_Olivier
> =variable[index[1]],Value_First_month_Olivier
> =value[index[1]]),by=jugador]
>       setkey(x = dat, jugador)
>       dat0 <- merge( x = dat, y = dat3, all.x = TRUE, all.y = FALSE)
>
>     },
>     times = N, unit = "s")
>
> tabla %>%  print
> beepr::beep(3)
>
> # Unit: seconds
> #       expr       min        lq      mean    median        uq      max
> neval
> # JVG       0.6479002 0.7518933 0.8673007 0.8216553 0.9137195 1.251891
> 10
> # Olivier  2.9568197 3.6663586 3.9364770 3.9069826 4.5619519 4.685842    10
> # Olivier2 1.1316970 1.4463621 1.4735507 1.4874366 1.5681243 1.631713    10
> # E comparativa ------------------------------
> -----------------------------
>
> El 27 de octubre de 2016, 15:39, Olivier Nuñez <onunez en unex.es>
escribió:
>
> > Otra solución algo más rapida:
> > > t <- Sys.time()
> > > dat[,jugador:=1:.N]
> > > dat2=melt(dat,id.vars="jugador")
> > > setkey(dat2,jugador)
> > > dat2[,index:=min(which(!is.na(value))),by=jugador]
> > > dat2[,.(First_month=variable[index[1]],Value_First_month> >
value[index[1]]),by=jugador]
> >         jugador First_month Value_First_month
> >      1:       1         Uno        0.93520715
> >      2:       2         Uno        0.85930634
> >      3:       3         dos        0.13521503
> >      4:       4         Uno        0.86996341
> >      5:       5         dos        0.65879889
> >     ---
> >  99996:   99996         Uno        0.94728423
> >  99997:   99997         Uno        0.24088571
> >  99998:   99998         Uno        0.07458581
> >  99999:   99999         Uno        0.30535050
> > 100000:  100000         Uno        0.54640585
> > > difftime( Sys.time(), t)
> > Time difference of 1.060787 secs
> >
> >
> > ----- Mensaje original -----
> > De: "Olivier Nuñez" <onunez en unex.es>
> > Para: "Javier Villacampa González"
<javier.villacampa.gonzalez en gmail.com
> >
> > CC: "R ayuda" <r-help-es en r-project.org>
> > Enviados: Jueves, 27 de Octubre 2016 15:10:07
> > Asunto: Re: [R-es] Encontrar la primera columna no NA
> >
> > Prueba lo siguiente, no es óptimo, pero creo va bastnate más rapido
que
> > los que mencionaste:
> >
> > t <- Sys.time()
> > dat[,First_month := apply(.SD,1,function(x) colnames(.SD)[min(which(!
> is.na
> > (x)))])]
> > dat[,Value_First_month := apply(.SD,1,function(x) x[min(which(!is.na
> > (x)))])]
> > difftime( Sys.time(), t)
> >
> > Time difference of 3.478778 secs
> >
> >
> > ----- Mensaje original -----
> > De: "Javier Villacampa González"
<javier.villacampa.gonzalez en gmail.com>
> > Para: "R ayuda" <r-help-es en r-project.org>
> > Enviados: Jueves, 27 de Octubre 2016 13:43:19
> > Asunto: [R-es] Encontrar la primera columna no NA
> >
> > Imaginemos que tenemos una matriz con datos temporales por sujetos.
> > Pongamos que numero de veces que ha jugado una carta en un juego
online.
> Y
> > que quiero saber cuantas veces jugo la carta el primer mes que estuvo
en
> el
> > juego.
> >
> > Pero claro mi matriz guarda los datos temporalmente de tal manera que:
> >
> > # data.table( Enero = c( 1, 4, NA , NA , NA) , Febrero = c( 2, 6, 1,
NA,
> NA
> > ) , Marzo = c( 8,6,7,3, NA) ,  Abril = c( NA, 15, 5, 6,6 ))
> > #    Enero Febrero Marzo Abril
> > # 1:     1       2     8    NA
> > # 2:     4       6     6    15
> > # 3:    NA       1     7     5
> > # 4:    NA      NA     3     6
> > # 5:    NA      NA    NA     6
> > # Suponiendo que cada fila es un jugador
> > # En este caso la solucion debería ser
> > # 1 para el primero que empezó en Enero
> > # 4 para el segundo jugador  que empezó en Enero
> > # 1 para el tercero  que empezó en Febrero
> > # 3 Para el cuarto  que empezó en Marzo
> > # 6 para el quinto  que empezó en Abril
> >
> >
> > A alguno se os ocurre una solucion más eficiente que la siguiente.
Esto
> > seguro que con data table o dplyr se puede. Ya he quitados los pipes
que
> > facilitan la lectura pero que no se llevan bien con data.table. Pero
> estoy
> > seguro que se puede mejorar más.
> >
> > #======================================================> > #
Como ejemplo de codigo
> > #======================================================> > # S
Primera solucion ------------------------------
> > ------------------------
> > # First not NA colum per subject
> > library(data.table)
> > library(dplyr)
> > set.seed(123456)
> > numero <- 1e5
> > dat <-
> >   data.table( Uno  = sample( c(runif(numero) , rep(NA , numero /2e0 
)),
> > size = numero ) ,
> >             dos    = sample( c(runif(numero) , rep(NA , numero /1e1 
)),
> > size = numero ) ,
> >             tres   = sample( c(runif(numero) , rep(NA , numero /2e1 ))
,
> > size = numero ) ,
> >             cuatro = sample( c(runif(numero) , rep(NA , numero /1e2 ))
,
> > size = numero ) ,
> >             cinco  = sample( c(runif(numero) , rep(NA , numero /2e2 ))
,
> > size = numero ) ,
> >             seis   = sample( c(runif(numero) , rep(NA , numero /1e3 ))
,
> > size = numero )
> >             )
> >
> >
> > t <- Sys.time()
> > First_month <-
> >   dat %>%
> >   apply( MARGIN = 1, FUN > >            function(x){
> >              which( !is.na(x)  ) %>%  min( na.rm = TRUE ) %>% 
return()
> >            }
> >   )
> >
> >
> >
> > First_month %>%  table %>%  prop.table
> > dat[ , First_month := First_month]
> > N_for <- length( unique(First_month ))
> > for( j in 1:N_for){
> >   x <- dat[  First_month == j,  j,  with = FALSE]
> >   dat[ First_month == j , Value_First_month := x ]
> > }
> >
> > dat %>%  print
> > # dat %>%  summary
> >
> > cat( "===============================\n", difftime(
Sys.time(), t,
> units > > "min") , " minutos que cuesta
\n===============================\n" )
> > beepr::beep(3)
> > # E Primera solucion ------------------------------
> > ------------------------
> >
> >
> >
> >
> > # S comparativa ------------------------------
> > -----------------------------
> > library(microbenchmark)
> > N <- 1e2
> > tabla <-
> >   microbenchmark(
> >     JVG_dplyr ={  dat %>%
> >         apply( MARGIN = 1, FUN > >                  function(x){
> >                    which( !is.na(x)  ) %>%  min( na.rm = TRUE )
%>%
> > return()
> >                    }
> >         )
> >     },
> >     JVG ={
> >         apply(X = dat,  MARGIN = 1, FUN > >                 
function(x){
> >                    return(   min(  which( !is.na(x)  ),  na.rm = TRUE
)
> )
> >                  }
> >         )
> >     },
> >     times = N, unit = "s")
> >
> > tabla %>%  print
> > beepr::beep(3)
> >
> > # Unit: seconds
> > #       expr        min        lq       mean     median        uq
>  max
> > neval
> > # JVG_dplyr 21.2321152 22.233428 22.9575357 22.5701781 23.444432
> > 26.642730    10
> > # JVG        0.7628928  0.843067  0.9260389  0.8495834  1.027036
> > 1.295868    10
> > # E comparativa ------------------------------
> > -----------------------------
> >
> > --
> >
> >         [[alternative HTML version deleted]]
> >
> > _______________________________________________
> > R-help-es mailing list
> > R-help-es en r-project.org
> > https://stat.ethz.ch/mailman/listinfo/r-help-es
> >
> > _______________________________________________
> > R-help-es mailing list
> > R-help-es en r-project.org
> > https://stat.ethz.ch/mailman/listinfo/r-help-es
> >
>
>
>
> --
>
> _______________________________________________
> R-help-es mailing list
> R-help-es en r-project.org
> https://stat.ethz.ch/mailman/listinfo/r-help-es
>
	[[alternative HTML version deleted]]

Javier Villacampa González

2016-Oct-27 17:16 UTC

head link

[R-es] Encontrar la primera columna no NA

Tengo que comprobar si todos hacen lo mismo del todo. Pero los resultados
no dejan de sorprenderme. Carlos, ya no programa nadie en base y empiezo a
sospechar que igual nos equivocamos.


==================================================================================================#
Unit: seconds
          expr        min         lq      mean    median
uq          max        neval   Lift
#   JVG         0,6716004   0,7210757   1,0513104   0,9597415   1,1624642
2,0997470       10  3,17
#   Olivier     3,0642166   3,424266    3,7383201   3,745616    3,9909474
4,4795947       10  12,3
#   Olivier2    1,2263557   1,340338    1,5031451   1,5140908   1,6264349
1,7548450       10  5,00
#   Adolfo      0,3401764   0,3425798   0,446328    0,3992639   0,5313764
0,7357900       10  1,32
#   Olivier3    0,3684704   0,3875006   0,5157852   0,4959741   0,6414696
0,6954977       10  1,64
#   GilBellosta 0,2089104   0,265419    0,3599796   0,3023052   0,4038109
0,7859248       10  1
==================================================================================================





#
==================================================================================================#
Codigo
#
==================================================================================================library(microbenchmark)
N <- 1e1
tabla <-
  microbenchmark(
    # JVG_dplyr ={
    #   dat %>%
    #     apply( MARGIN = 1, FUN     #              function(x){
    #                which( !is.na(x)  ) %>%  min( na.rm = TRUE ) %>%
return()
    #                }
    #     )
    #   dat[ , First_month := First_month]
    #   N_for <- length( unique(First_month ))
    #   for( j in 1:N_for){
    #     x <- dat[  First_month == j,  j,  with = FALSE]
    #     dat[ First_month == j , Value_First_month := x ]
    #   }
    # },
    JVG ={
      dat <-
        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
)) , size = numero ) ,
                    dos    = sample( c(runif(numero) , rep(NA , numero /1e1
)) , size = numero ) ,
                    tres   = sample( c(runif(numero) , rep(NA , numero /2e1
)) , size = numero ) ,
                    cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
)) , size = numero ) ,
                    cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
)) , size = numero ) ,
                    seis   = sample( c(runif(numero) , rep(NA , numero /1e3
)) , size = numero )
        )

      apply(X = dat,  MARGIN = 1, FUN               function(x){
                return(   min(  which( !is.na(x)  ),  na.rm = TRUE ) )
              }
      )
      dat[ , First_month := First_month]
      N_for <- length( unique(First_month ))
      for( j in 1:N_for){
        x <- dat[  First_month == j,  j,  with = FALSE]
        dat[ First_month == j , Value_First_month := x ]
      }
    },
    Olivier ={
    dat <-
      data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
)) , size = numero ) ,
                  dos    = sample( c(runif(numero) , rep(NA , numero /1e1
)) , size = numero ) ,
                  tres   = sample( c(runif(numero) , rep(NA , numero /2e1
)) , size = numero ) ,
                  cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
)) , size = numero ) ,
                  cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
)) , size = numero ) ,
                  seis   = sample( c(runif(numero) , rep(NA , numero /1e3
)) , size = numero )
      )
      dat[,First_month       := apply(X = .SD,MARGIN = 1,FUN = function(x)
colnames(.SD)[min(which(!is.na(x)))])]
      dat[,Value_First_month := apply(X = .SD,MARGIN = 1,FUN = function(x)
x[min(which(!is.na(x)))])]
    },
    Olivier2={
      dat <-
        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
)) , size = numero ) ,
                    dos    = sample( c(runif(numero) , rep(NA , numero /1e1
)) , size = numero ) ,
                    tres   = sample( c(runif(numero) , rep(NA , numero /2e1
)) , size = numero ) ,
                    cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
)) , size = numero ) ,
                    cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
)) , size = numero ) ,
                    seis   = sample( c(runif(numero) , rep(NA , numero /1e3
)) , size = numero )
        )

      dat[,jugador:=1:.N]
      dat2=melt(dat,id.vars="jugador")
      setkey(dat2,jugador)
      dat2[,index:=min(which(!is.na(value))),by=jugador]
      dat3 <- dat2[,list(First_month_Olivier
=variable[index[1]],Value_First_month_Olivier =value[index[1]]),by=jugador]
      setkey(x = dat, jugador)
      dat0 <- merge( x = dat, y = dat3, all.x = TRUE, all.y = FALSE)

    },

    Adolfo = {

      dat <-
        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
)) , size = numero ) ,
                    dos    = sample( c(runif(numero) , rep(NA , numero /1e1
)) , size = numero ) ,
                    tres   = sample( c(runif(numero) , rep(NA , numero /2e1
)) , size = numero ) ,
                    cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
)) , size = numero ) ,
                    cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
)) , size = numero ) ,
                    seis   = sample( c(runif(numero) , rep(NA , numero /1e3
)) , size = numero )
        )
      # 1) Creamos una columna con la informacion de los jugadores,
      # Como es un jugador por fila, hacemos 1:nrow.
      step1 <- dat %>%
        mutate(player = 1:nrow(dat))

      #2) Convertimos las columnas de tiempo (uno, dos, tres, ...) en dos
columnas, mes y numero de juegos. (Ojo, asumimos que en los datos las
columnas estan ordenadas como en el ejemplo, es decir uno, dos, tres y no
tres, uno, dos)

      step2 <- gather(step1, month, games, -player)

      #y 3) Filtramos los meses con NA y por cada jugador nos quedamos con
el primer dato:
      step3 <- step2 %>%
        filter(!is.na(games)) %>%
        group_by(player) %>%
        slice(1)



      dat %>%  print

    },

    Olivier3 = {
      dat <-
        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
)) , size = numero ) ,
                    dos    = sample( c(runif(numero) , rep(NA , numero /1e1
)) , size = numero ) ,
                    tres   = sample( c(runif(numero) , rep(NA , numero /2e1
)) , size = numero ) ,
                    cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
)) , size = numero ) ,
                    cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
)) , size = numero ) ,
                    seis   = sample( c(runif(numero) , rep(NA , numero /1e3
)) , size = numero )
        )
      M=as.matrix(dat)
      index <- which(!is.na(M)) - 1
      meses<-colnames(M)
      M2<- data.table(columna=index %/% nrow(M) +1L, jugador=index %%
nrow(M) +1L , valor=M[index+1L])
      setkey(M2,jugador,columna)

M2[,.(First_month=meses[columna[1]],Value_First_month=valor[1]),by=jugador]
    },
    GilBellosta = {

      dat <-
        data.frame( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
)) , size = numero ) ,
                    dos    = sample( c(runif(numero) , rep(NA , numero /1e1
)) , size = numero ) ,
                    tres   = sample( c(runif(numero) , rep(NA , numero /2e1
)) , size = numero ) ,
                    cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
)) , size = numero ) ,
                    cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
)) , size = numero ) ,
                    seis   = sample( c(runif(numero) , rep(NA , numero /1e3
)) , size = numero )
        )
      tmp <- (as.matrix(dat))
      cols <- col(tmp)
      cols[is.na(tmp)] <- Inf
      my.cols <- apply(cols, 1, min)
      my.values <- tmp[cbind(1:nrow(tmp), my.cols)]

      difftime(Sys.time(), t)

    },

    times = N, unit = "s")

tabla %>%  print
beepr::beep(3)

# Unit: seconds
#        expr       min        lq      mean    median        uq       max
neval
# JVG         0.6716004 0.7210757 1.0513104 0.9597415 1.1624642
2.0997470    10
# Olivier     3.0642166 3.4242660 3.7383201 3.7456160 3.9909474
4.4795947    10
# Olivier2    1.2263557 1.3403380 1.5031451 1.5140908 1.6264349
1.7548450    10
# Adolfo      0.3401764 0.3425798 0.4463280 0.3992639 0.5313764
0.7357900    10
# Olivier3    0.3684704 0.3875006 0.5157852 0.4959741 0.6414696
0.6954977    10
# GilBellosta 0.2089104 0.2654190 0.3599796 0.3023052 0.4038109
0.7859248    10

	[[alternative HTML version deleted]]

Jose Luis Cañadas Reche

2016-Oct-27 18:54 UTC

head link

[R-es] Encontrar la primera columna no NA

jajaja.

Hoy estaba dando un curso y mis alumnos sabían más dplyr y cosas de esas 
que yo.
Siempre que tengo que usar dplyr tengo que mirar la vignette ;).


Saludos



El 27/10/16 a las 18:42, Carlos J. Gil Bellosta
escribió:> Las operaciones con columnas de data.frames (y sus variantes modernas) son
> muy caras. Así que:
>
> t <- Sys.time()
>
> tmp <- (as.matrix(dat))
> cols <- col(tmp)
> cols[is.na(tmp)] <- Inf
> my.cols <- apply(cols, 1, min)
> my.values <- tmp[cbind(1:nrow(tmp), my.cols)]
>
> difftime(Sys.time(), t)
>
> Y una pregunta: ¿alguien programa en R base todavía?
>
> Un saludo,
>
> Carlos J. Gil Bellosta
> http://www.datanalytics.com
>
> El 27 de octubre de 2016, 18:11, Olivier Nuñez <onunez en unex.es>
escribió:
>
>> Por último, utilizando la indexación lineal de matriz que propusó
luisfo
>> en su momento:
>>
>>> t <- Sys.time()
>>> M=as.matrix(dat)
>>> index <- which(!is.na(M)) - 1
>>> meses<-colnames(M)
>>> M2<- data.table(columna=index %/% nrow(M) +1L, jugador=index %%
nrow(M)
>> +1L , valor=M[index+1L])
>>> setkey(M2,jugador,columna)
>>> M2[,.(First_month=meses[columna[1]],Value_First_month>>
valor[1]),by=jugador]
>>          jugador First_month Value_First_month
>>       1:       1         Uno        0.93520715
>>       2:       2         Uno        0.85930634
>>       3:       3         dos        0.13521503
>>       4:       4         Uno        0.86996341
>>       5:       5         dos        0.65879889
>>      ---
>>   99996:   99996         Uno        0.94728423
>>   99997:   99997         Uno        0.24088571
>>   99998:   99998         Uno        0.07458581
>>   99999:   99999         Uno        0.30535050
>> 100000:  100000         Uno        0.54640585
>>> difftime( Sys.time(), t)
>> Time difference of 0.3299999 secs
>> ----- Mensaje original -----
>> De: Javier Villacampa González <javier.villacampa.gonzalez en
gmail.com>
>> Para: Olivier Nuñez <onunez en unex.es>
>> CC: R ayuda <r-help-es en r-project.org>
>> Enviado: Thu, 27 Oct 2016 17:17:12 +0200 (CEST)
>> Asunto: Re: [R-es] Encontrar la primera columna no NA
>>
>> Hemos mejorado bastante desde el inicio. Pero aun andamos lejos. Igual
es
>> por el merge que hago. Seguire mirando
>> library(microbenchmark)
>> N <- 1e1
>> tabla <-
>>    microbenchmark(
>>      # JVG_dplyr ={
>>      #   dat %>%
>>      #     apply( MARGIN = 1, FUN >>      #             
function(x){
>>      #                which( !is.na(x)  ) %>%  min( na.rm = TRUE )
%>%
>> return()
>>      #                }
>>      #     )
>>      #   dat[ , First_month := First_month]
>>      #   N_for <- length( unique(First_month ))
>>      #   for( j in 1:N_for){
>>      #     x <- dat[  First_month == j,  j,  with = FALSE]
>>      #     dat[ First_month == j , Value_First_month := x ]
>>      #   }
>>      # },
>>      JVG ={
>>        dat <-
>>          data.table( Uno    = sample( c(runif(numero) , rep(NA , numero
/2e0
>> )) , size = numero ) ,
>>                      dos    = sample( c(runif(numero) , rep(NA , numero
/1e1
>> )) , size = numero ) ,
>>                      tres   = sample( c(runif(numero) , rep(NA , numero
/2e1
>> )) , size = numero ) ,
>>                      cuatro = sample( c(runif(numero) , rep(NA , numero
/1e2
>> )) , size = numero ) ,
>>                      cinco  = sample( c(runif(numero) , rep(NA , numero
/2e2
>> )) , size = numero ) ,
>>                      seis   = sample( c(runif(numero) , rep(NA , numero
/1e3
>> )) , size = numero )
>>          )
>>
>>        apply(X = dat,  MARGIN = 1, FUN >>               
function(x){
>>                  return(   min(  which( !is.na(x)  ),  na.rm = TRUE ) )
>>                }
>>        )
>>        dat[ , First_month := First_month]
>>        N_for <- length( unique(First_month ))
>>        for( j in 1:N_for){
>>          x <- dat[  First_month == j,  j,  with = FALSE]
>>          dat[ First_month == j , Value_First_month := x ]
>>        }
>>      },
>>      Olivier ={
>>      dat <-
>>        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero
/2e0
>> )) , size = numero ) ,
>>                    dos    = sample( c(runif(numero) , rep(NA , numero
/1e1
>> )) , size = numero ) ,
>>                    tres   = sample( c(runif(numero) , rep(NA , numero
/2e1
>> )) , size = numero ) ,
>>                    cuatro = sample( c(runif(numero) , rep(NA , numero
/1e2
>> )) , size = numero ) ,
>>                    cinco  = sample( c(runif(numero) , rep(NA , numero
/2e2
>> )) , size = numero ) ,
>>                    seis   = sample( c(runif(numero) , rep(NA , numero
/1e3
>> )) , size = numero )
>>        )
>>        dat[,First_month       := apply(X = .SD,MARGIN = 1,FUN =
function(x)
>> colnames(.SD)[min(which(!is.na(x)))])]
>>        dat[,Value_First_month := apply(X = .SD,MARGIN = 1,FUN =
function(x)
>> x[min(which(!is.na(x)))])]
>>      },
>>      Olivier2={
>>        dat <-
>>          data.table( Uno    = sample( c(runif(numero) , rep(NA , numero
/2e0
>> )) , size = numero ) ,
>>                      dos    = sample( c(runif(numero) , rep(NA , numero
/1e1
>> )) , size = numero ) ,
>>                      tres   = sample( c(runif(numero) , rep(NA , numero
/2e1
>> )) , size = numero ) ,
>>                      cuatro = sample( c(runif(numero) , rep(NA , numero
/1e2
>> )) , size = numero ) ,
>>                      cinco  = sample( c(runif(numero) , rep(NA , numero
/2e2
>> )) , size = numero ) ,
>>                      seis   = sample( c(runif(numero) , rep(NA , numero
/1e3
>> )) , size = numero )
>>          )
>>
>>        dat[,jugador:=1:.N]
>>        dat2=melt(dat,id.vars="jugador")
>>        setkey(dat2,jugador)
>>        dat2[,index:=min(which(!is.na(value))),by=jugador]
>>        dat3 <- dat2[,list(First_month_Olivier
>> =variable[index[1]],Value_First_month_Olivier
>> =value[index[1]]),by=jugador]
>>        setkey(x = dat, jugador)
>>        dat0 <- merge( x = dat, y = dat3, all.x = TRUE, all.y =
FALSE)
>>
>>      },
>>      times = N, unit = "s")
>>
>> tabla %>%  print
>> beepr::beep(3)
>>
>> # Unit: seconds
>> #       expr       min        lq      mean    median        uq      max
>> neval
>> # JVG       0.6479002 0.7518933 0.8673007 0.8216553 0.9137195 1.251891
>> 10
>> # Olivier  2.9568197 3.6663586 3.9364770 3.9069826 4.5619519 4.685842  
10
>> # Olivier2 1.1316970 1.4463621 1.4735507 1.4874366 1.5681243 1.631713  
10
>> # E comparativa ------------------------------
>> -----------------------------
>>
>> El 27 de octubre de 2016, 15:39, Olivier Nuñez <onunez en
unex.es> escribió:
>>
>>> Otra solución algo más rapida:
>>>> t <- Sys.time()
>>>> dat[,jugador:=1:.N]
>>>> dat2=melt(dat,id.vars="jugador")
>>>> setkey(dat2,jugador)
>>>> dat2[,index:=min(which(!is.na(value))),by=jugador]
>>>>
dat2[,.(First_month=variable[index[1]],Value_First_month>>>
value[index[1]]),by=jugador]
>>>          jugador First_month Value_First_month
>>>       1:       1         Uno        0.93520715
>>>       2:       2         Uno        0.85930634
>>>       3:       3         dos        0.13521503
>>>       4:       4         Uno        0.86996341
>>>       5:       5         dos        0.65879889
>>>      ---
>>>   99996:   99996         Uno        0.94728423
>>>   99997:   99997         Uno        0.24088571
>>>   99998:   99998         Uno        0.07458581
>>>   99999:   99999         Uno        0.30535050
>>> 100000:  100000         Uno        0.54640585
>>>> difftime( Sys.time(), t)
>>> Time difference of 1.060787 secs
>>>
>>>
>>> ----- Mensaje original -----
>>> De: "Olivier Nuñez" <onunez en unex.es>
>>> Para: "Javier Villacampa González"
<javier.villacampa.gonzalez en gmail.com
>>>
>>> CC: "R ayuda" <r-help-es en r-project.org>
>>> Enviados: Jueves, 27 de Octubre 2016 15:10:07
>>> Asunto: Re: [R-es] Encontrar la primera columna no NA
>>>
>>> Prueba lo siguiente, no es óptimo, pero creo va bastnate más rapido
que
>>> los que mencionaste:
>>>
>>> t <- Sys.time()
>>> dat[,First_month := apply(.SD,1,function(x)
colnames(.SD)[min(which(!
>> is.na
>>> (x)))])]
>>> dat[,Value_First_month := apply(.SD,1,function(x)
x[min(which(!is.na
>>> (x)))])]
>>> difftime( Sys.time(), t)
>>>
>>> Time difference of 3.478778 secs
>>>
>>>
>>> ----- Mensaje original -----
>>> De: "Javier Villacampa González"
<javier.villacampa.gonzalez en gmail.com>
>>> Para: "R ayuda" <r-help-es en r-project.org>
>>> Enviados: Jueves, 27 de Octubre 2016 13:43:19
>>> Asunto: [R-es] Encontrar la primera columna no NA
>>>
>>> Imaginemos que tenemos una matriz con datos temporales por sujetos.
>>> Pongamos que numero de veces que ha jugado una carta en un juego
online.
>> Y
>>> que quiero saber cuantas veces jugo la carta el primer mes que
estuvo en
>> el
>>> juego.
>>>
>>> Pero claro mi matriz guarda los datos temporalmente de tal manera
que:
>>>
>>> # data.table( Enero = c( 1, 4, NA , NA , NA) , Febrero = c( 2, 6,
1, NA,
>> NA
>>> ) , Marzo = c( 8,6,7,3, NA) ,  Abril = c( NA, 15, 5, 6,6 ))
>>> #    Enero Febrero Marzo Abril
>>> # 1:     1       2     8    NA
>>> # 2:     4       6     6    15
>>> # 3:    NA       1     7     5
>>> # 4:    NA      NA     3     6
>>> # 5:    NA      NA    NA     6
>>> # Suponiendo que cada fila es un jugador
>>> # En este caso la solucion debería ser
>>> # 1 para el primero que empezó en Enero
>>> # 4 para el segundo jugador  que empezó en Enero
>>> # 1 para el tercero  que empezó en Febrero
>>> # 3 Para el cuarto  que empezó en Marzo
>>> # 6 para el quinto  que empezó en Abril
>>>
>>>
>>> A alguno se os ocurre una solucion más eficiente que la siguiente.
Esto
>>> seguro que con data table o dplyr se puede. Ya he quitados los
pipes que
>>> facilitan la lectura pero que no se llevan bien con data.table.
Pero
>> estoy
>>> seguro que se puede mejorar más.
>>>
>>> #======================================================>>>
# Como ejemplo de codigo
>>> #======================================================>>>
# S Primera solucion ------------------------------
>>> ------------------------
>>> # First not NA colum per subject
>>> library(data.table)
>>> library(dplyr)
>>> set.seed(123456)
>>> numero <- 1e5
>>> dat <-
>>>    data.table( Uno  = sample( c(runif(numero) , rep(NA , numero
/2e0  )),
>>> size = numero ) ,
>>>              dos    = sample( c(runif(numero) , rep(NA , numero
/1e1  )),
>>> size = numero ) ,
>>>              tres   = sample( c(runif(numero) , rep(NA , numero
/2e1 )) ,
>>> size = numero ) ,
>>>              cuatro = sample( c(runif(numero) , rep(NA , numero
/1e2 )) ,
>>> size = numero ) ,
>>>              cinco  = sample( c(runif(numero) , rep(NA , numero
/2e2 )) ,
>>> size = numero ) ,
>>>              seis   = sample( c(runif(numero) , rep(NA , numero
/1e3 )) ,
>>> size = numero )
>>>              )
>>>
>>>
>>> t <- Sys.time()
>>> First_month <-
>>>    dat %>%
>>>    apply( MARGIN = 1, FUN >>>             function(x){
>>>               which( !is.na(x)  ) %>%  min( na.rm = TRUE )
%>%  return()
>>>             }
>>>    )
>>>
>>>
>>>
>>> First_month %>%  table %>%  prop.table
>>> dat[ , First_month := First_month]
>>> N_for <- length( unique(First_month ))
>>> for( j in 1:N_for){
>>>    x <- dat[  First_month == j,  j,  with = FALSE]
>>>    dat[ First_month == j , Value_First_month := x ]
>>> }
>>>
>>> dat %>%  print
>>> # dat %>%  summary
>>>
>>> cat( "===============================\n", difftime(
Sys.time(), t,
>> units >>> "min") , " minutos que cuesta
\n===============================\n" )
>>> beepr::beep(3)
>>> # E Primera solucion ------------------------------
>>> ------------------------
>>>
>>>
>>>
>>>
>>> # S comparativa ------------------------------
>>> -----------------------------
>>> library(microbenchmark)
>>> N <- 1e2
>>> tabla <-
>>>    microbenchmark(
>>>      JVG_dplyr ={  dat %>%
>>>          apply( MARGIN = 1, FUN >>>                  
function(x){
>>>                     which( !is.na(x)  ) %>%  min( na.rm = TRUE )
%>%
>>> return()
>>>                     }
>>>          )
>>>      },
>>>      JVG ={
>>>          apply(X = dat,  MARGIN = 1, FUN >>>              
function(x){
>>>                     return(   min(  which( !is.na(x)  ),  na.rm =
TRUE )
>> )
>>>                   }
>>>          )
>>>      },
>>>      times = N, unit = "s")
>>>
>>> tabla %>%  print
>>> beepr::beep(3)
>>>
>>> # Unit: seconds
>>> #       expr        min        lq       mean     median        uq
>>   max
>>> neval
>>> # JVG_dplyr 21.2321152 22.233428 22.9575357 22.5701781 23.444432
>>> 26.642730    10
>>> # JVG        0.7628928  0.843067  0.9260389  0.8495834  1.027036
>>> 1.295868    10
>>> # E comparativa ------------------------------
>>> -----------------------------
>>>
>>> --
>>>
>>>          [[alternative HTML version deleted]]
>>>
>>> _______________________________________________
>>> R-help-es mailing list
>>> R-help-es en r-project.org
>>> https://stat.ethz.ch/mailman/listinfo/r-help-es
>>>
>>> _______________________________________________
>>> R-help-es mailing list
>>> R-help-es en r-project.org
>>> https://stat.ethz.ch/mailman/listinfo/r-help-es
>>>
>>
>>
>> --
>>
>> _______________________________________________
>> R-help-es mailing list
>> R-help-es en r-project.org
>> https://stat.ethz.ch/mailman/listinfo/r-help-es
>>
> 	[[alternative HTML version deleted]]
>
> _______________________________________________
> R-help-es mailing list
> R-help-es en r-project.org
> https://stat.ethz.ch/mailman/listinfo/r-help-es

Jorge I Velez

2016-Oct-27 19:35 UTC

head link

[R-es] Encontrar la primera columna no NA

Excelente!

Aqui hay otra aproximación en base:

R> t <- Sys.time()
R> out <- apply(as.matrix(dat), 1, function(x) x[!is.na(x)][1])
R> difftime(Sys.time(), t)
## Time difference of 0.656173 secs

Nada mal :)

Saludos,
Jorge.-


2016-10-27 11:42 GMT-05:00 Carlos J. Gil Bellosta <cgb en
datanalytics.com>:
> Las operaciones con columnas de data.frames (y sus variantes modernas) son
> muy caras. Así que:
>
> t <- Sys.time()
>
> tmp <- (as.matrix(dat))
> cols <- col(tmp)
> cols[is.na(tmp)] <- Inf
> my.cols <- apply(cols, 1, min)
> my.values <- tmp[cbind(1:nrow(tmp), my.cols)]
>
> difftime(Sys.time(), t)
>
> Y una pregunta: ¿alguien programa en R base todavía?
>
> Un saludo,
>
> Carlos J. Gil Bellosta
> http://www.datanalytics.com
>
> El 27 de octubre de 2016, 18:11, Olivier Nuñez <onunez en unex.es>
escribió:
>
> >
> > Por último, utilizando la indexación lineal de matriz que propusó
luisfo
> > en su momento:
> >
> > > t <- Sys.time()
> > > M=as.matrix(dat)
> > > index <- which(!is.na(M)) - 1
> > > meses<-colnames(M)
> > > M2<- data.table(columna=index %/% nrow(M) +1L, jugador=index
%% nrow(M)
> > +1L , valor=M[index+1L])
> > > setkey(M2,jugador,columna)
> > > M2[,.(First_month=meses[columna[1]],Value_First_month> >
valor[1]),by=jugador]
> >         jugador First_month Value_First_month
> >      1:       1         Uno        0.93520715
> >      2:       2         Uno        0.85930634
> >      3:       3         dos        0.13521503
> >      4:       4         Uno        0.86996341
> >      5:       5         dos        0.65879889
> >     ---
> >  99996:   99996         Uno        0.94728423
> >  99997:   99997         Uno        0.24088571
> >  99998:   99998         Uno        0.07458581
> >  99999:   99999         Uno        0.30535050
> > 100000:  100000         Uno        0.54640585
> > > difftime( Sys.time(), t)
> > Time difference of 0.3299999 secs
> > >
> > ----- Mensaje original -----
> > De: Javier Villacampa González <javier.villacampa.gonzalez en
gmail.com>
> > Para: Olivier Nuñez <onunez en unex.es>
> > CC: R ayuda <r-help-es en r-project.org>
> > Enviado: Thu, 27 Oct 2016 17:17:12 +0200 (CEST)
> > Asunto: Re: [R-es] Encontrar la primera columna no NA
> >
> > Hemos mejorado bastante desde el inicio. Pero aun andamos lejos. Igual
es
> > por el merge que hago. Seguire mirando
> > library(microbenchmark)
> > N <- 1e1
> > tabla <-
> >   microbenchmark(
> >     # JVG_dplyr ={
> >     #   dat %>%
> >     #     apply( MARGIN = 1, FUN > >     #             
function(x){
> >     #                which( !is.na(x)  ) %>%  min( na.rm = TRUE )
%>%
> > return()
> >     #                }
> >     #     )
> >     #   dat[ , First_month := First_month]
> >     #   N_for <- length( unique(First_month ))
> >     #   for( j in 1:N_for){
> >     #     x <- dat[  First_month == j,  j,  with = FALSE]
> >     #     dat[ First_month == j , Value_First_month := x ]
> >     #   }
> >     # },
> >     JVG ={
> >       dat <-
> >         data.table( Uno    = sample( c(runif(numero) , rep(NA , numero
> /2e0
> > )) , size = numero ) ,
> >                     dos    = sample( c(runif(numero) , rep(NA , numero
> /1e1
> > )) , size = numero ) ,
> >                     tres   = sample( c(runif(numero) , rep(NA , numero
> /2e1
> > )) , size = numero ) ,
> >                     cuatro = sample( c(runif(numero) , rep(NA , numero
> /1e2
> > )) , size = numero ) ,
> >                     cinco  = sample( c(runif(numero) , rep(NA , numero
> /2e2
> > )) , size = numero ) ,
> >                     seis   = sample( c(runif(numero) , rep(NA , numero
> /1e3
> > )) , size = numero )
> >         )
> >
> >       apply(X = dat,  MARGIN = 1, FUN > >              
function(x){
> >                 return(   min(  which( !is.na(x)  ),  na.rm = TRUE ) )
> >               }
> >       )
> >       dat[ , First_month := First_month]
> >       N_for <- length( unique(First_month ))
> >       for( j in 1:N_for){
> >         x <- dat[  First_month == j,  j,  with = FALSE]
> >         dat[ First_month == j , Value_First_month := x ]
> >       }
> >     },
> >     Olivier ={
> >     dat <-
> >       data.table( Uno    = sample( c(runif(numero) , rep(NA , numero
/2e0
> > )) , size = numero ) ,
> >                   dos    = sample( c(runif(numero) , rep(NA , numero
/1e1
> > )) , size = numero ) ,
> >                   tres   = sample( c(runif(numero) , rep(NA , numero
/2e1
> > )) , size = numero ) ,
> >                   cuatro = sample( c(runif(numero) , rep(NA , numero
/1e2
> > )) , size = numero ) ,
> >                   cinco  = sample( c(runif(numero) , rep(NA , numero
/2e2
> > )) , size = numero ) ,
> >                   seis   = sample( c(runif(numero) , rep(NA , numero
/1e3
> > )) , size = numero )
> >       )
> >       dat[,First_month       := apply(X = .SD,MARGIN = 1,FUN >
function(x)
> > colnames(.SD)[min(which(!is.na(x)))])]
> >       dat[,Value_First_month := apply(X = .SD,MARGIN = 1,FUN >
function(x)
> > x[min(which(!is.na(x)))])]
> >     },
> >     Olivier2={
> >       dat <-
> >         data.table( Uno    = sample( c(runif(numero) , rep(NA , numero
> /2e0
> > )) , size = numero ) ,
> >                     dos    = sample( c(runif(numero) , rep(NA , numero
> /1e1
> > )) , size = numero ) ,
> >                     tres   = sample( c(runif(numero) , rep(NA , numero
> /2e1
> > )) , size = numero ) ,
> >                     cuatro = sample( c(runif(numero) , rep(NA , numero
> /1e2
> > )) , size = numero ) ,
> >                     cinco  = sample( c(runif(numero) , rep(NA , numero
> /2e2
> > )) , size = numero ) ,
> >                     seis   = sample( c(runif(numero) , rep(NA , numero
> /1e3
> > )) , size = numero )
> >         )
> >
> >       dat[,jugador:=1:.N]
> >       dat2=melt(dat,id.vars="jugador")
> >       setkey(dat2,jugador)
> >       dat2[,index:=min(which(!is.na(value))),by=jugador]
> >       dat3 <- dat2[,list(First_month_Olivier
> > =variable[index[1]],Value_First_month_Olivier
> > =value[index[1]]),by=jugador]
> >       setkey(x = dat, jugador)
> >       dat0 <- merge( x = dat, y = dat3, all.x = TRUE, all.y =
FALSE)
> >
> >     },
> >     times = N, unit = "s")
> >
> > tabla %>%  print
> > beepr::beep(3)
> >
> > # Unit: seconds
> > #       expr       min        lq      mean    median        uq     
max
> > neval
> > # JVG       0.6479002 0.7518933 0.8673007 0.8216553 0.9137195 1.251891
> > 10
> > # Olivier  2.9568197 3.6663586 3.9364770 3.9069826 4.5619519 4.685842
> 10
> > # Olivier2 1.1316970 1.4463621 1.4735507 1.4874366 1.5681243 1.631713
> 10
> > # E comparativa ------------------------------
> > -----------------------------
> >
> > El 27 de octubre de 2016, 15:39, Olivier Nuñez <onunez en
unex.es>
> escribió:
> >
> > > Otra solución algo más rapida:
> > > > t <- Sys.time()
> > > > dat[,jugador:=1:.N]
> > > > dat2=melt(dat,id.vars="jugador")
> > > > setkey(dat2,jugador)
> > > > dat2[,index:=min(which(!is.na(value))),by=jugador]
> > > > dat2[,.(First_month=variable[index[1]],Value_First_month>
> > value[index[1]]),by=jugador]
> > >         jugador First_month Value_First_month
> > >      1:       1         Uno        0.93520715
> > >      2:       2         Uno        0.85930634
> > >      3:       3         dos        0.13521503
> > >      4:       4         Uno        0.86996341
> > >      5:       5         dos        0.65879889
> > >     ---
> > >  99996:   99996         Uno        0.94728423
> > >  99997:   99997         Uno        0.24088571
> > >  99998:   99998         Uno        0.07458581
> > >  99999:   99999         Uno        0.30535050
> > > 100000:  100000         Uno        0.54640585
> > > > difftime( Sys.time(), t)
> > > Time difference of 1.060787 secs
> > >
> > >
> > > ----- Mensaje original -----
> > > De: "Olivier Nuñez" <onunez en unex.es>
> > > Para: "Javier Villacampa González"
<javier.villacampa.gonzalez@
> gmail.com
> > >
> > > CC: "R ayuda" <r-help-es en r-project.org>
> > > Enviados: Jueves, 27 de Octubre 2016 15:10:07
> > > Asunto: Re: [R-es] Encontrar la primera columna no NA
> > >
> > > Prueba lo siguiente, no es óptimo, pero creo va bastnate más
rapido que
> > > los que mencionaste:
> > >
> > > t <- Sys.time()
> > > dat[,First_month := apply(.SD,1,function(x)
colnames(.SD)[min(which(!
> > is.na
> > > (x)))])]
> > > dat[,Value_First_month := apply(.SD,1,function(x)
x[min(which(!is.na
> > > (x)))])]
> > > difftime( Sys.time(), t)
> > >
> > > Time difference of 3.478778 secs
> > >
> > >
> > > ----- Mensaje original -----
> > > De: "Javier Villacampa González"
<javier.villacampa.gonzalez en gmail.com
> >
> > > Para: "R ayuda" <r-help-es en r-project.org>
> > > Enviados: Jueves, 27 de Octubre 2016 13:43:19
> > > Asunto: [R-es] Encontrar la primera columna no NA
> > >
> > > Imaginemos que tenemos una matriz con datos temporales por
sujetos.
> > > Pongamos que numero de veces que ha jugado una carta en un juego
> online.
> > Y
> > > que quiero saber cuantas veces jugo la carta el primer mes que
estuvo
> en
> > el
> > > juego.
> > >
> > > Pero claro mi matriz guarda los datos temporalmente de tal manera
que:
> > >
> > > # data.table( Enero = c( 1, 4, NA , NA , NA) , Febrero = c( 2, 6,
1,
> NA,
> > NA
> > > ) , Marzo = c( 8,6,7,3, NA) ,  Abril = c( NA, 15, 5, 6,6 ))
> > > #    Enero Febrero Marzo Abril
> > > # 1:     1       2     8    NA
> > > # 2:     4       6     6    15
> > > # 3:    NA       1     7     5
> > > # 4:    NA      NA     3     6
> > > # 5:    NA      NA    NA     6
> > > # Suponiendo que cada fila es un jugador
> > > # En este caso la solucion debería ser
> > > # 1 para el primero que empezó en Enero
> > > # 4 para el segundo jugador  que empezó en Enero
> > > # 1 para el tercero  que empezó en Febrero
> > > # 3 Para el cuarto  que empezó en Marzo
> > > # 6 para el quinto  que empezó en Abril
> > >
> > >
> > > A alguno se os ocurre una solucion más eficiente que la
siguiente. Esto
> > > seguro que con data table o dplyr se puede. Ya he quitados los
pipes
> que
> > > facilitan la lectura pero que no se llevan bien con data.table.
Pero
> > estoy
> > > seguro que se puede mejorar más.
> > >
> > > #======================================================> >
> # Como ejemplo de codigo
> > > #======================================================> >
> # S Primera solucion ------------------------------
> > > ------------------------
> > > # First not NA colum per subject
> > > library(data.table)
> > > library(dplyr)
> > > set.seed(123456)
> > > numero <- 1e5
> > > dat <-
> > >   data.table( Uno  = sample( c(runif(numero) , rep(NA , numero
/2e0
> )),
> > > size = numero ) ,
> > >             dos    = sample( c(runif(numero) , rep(NA , numero
/1e1
> )),
> > > size = numero ) ,
> > >             tres   = sample( c(runif(numero) , rep(NA , numero
/2e1 ))
> ,
> > > size = numero ) ,
> > >             cuatro = sample( c(runif(numero) , rep(NA , numero
/1e2 ))
> ,
> > > size = numero ) ,
> > >             cinco  = sample( c(runif(numero) , rep(NA , numero
/2e2 ))
> ,
> > > size = numero ) ,
> > >             seis   = sample( c(runif(numero) , rep(NA , numero
/1e3 ))
> ,
> > > size = numero )
> > >             )
> > >
> > >
> > > t <- Sys.time()
> > > First_month <-
> > >   dat %>%
> > >   apply( MARGIN = 1, FUN > > >            function(x){
> > >              which( !is.na(x)  ) %>%  min( na.rm = TRUE )
%>%
> return()
> > >            }
> > >   )
> > >
> > >
> > >
> > > First_month %>%  table %>%  prop.table
> > > dat[ , First_month := First_month]
> > > N_for <- length( unique(First_month ))
> > > for( j in 1:N_for){
> > >   x <- dat[  First_month == j,  j,  with = FALSE]
> > >   dat[ First_month == j , Value_First_month := x ]
> > > }
> > >
> > > dat %>%  print
> > > # dat %>%  summary
> > >
> > > cat( "===============================\n", difftime(
Sys.time(), t,
> > units > > > "min") , " minutos que cuesta
\n===============================\n" )
> > > beepr::beep(3)
> > > # E Primera solucion ------------------------------
> > > ------------------------
> > >
> > >
> > >
> > >
> > > # S comparativa ------------------------------
> > > -----------------------------
> > > library(microbenchmark)
> > > N <- 1e2
> > > tabla <-
> > >   microbenchmark(
> > >     JVG_dplyr ={  dat %>%
> > >         apply( MARGIN = 1, FUN > > >                 
function(x){
> > >                    which( !is.na(x)  ) %>%  min( na.rm = TRUE
) %>%
> > > return()
> > >                    }
> > >         )
> > >     },
> > >     JVG ={
> > >         apply(X = dat,  MARGIN = 1, FUN > > >           
function(x){
> > >                    return(   min(  which( !is.na(x)  ),  na.rm =
TRUE
> )
> > )
> > >                  }
> > >         )
> > >     },
> > >     times = N, unit = "s")
> > >
> > > tabla %>%  print
> > > beepr::beep(3)
> > >
> > > # Unit: seconds
> > > #       expr        min        lq       mean     median        uq
> >  max
> > > neval
> > > # JVG_dplyr 21.2321152 22.233428 22.9575357 22.5701781 23.444432
> > > 26.642730    10
> > > # JVG        0.7628928  0.843067  0.9260389  0.8495834  1.027036
> > > 1.295868    10
> > > # E comparativa ------------------------------
> > > -----------------------------
> > >
> > > --
> > >
> > >         [[alternative HTML version deleted]]
> > >
> > > _______________________________________________
> > > R-help-es mailing list
> > > R-help-es en r-project.org
> > > https://stat.ethz.ch/mailman/listinfo/r-help-es
> > >
> > > _______________________________________________
> > > R-help-es mailing list
> > > R-help-es en r-project.org
> > > https://stat.ethz.ch/mailman/listinfo/r-help-es
> > >
> >
> >
> >
> > --
> >
> > _______________________________________________
> > R-help-es mailing list
> > R-help-es en r-project.org
> > https://stat.ethz.ch/mailman/listinfo/r-help-es
> >
>
>         [[alternative HTML version deleted]]
>
> _______________________________________________
> R-help-es mailing list
> R-help-es en r-project.org
> https://stat.ethz.ch/mailman/listinfo/r-help-es
>
	[[alternative HTML version deleted]]

R help es - Oct 2016 - Encontrar la primera columna no NA

[R-es] Encontrar la primera columna no NA

[R-es] Encontrar la primera columna no NA

[R-es] Encontrar la primera columna no NA

[R-es] Encontrar la primera columna no NA

[R-es] Encontrar la primera columna no NA