thr3ads.net - R help es - [R-es] Encontrar la primera columna no NA [Oct 2016]

If this information is useful, please help other people find it:
Share via:

Adolfo Álvarez

2016-Oct-28 11:48 UTC

[R-es] Encontrar la primera columna no NA

Hola a todos, me ha gustado mucho la solución de Carlos, muy eficiente y
muy ingeniosa al utilizar la funcion col() que o no la conocia o no me
acordaba de ella.

La parte mas "lenta" sigue siendo el apply que en el fondo no es mas
que un
ciclo for a traves de las filas, asi que inspirado por el metodo de Carlos
pense que podria ser mas rapido si iteramos a traves de las columnas por lo
que en general seran menos iteraciones. He incluido esta modificacion en el
benchmark, es un poco menos elegante que la original de Carlos pero algo
mas rapida. Seguro que aun se puede mejorar un poco mas en R base o
incorporar Rcpp, pero creo que al menos por mi parte llego hasta aqui.

Muy interesante tanto el problema como las soluciones propuestas, un saludo!
Adolfo.

library(microbenchmark)
library(data.table)
library(dplyr)
library(tidyr)
set.seed(123456)
numero <- 1e5
N <- 1e1
tabla <-
  microbenchmark(
    JVG ={
      dat <-
        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
        )) , size = numero ) ,
        dos    = sample( c(runif(numero) , rep(NA , numero /1e1
        )) , size = numero ) ,
        tres   = sample( c(runif(numero) , rep(NA , numero /2e1
        )) , size = numero ) ,
        cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
        )) , size = numero ) ,
        cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
        )) , size = numero ) ,
        seis   = sample( c(runif(numero) , rep(NA , numero /1e3
        )) , size = numero )
        )
    First_month <-
      apply(X = dat,  MARGIN = 1, FUN               function(x){
                return(   min(  which( !is.na(x)  ),  na.rm = TRUE ) )
              }
      )
      dat[ , First_month := First_month]
      N_for <- length( unique(First_month ))
      for( j in 1:N_for){
        x <- dat[  First_month == j,  j,  with = FALSE]
        dat[ First_month == j , Value_First_month := x ]
      }
    },
    Olivier ={
      dat <-
        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
        )) , size = numero ) ,
        dos    = sample( c(runif(numero) , rep(NA , numero /1e1
        )) , size = numero ) ,
        tres   = sample( c(runif(numero) , rep(NA , numero /2e1
        )) , size = numero ) ,
        cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
        )) , size = numero ) ,
        cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
        )) , size = numero ) ,
        seis   = sample( c(runif(numero) , rep(NA , numero /1e3
        )) , size = numero )
        )
      dat[,First_month       := apply(X = .SD,MARGIN = 1,FUN = function(x)
        colnames(.SD)[min(which(!is.na(x)))])]
      dat[,Value_First_month := apply(X = .SD,MARGIN = 1,FUN = function(x)
        x[min(which(!is.na(x)))])]
    },
    Olivier2={
      dat <-
        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
        )) , size = numero ) ,
        dos    = sample( c(runif(numero) , rep(NA , numero /1e1
        )) , size = numero ) ,
        tres   = sample( c(runif(numero) , rep(NA , numero /2e1
        )) , size = numero ) ,
        cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
        )) , size = numero ) ,
        cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
        )) , size = numero ) ,
        seis   = sample( c(runif(numero) , rep(NA , numero /1e3
        )) , size = numero )
        )

      dat[,jugador:=1:.N]
      dat2=melt(dat,id.vars="jugador")
      setkey(dat2,jugador)
      dat2[,index:=min(which(!is.na(value))),by=jugador]
      dat3 <- dat2[,list(First_month_Olivier
                         =variable[index[1]],Value_First_month_Olivier
=value[index[1]]),by=jugador]
      setkey(x = dat, jugador)
      dat0 <- merge( x = dat, y = dat3, all.x = TRUE, all.y = FALSE)

    },

    Adolfo = {

      dat <-
        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
        )) , size = numero ) ,
        dos    = sample( c(runif(numero) , rep(NA , numero /1e1
        )) , size = numero ) ,
        tres   = sample( c(runif(numero) , rep(NA , numero /2e1
        )) , size = numero ) ,
        cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
        )) , size = numero ) ,
        cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
        )) , size = numero ) ,
        seis   = sample( c(runif(numero) , rep(NA , numero /1e3
        )) , size = numero )
        )
      # 1) Creamos una columna con la informacion de los jugadores,
      # Como es un jugador por fila, hacemos 1:nrow.
      step1 <- dat %>%
        mutate(player = 1:nrow(dat))

      #2) Convertimos las columnas de tiempo (uno, dos, tres, ...) en dos
      # columnas, mes y numero de juegos. (Ojo, asumimos que en los datos
las
      #                                    columnas estan ordenadas como en
el ejemplo, es decir uno, dos, tres y no
      #                                    tres, uno, dos)
      #
      step2 <- gather(step1, month, games, -player)

      #y 3) Filtramos los meses con NA y por cada jugador nos quedamos con
      # el primer dato:
        step3 <- step2 %>%
        filter(!is.na(games)) %>%
        group_by(player) %>%
        slice(1)
    },

    Olivier3 = {
      dat <-
        data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
        )) , size = numero ) ,
        dos    = sample( c(runif(numero) , rep(NA , numero /1e1
        )) , size = numero ) ,
        tres   = sample( c(runif(numero) , rep(NA , numero /2e1
        )) , size = numero ) ,
        cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
        )) , size = numero ) ,
        cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
        )) , size = numero ) ,
        seis   = sample( c(runif(numero) , rep(NA , numero /1e3
        )) , size = numero )
        )
      M=as.matrix(dat)
      index <- which(!is.na(M)) - 1
      meses<-colnames(M)
      M2<- data.table(columna=index %/% nrow(M) +1L, jugador=index %%
                        nrow(M) +1L , valor=M[index+1L])
      setkey(M2,jugador,columna)


M2[,.(First_month=meses[columna[1]],Value_First_month=valor[1]),by=jugador]
    },
    GilBellosta = {

      dat <-
        data.frame( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
        )) , size = numero ) ,
        dos    = sample( c(runif(numero) , rep(NA , numero /1e1
        )) , size = numero ) ,
        tres   = sample( c(runif(numero) , rep(NA , numero /2e1
        )) , size = numero ) ,
        cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
        )) , size = numero ) ,
        cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
        )) , size = numero ) ,
        seis   = sample( c(runif(numero) , rep(NA , numero /1e3
        )) , size = numero )
        )
      tmp <- (as.matrix(dat))
      cols <- col(tmp)
      cols[is.na(tmp)] <- Inf
      my.cols <- apply(cols, 1, min)
      my.values <- tmp[cbind(1:nrow(tmp), my.cols)]
    },
    Adolfo2 = {
      dat <-
        data.frame( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
        )) , size = numero ) ,
        dos    = sample( c(runif(numero) , rep(NA , numero /1e1
        )) , size = numero ) ,
        tres   = sample( c(runif(numero) , rep(NA , numero /2e1
        )) , size = numero ) ,
        cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
        )) , size = numero ) ,
        cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
        )) , size = numero ) ,
        seis   = sample( c(runif(numero) , rep(NA , numero /1e3
        )) , size = numero )
        )
      tmp <- (as.matrix(dat))
      cols <- col(tmp)
      cols[is.na(tmp)] <- NA
      my.cols <- cols[,ncol(cols)]
      for (j in (ncol(cols)-1):1){
        my.cols <- ifelse(is.na(cols[,j]), my.cols, cols[,j])
      }
      my.values <- tmp[cbind(1:nrow(tmp), my.cols)]
    },
    times = N, unit = "s")
> tablaUnit: seconds
        expr       min        lq      mean    median        uq       max
neval
         JVG 1.0458327 1.3045354 1.3660296 1.3486868 1.4004353 2.0389759
 10
     Olivier 4.4031746 4.6501372 4.9638930 4.9841975 5.2855783 5.5569627
 10
    Olivier2 1.7937688 2.1531256 2.4749540 2.5052893 2.8389349 3.0933835
 10
      Adolfo 0.3520900 0.3615358 0.4764479 0.3942295 0.5072621 1.0266727
 10
    Olivier3 0.3936536 0.4454847 0.5254894 0.4784246 0.5269834 0.8900983
 10
 GilBellosta 0.2721629 0.3097020 0.3901691 0.3466332 0.4294069 0.7126116
 10
     Adolfo2 0.1110292 0.1611071 0.1812212 0.1639743 0.2007791 0.2948245
 10

	[[alternative HTML version deleted]]

Carlos J. Gil Bellosta

2016-Oct-28 13:19 UTC

head link

[R-es] Encontrar la primera columna no NA

Inspirado por Adolfo, otra vueltica de tuerca:

#matrices mejor que dfs
tmp <- as.matrix(dat)

# min implementado a mano:
my.cols <- rep(ncol(tmp), nrow(tmp))
for (i in (ncol(tmp) - 1):1)
  my.cols[!is.na(tmp[,i])] <- i

# al canasto:
my.values <- tmp[cbind(1:nrow(tmp), my.cols)]


Un saludo,

Carlos J. Gil Bellosta
http://www.datanalytics.com

El 28 de octubre de 2016, 13:48, Adolfo Álvarez <adalvarez en gmail.com>
escribió:
> Hola a todos, me ha gustado mucho la solución de Carlos, muy eficiente y
> muy ingeniosa al utilizar la funcion col() que o no la conocia o no me
> acordaba de ella.
>
> La parte mas "lenta" sigue siendo el apply que en el fondo no es
mas que un
> ciclo for a traves de las filas, asi que inspirado por el metodo de Carlos
> pense que podria ser mas rapido si iteramos a traves de las columnas por lo
> que en general seran menos iteraciones. He incluido esta modificacion en el
> benchmark, es un poco menos elegante que la original de Carlos pero algo
> mas rapida. Seguro que aun se puede mejorar un poco mas en R base o
> incorporar Rcpp, pero creo que al menos por mi parte llego hasta aqui.
>
> Muy interesante tanto el problema como las soluciones propuestas, un
> saludo!
> Adolfo.
>
> library(microbenchmark)
> library(data.table)
> library(dplyr)
> library(tidyr)
> set.seed(123456)
> numero <- 1e5
> N <- 1e1
> tabla <-
>   microbenchmark(
>     JVG ={
>       dat <-
>         data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
>         )) , size = numero ) ,
>         dos    = sample( c(runif(numero) , rep(NA , numero /1e1
>         )) , size = numero ) ,
>         tres   = sample( c(runif(numero) , rep(NA , numero /2e1
>         )) , size = numero ) ,
>         cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
>         )) , size = numero ) ,
>         cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
>         )) , size = numero ) ,
>         seis   = sample( c(runif(numero) , rep(NA , numero /1e3
>         )) , size = numero )
>         )
>     First_month <-
>       apply(X = dat,  MARGIN = 1, FUN >               function(x){
>                 return(   min(  which( !is.na(x)  ),  na.rm = TRUE ) )
>               }
>       )
>       dat[ , First_month := First_month]
>       N_for <- length( unique(First_month ))
>       for( j in 1:N_for){
>         x <- dat[  First_month == j,  j,  with = FALSE]
>         dat[ First_month == j , Value_First_month := x ]
>       }
>     },
>     Olivier ={
>       dat <-
>         data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
>         )) , size = numero ) ,
>         dos    = sample( c(runif(numero) , rep(NA , numero /1e1
>         )) , size = numero ) ,
>         tres   = sample( c(runif(numero) , rep(NA , numero /2e1
>         )) , size = numero ) ,
>         cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
>         )) , size = numero ) ,
>         cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
>         )) , size = numero ) ,
>         seis   = sample( c(runif(numero) , rep(NA , numero /1e3
>         )) , size = numero )
>         )
>       dat[,First_month       := apply(X = .SD,MARGIN = 1,FUN = function(x)
>         colnames(.SD)[min(which(!is.na(x)))])]
>       dat[,Value_First_month := apply(X = .SD,MARGIN = 1,FUN = function(x)
>         x[min(which(!is.na(x)))])]
>     },
>     Olivier2={
>       dat <-
>         data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
>         )) , size = numero ) ,
>         dos    = sample( c(runif(numero) , rep(NA , numero /1e1
>         )) , size = numero ) ,
>         tres   = sample( c(runif(numero) , rep(NA , numero /2e1
>         )) , size = numero ) ,
>         cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
>         )) , size = numero ) ,
>         cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
>         )) , size = numero ) ,
>         seis   = sample( c(runif(numero) , rep(NA , numero /1e3
>         )) , size = numero )
>         )
>
>       dat[,jugador:=1:.N]
>       dat2=melt(dat,id.vars="jugador")
>       setkey(dat2,jugador)
>       dat2[,index:=min(which(!is.na(value))),by=jugador]
>       dat3 <- dat2[,list(First_month_Olivier
>                          =variable[index[1]],Value_First_month_Olivier
> =value[index[1]]),by=jugador]
>       setkey(x = dat, jugador)
>       dat0 <- merge( x = dat, y = dat3, all.x = TRUE, all.y = FALSE)
>
>     },
>
>     Adolfo = {
>
>       dat <-
>         data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
>         )) , size = numero ) ,
>         dos    = sample( c(runif(numero) , rep(NA , numero /1e1
>         )) , size = numero ) ,
>         tres   = sample( c(runif(numero) , rep(NA , numero /2e1
>         )) , size = numero ) ,
>         cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
>         )) , size = numero ) ,
>         cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
>         )) , size = numero ) ,
>         seis   = sample( c(runif(numero) , rep(NA , numero /1e3
>         )) , size = numero )
>         )
>       # 1) Creamos una columna con la informacion de los jugadores,
>       # Como es un jugador por fila, hacemos 1:nrow.
>       step1 <- dat %>%
>         mutate(player = 1:nrow(dat))
>
>       #2) Convertimos las columnas de tiempo (uno, dos, tres, ...) en dos
>       # columnas, mes y numero de juegos. (Ojo, asumimos que en los datos
> las
>       #                                    columnas estan ordenadas como en
> el ejemplo, es decir uno, dos, tres y no
>       #                                    tres, uno, dos)
>       #
>       step2 <- gather(step1, month, games, -player)
>
>       #y 3) Filtramos los meses con NA y por cada jugador nos quedamos con
>       # el primer dato:
>         step3 <- step2 %>%
>         filter(!is.na(games)) %>%
>         group_by(player) %>%
>         slice(1)
>     },
>
>     Olivier3 = {
>       dat <-
>         data.table( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
>         )) , size = numero ) ,
>         dos    = sample( c(runif(numero) , rep(NA , numero /1e1
>         )) , size = numero ) ,
>         tres   = sample( c(runif(numero) , rep(NA , numero /2e1
>         )) , size = numero ) ,
>         cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
>         )) , size = numero ) ,
>         cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
>         )) , size = numero ) ,
>         seis   = sample( c(runif(numero) , rep(NA , numero /1e3
>         )) , size = numero )
>         )
>       M=as.matrix(dat)
>       index <- which(!is.na(M)) - 1
>       meses<-colnames(M)
>       M2<- data.table(columna=index %/% nrow(M) +1L, jugador=index %%
>                         nrow(M) +1L , valor=M[index+1L])
>       setkey(M2,jugador,columna)
>
>
> M2[,.(First_month=meses[columna[1]],Value_First_month>
valor[1]),by=jugador]
>     },
>     GilBellosta = {
>
>       dat <-
>         data.frame( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
>         )) , size = numero ) ,
>         dos    = sample( c(runif(numero) , rep(NA , numero /1e1
>         )) , size = numero ) ,
>         tres   = sample( c(runif(numero) , rep(NA , numero /2e1
>         )) , size = numero ) ,
>         cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
>         )) , size = numero ) ,
>         cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
>         )) , size = numero ) ,
>         seis   = sample( c(runif(numero) , rep(NA , numero /1e3
>         )) , size = numero )
>         )
>       tmp <- (as.matrix(dat))
>       cols <- col(tmp)
>       cols[is.na(tmp)] <- Inf
>       my.cols <- apply(cols, 1, min)
>       my.values <- tmp[cbind(1:nrow(tmp), my.cols)]
>     },
>     Adolfo2 = {
>       dat <-
>         data.frame( Uno    = sample( c(runif(numero) , rep(NA , numero /2e0
>         )) , size = numero ) ,
>         dos    = sample( c(runif(numero) , rep(NA , numero /1e1
>         )) , size = numero ) ,
>         tres   = sample( c(runif(numero) , rep(NA , numero /2e1
>         )) , size = numero ) ,
>         cuatro = sample( c(runif(numero) , rep(NA , numero /1e2
>         )) , size = numero ) ,
>         cinco  = sample( c(runif(numero) , rep(NA , numero /2e2
>         )) , size = numero ) ,
>         seis   = sample( c(runif(numero) , rep(NA , numero /1e3
>         )) , size = numero )
>         )
>       tmp <- (as.matrix(dat))
>       cols <- col(tmp)
>       cols[is.na(tmp)] <- NA
>       my.cols <- cols[,ncol(cols)]
>       for (j in (ncol(cols)-1):1){
>         my.cols <- ifelse(is.na(cols[,j]), my.cols, cols[,j])
>       }
>       my.values <- tmp[cbind(1:nrow(tmp), my.cols)]
>     },
>     times = N, unit = "s")
>
> > tabla
> Unit: seconds
>         expr       min        lq      mean    median        uq       max
> neval
>          JVG 1.0458327 1.3045354 1.3660296 1.3486868 1.4004353 2.0389759
>  10
>      Olivier 4.4031746 4.6501372 4.9638930 4.9841975 5.2855783 5.5569627
>  10
>     Olivier2 1.7937688 2.1531256 2.4749540 2.5052893 2.8389349 3.0933835
>  10
>       Adolfo 0.3520900 0.3615358 0.4764479 0.3942295 0.5072621 1.0266727
>  10
>     Olivier3 0.3936536 0.4454847 0.5254894 0.4784246 0.5269834 0.8900983
>  10
>  GilBellosta 0.2721629 0.3097020 0.3901691 0.3466332 0.4294069 0.7126116
>  10
>      Adolfo2 0.1110292 0.1611071 0.1812212 0.1639743 0.2007791 0.2948245
>  10
>
>         [[alternative HTML version deleted]]
>
> _______________________________________________
> R-help-es mailing list
> R-help-es en r-project.org
> https://stat.ethz.ch/mailman/listinfo/r-help-es
>
	[[alternative HTML version deleted]]

Adolfo Álvarez

2016-Oct-28 17:01 UTC

head link

[R-es] Encontrar la primera columna no NA

Se ha borrado un adjunto en formato HTML...
URL:
<https://stat.ethz.ch/pipermail/r-help-es/attachments/20161028/24c17d52/attachment-0001.html>

Apparently Analagous Threads

Search for more reasonably related threads

R help es - Oct 2016 - Encontrar la primera columna no NA

[R-es] Encontrar la primera columna no NA

[R-es] Encontrar la primera columna no NA

[R-es] Encontrar la primera columna no NA

Apparently Analagous Threads