Eugeniusz Kałuża
2011-Jul-28 09:47 UTC
[R] _: how to replace values in x by means in subgroups created in ...(not loops)
Re:_: how to replace values in x by means in subgroups created in ...(not loops) Thanks, below some code and reply: #_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_# #------------------------------------------------------- # my slow function with loops: # replace_x_by_locallyMean_x_4_0s_in_y.f(x,y) #------------------------------------------------------- #------------------------------------------------------- # replace_x_by_locallyMean_x_4_0s_in_y.f # Arguments: function takes vectors x, y # these vector have the same length: length(x)==length(y) # function replaces in x # values that have in indexes in y # at least one zero in the same position in y # by locally compted mean of values in x # from all local x that also have continous 000 in their positions close to this x # in y vector #x # 0 1 2 3 4 5 6 7 8 9 #y # 0 0 0 1 0 0 0 1 1 1 # Value expected: #x_m # 1.5 1.5 1.5 3.0 5.0 5.0 5.0 7.0 8.0 9.0 #------------------------------------------------------- # author: me # replace_x_by_locallyMean_x_4_0s_in_y.f(x,y) #------------------------------------------------------- #initialisation Dff<-NULL index_Dff.start_0<-NULL index_Dff.stop_0<-NULL x<-NULL y<-NULL replace_x_by_locallyMean_x_4_0s_in_y.f<-NULL replace_x_by_locallyMean_x_4_0s_in_y.f<-function(x=Df$x, y=Df$y){ Dff<-as.data.frame(cbind(as.vector(x),as.vector(y))) colnames(Dff)<-c('x','y') index_Dff.start_0<-which( ( (Dff$y==0) & rev(rev(c(0,Dff$y))[-1]) !=0) ) if (Dff$y[1]==0) { index_Dff.start_0=c(1,index_Dff.start_0) } index_Dff.stop_0<-which( ( (Dff$y==0) & c(Dff$y[-1],rev(Dff$y)[1]) !=0) ) if (rev(Dff$y)[1]==0) { index_Dff.stop_0=c(index_Dff.stop_0,length(x)) } Dff$x_m<-Dff$x for (i in ( 1:length(index_Dff.start_0) ) ) { Dff$x_m[index_Dff.start_0[i]:index_Dff.stop_0[i]]<-mean( Dff$x[index_Dff.start_0[i]:index_Dff.stop_0[i]],na.rm=T) } for (i in ( 1:length(index_Dff.start_0) ) ) { Dff$x_m[index_Dff.start_0[i]:index_Dff.stop_0[i]]<-mean( Dff$x[index_Dff.start_0[i]:index_Dff.stop_0[i]],na.rm=T) } Dff$x_m } #replace_x_by_locallyMean_x_4_0s_in_y.f() #------------------------------------------------------- # replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry # Arguments: function takes vectors x, y # these vector have the same length: length(x)==length(y) # function replaces in x # values that have in indexes in y # at least one zero in the same position in y # by locally compted mean of values in x # from all local x that also have continous 000 in their positions close to this x # in y vector #x # 0 1 2 3 4 5 6 7 8 9 #y # 0 0 0 1 0 0 0 1 1 1 # Value expected: #x_m # 1.5 1.5 1.5 3.0 5.0 5.0 5.0 7.0 8.0 9.0 #------------------------------------------------------- # author: Thierry # replace_x_by_Avgx_for000iny_Thierry.f(x,y) #------------------------------------------------------- #initialisation dataset <- NULL tmp <- NULL result <- NULL replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry<-NULL library(plyr) replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry<-function(x=x, y=y){ dataset <- data.frame(x = x, y = y) dataset$Group <- cumsum(c(0, diff(!is.na(dataset$y) & dataset$y == 0)) == 1) #library(plyr) tmp <- ddply(subset(dataset, y == 0), .(Group), function(z){c(Mean = mean(z$x, na.rm = TRUE))}) result <- merge(dataset, tmp) result$Mean[is.na(result$y) | result$y != 0] <- result$x[is.na(result$y) | result$y != 0] #result result$Mean } #replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry(x,y) # Let's test it for longer vector x<-1:(50*1000) y<-sample(c(0,1), length(x), replace = TRUE) # for Thierry we must start by 1 if it start by 0, y<-c(0,1,sample(c(0,1), length(x)-2, replace = TRUE) ) paste(length(x),length(y)) # let's comapre times: # slow solution with loops, my function x_by_locallyMean_x_4_0s_in_y__loop() # system.time( # x_by_locallyMean_x_4_0s_in_y__loop<- # replace_x_by_locallyMean_x_4_0s_in_y.f(x,y) # ) # _for_ x<-1:(50*1000) # # user system elapsed # 760.34 0.86 769.16 #--------------------------------------------------------------- # time of Thierry's function : system.time( x_by_locallyMean_x_4_0s_in_y__Thierry<- replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry(x,y) ) #Df.x_m_Thierry (10x faster) then my loop # _for_ x<-1:500000 # user system elapsed # 11.61 0.00 11.68 # good news: # 769.16 sec / 11.68 sec => Thierry's solution is 65 (x) times faster than loops, # great! length(x_by_locallyMean_x_4_0s_in_y__Thierry) cbind(x,y,x_by_locallyMean_x_4_0s_in_y__Thierry) [1:15,] paste(' good, faster than my loops ') #1. Thierry, your function is fast, () # and it does perfectly the problem that I proposed. #2. Could we ask for a little modification in that code, # to make it more general, # more generaal, i.e. to do it working when y starts from "1" y=c(1,... x<-1:(50*1000) y<-c(1,sample(c(0,1), length(x)-1, replace = TRUE) ) #------------------------------------------------------------------------- #--------------------------------------------------------------------- # next to try in looking for faster solutions mayby: #require(sqldf) # centered moving average of length 7 #set.seed(1) #DF <- data.frame(x = rnorm(15, 1:15)) #s18 <- sqldf("select a.x x, avg(b.x) movavgx from DF a, DF b # where a.row_names - b.row_names between -3 and 3 # group by a.row_names having count(*) = 7 # order by a.row_names+0", # row.names = TRUE) #r18 <- data.frame(x = DF[4:12,], movavgx = rowMeans(embed(DF$x, 7))) #row.names(r18) <- NULL #all.equal(r18, s18) #--------------------------------------------------------------------- #_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_# -----Wiadomo¶æ oryginalna----->Od: ONKELINX, Thierry [mailto:Thierry.ONKELINX@inbo.be] >Wys³ano: ¦r 2011-07-27 11:36 >Do: Eugeniusz Ka³u¿a; r-help@r-project.org >Temat: RE: how to replace values in x by means in subgroups created in ...(not loops) > >Something like this? > >dataset <- data.frame(x = x, y = y) >dataset$Group <- cumsum(c(0, diff(!is.na(dataset$y) & dataset$y == 0)) == 1) >library(plyr) >tmp <- ddply(subset(dataset, y == 0), .(Group), function(z){c(Mean = mean(z$x, na.rm = TRUE))}) >result <- merge(dataset, tmp) >result$Mean[is.na(result$y) | result$y != 0] <- result$x[is.na(result$y) | result$y != 0] > >Best regards, > >Thierry>> -----Oorspronkelijk bericht----- >> Van: r-help-bounces@r-project.org [mailto:r-help-bounces@r-project.org] >> Namens Eugeniusz Kaluza >> Verzonden: woensdag 27 juli 2011 11:21 >> Aan: r-help@r-project.org >> Onderwerp: [R] how to replace values in x by means in subgroups created in ... >> (not loops) >> >> >> # Dear all, >> # how to replace values in x by means in subgroups created in ... >> # replace only these values where y=0 in continous sequence # replace by mean >> calculated locally for each subgroup created by # continous sequence of 0,0,0 in >> parallel y vector, i.e. >> # where there is continous sequence of 0 in data frame vector y >> # but we do not replace values in x[i], if y[i]!=0 >> # we do not want use loops we do not use apply (not very fast) >> >> >> x <-c(0 ,1,2,3,4,5,6,7,8,NA,NA,1 ,1 ,NA,2 ,2) >> y <-c(0 ,0,0,1,0,0,0,1,1,1 ,NA,0 ,0 ,0 ,0 ,1) >> Must_be<-c(1 ,1,1,3,5,5,5,7,8,NA,NA,1.5,1 ,NA,1.5,2) >> >> (df<-as.data.frame(cbind(x,y)) ) >> >> # I have traied many bad colusions based on cumsum, pmin, pmax, ... >> (mean_dfx_if_yIs0<-y*cumsum(x*y)/(cumsum(y)*y) ) >> >> # how to do this? >> # thans for any advice >> # E[[alternative HTML version deleted]]