Eugeniusz Kałuża
2011-Jul-28 09:47 UTC
[R] _: how to replace values in x by means in subgroups created in ...(not loops)
Re:_: how to replace values in x by means in subgroups created in ...(not loops)
Thanks, below some code and reply:
#_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_#
#-------------------------------------------------------
# my slow function with loops:
# replace_x_by_locallyMean_x_4_0s_in_y.f(x,y)
#-------------------------------------------------------
#-------------------------------------------------------
# replace_x_by_locallyMean_x_4_0s_in_y.f
# Arguments: function takes vectors x, y
# these vector have the same length: length(x)==length(y)
# function replaces in x
# values that have in indexes in y
# at least one zero in the same position in y
# by locally compted mean of values in x
# from all local x that also have continous 000 in their positions
close to this x
# in y vector
#x # 0 1 2 3 4 5 6 7 8 9
#y # 0 0 0 1 0 0 0 1 1 1
# Value expected:
#x_m # 1.5 1.5 1.5 3.0 5.0 5.0 5.0 7.0 8.0 9.0
#-------------------------------------------------------
# author: me
# replace_x_by_locallyMean_x_4_0s_in_y.f(x,y)
#-------------------------------------------------------
#initialisation
Dff<-NULL
index_Dff.start_0<-NULL
index_Dff.stop_0<-NULL
x<-NULL
y<-NULL
replace_x_by_locallyMean_x_4_0s_in_y.f<-NULL
replace_x_by_locallyMean_x_4_0s_in_y.f<-function(x=Df$x,
y=Df$y){
Dff<-as.data.frame(cbind(as.vector(x),as.vector(y)))
colnames(Dff)<-c('x','y')
index_Dff.start_0<-which( ( (Dff$y==0) &
rev(rev(c(0,Dff$y))[-1]) !=0) )
if (Dff$y[1]==0) {
index_Dff.start_0=c(1,index_Dff.start_0)
}
index_Dff.stop_0<-which( ( (Dff$y==0) &
c(Dff$y[-1],rev(Dff$y)[1]) !=0) )
if (rev(Dff$y)[1]==0) {
index_Dff.stop_0=c(index_Dff.stop_0,length(x))
}
Dff$x_m<-Dff$x
for (i in ( 1:length(index_Dff.start_0) ) ) {
Dff$x_m[index_Dff.start_0[i]:index_Dff.stop_0[i]]<-mean(
Dff$x[index_Dff.start_0[i]:index_Dff.stop_0[i]],na.rm=T)
}
for (i in ( 1:length(index_Dff.start_0) ) ) {
Dff$x_m[index_Dff.start_0[i]:index_Dff.stop_0[i]]<-mean(
Dff$x[index_Dff.start_0[i]:index_Dff.stop_0[i]],na.rm=T)
}
Dff$x_m
}
#replace_x_by_locallyMean_x_4_0s_in_y.f()
#-------------------------------------------------------
# replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry
# Arguments: function takes vectors x, y
# these vector have the same length: length(x)==length(y)
# function replaces in x
# values that have in indexes in y
# at least one zero in the same position in y
# by locally compted mean of values in x
# from all local x that also have continous 000 in their positions
close to this x
# in y vector
#x # 0 1 2 3 4 5 6 7 8 9
#y # 0 0 0 1 0 0 0 1 1 1
# Value expected:
#x_m # 1.5 1.5 1.5 3.0 5.0 5.0 5.0 7.0 8.0 9.0
#-------------------------------------------------------
# author: Thierry
# replace_x_by_Avgx_for000iny_Thierry.f(x,y)
#-------------------------------------------------------
#initialisation
dataset <- NULL
tmp <- NULL
result <- NULL
replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry<-NULL
library(plyr)
replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry<-function(x=x,
y=y){
dataset <- data.frame(x = x, y = y)
dataset$Group <- cumsum(c(0, diff(!is.na(dataset$y) & dataset$y == 0))
== 1)
#library(plyr)
tmp <- ddply(subset(dataset, y == 0), .(Group), function(z){c(Mean =
mean(z$x, na.rm = TRUE))})
result <- merge(dataset, tmp)
result$Mean[is.na(result$y) | result$y != 0] <- result$x[is.na(result$y) |
result$y != 0]
#result
result$Mean
} #replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry(x,y)
# Let's test it for longer vector
x<-1:(50*1000)
y<-sample(c(0,1), length(x), replace = TRUE)
# for Thierry we must start by 1 if it start by 0,
y<-c(0,1,sample(c(0,1), length(x)-2, replace = TRUE) )
paste(length(x),length(y))
# let's comapre times:
# slow solution with loops, my function x_by_locallyMean_x_4_0s_in_y__loop()
# system.time(
# x_by_locallyMean_x_4_0s_in_y__loop<-
# replace_x_by_locallyMean_x_4_0s_in_y.f(x,y)
# )
# _for_ x<-1:(50*1000) #
# user system elapsed
# 760.34 0.86 769.16
#---------------------------------------------------------------
# time of Thierry's function :
system.time(
x_by_locallyMean_x_4_0s_in_y__Thierry<-
replace_x_by_locallyMean_x_4_0s_in_y.f__Thierry(x,y)
)
#Df.x_m_Thierry (10x faster) then my loop
# _for_ x<-1:500000
# user system elapsed
# 11.61 0.00 11.68
# good news:
# 769.16 sec / 11.68 sec => Thierry's solution is 65 (x) times faster
than loops,
# great!
length(x_by_locallyMean_x_4_0s_in_y__Thierry)
cbind(x,y,x_by_locallyMean_x_4_0s_in_y__Thierry) [1:15,]
paste(' good, faster than my loops ')
#1. Thierry, your function is fast, ()
# and it does perfectly the problem that I proposed.
#2. Could we ask for a little modification in that code,
# to make it more general,
# more generaal, i.e. to do it working when y starts from "1"
y=c(1,...
x<-1:(50*1000)
y<-c(1,sample(c(0,1), length(x)-1, replace = TRUE) )
#-------------------------------------------------------------------------
#---------------------------------------------------------------------
# next to try in looking for faster solutions mayby:
#require(sqldf)
# centered moving average of length 7
#set.seed(1)
#DF <- data.frame(x = rnorm(15, 1:15))
#s18 <- sqldf("select a.x x, avg(b.x) movavgx from DF a, DF b
# where a.row_names - b.row_names between -3 and 3
# group by a.row_names having count(*) = 7
# order by a.row_names+0",
# row.names = TRUE)
#r18 <- data.frame(x = DF[4:12,], movavgx = rowMeans(embed(DF$x, 7)))
#row.names(r18) <- NULL
#all.equal(r18, s18)
#---------------------------------------------------------------------
#_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_#
-----Wiadomo¶æ oryginalna----->Od: ONKELINX, Thierry [mailto:Thierry.ONKELINX@inbo.be]
>Wys³ano: ¦r 2011-07-27 11:36
>Do: Eugeniusz Ka³u¿a; r-help@r-project.org
>Temat: RE: how to replace values in x by means in subgroups created in
...(not loops)
>
>Something like this?
>
>dataset <- data.frame(x = x, y = y)
>dataset$Group <- cumsum(c(0, diff(!is.na(dataset$y) & dataset$y ==
0)) == 1)
>library(plyr)
>tmp <- ddply(subset(dataset, y == 0), .(Group), function(z){c(Mean =
mean(z$x, na.rm = TRUE))})
>result <- merge(dataset, tmp)
>result$Mean[is.na(result$y) | result$y != 0] <- result$x[is.na(result$y)
| result$y != 0]
>
>Best regards,
>
>Thierry
>> -----Oorspronkelijk bericht-----
>> Van: r-help-bounces@r-project.org [mailto:r-help-bounces@r-project.org]
>> Namens Eugeniusz Kaluza
>> Verzonden: woensdag 27 juli 2011 11:21
>> Aan: r-help@r-project.org
>> Onderwerp: [R] how to replace values in x by means in subgroups created
in ...
>> (not loops)
>>
>>
>> # Dear all,
>> # how to replace values in x by means in subgroups created in ...
>> # replace only these values where y=0 in continous sequence # replace
by mean
>> calculated locally for each subgroup created by # continous sequence of
0,0,0 in
>> parallel y vector, i.e.
>> # where there is continous sequence of 0 in data frame vector y
>> # but we do not replace values in x[i], if y[i]!=0
>> # we do not want use loops we do not use apply (not very fast)
>>
>>
>> x <-c(0 ,1,2,3,4,5,6,7,8,NA,NA,1 ,1 ,NA,2 ,2)
>> y <-c(0 ,0,0,1,0,0,0,1,1,1 ,NA,0 ,0 ,0 ,0 ,1)
>> Must_be<-c(1 ,1,1,3,5,5,5,7,8,NA,NA,1.5,1 ,NA,1.5,2)
>>
>> (df<-as.data.frame(cbind(x,y)) )
>>
>> # I have traied many bad colusions based on cumsum, pmin, pmax, ...
>> (mean_dfx_if_yIs0<-y*cumsum(x*y)/(cumsum(y)*y) )
>>
>> # how to do this?
>> # thans for any advice
>> # E
[[alternative HTML version deleted]]
