Dear R-mates,
# Here's what I am trying to do. I have a dataset like this:
id = c(rep(1,8), rep(2,8))
dur1 <- c( 17,18,19,18,24,19,24,24 )
est1 <- c( rep(1,5), rep(2,3) )
dur2 <- c(1,1,3,4,8,12,13,14)
est2 <- rep(1,8)
mydata = data.frame(id,
estat=c(est1, est2),
durat=c(dur1, dur2))
# I want to one have this:
id = c(rep(1,8), rep(2,8))
dur1 <- c( 17,18,19,20,28,1,2,3 )
est1 <- c( rep(1,5), rep(2,3) )
dur2 <- c(1,2,3,4,12,13,14,15)
est2 <- rep(1,8)
mydata2 = = data.frame(id,
estat=c(est1, est2),
durat=c(dur1, dur2))
# What is happening here? I have a longitudinal dataset.
# Individuals are observed 8 times, and each time each of them are in a
certain state J (here, J={1,2}).
# Each observation is one unit of time away from the following one, except
observations 4 and 5, which are 8 units of time away from each other.
# So here we have individual 1 migrating from state 1 to state 2 at
observation #6,
# while individual 2 stays in state 1 as long as we can observe her.
# I am interested in the spell (duration) of each state.
# However, the durations are clearly mismesuared, and now I am trying to
give some consistency to the data.
# I am assuming that the first duration is correct. Departing from this, I
wrote the following function:
d <- function(dur,est)
{
if ( sum( diff(est) )==0 ) # for those who didn't change state
{
for( i in c(2:4))
dur[i] <- dur[i-1] + 1
dur[5] <- dur[4] + 8
for( i in c(6:8) )
dur[i] <- dur[i-1] + 1
}
if ( sum( diff(est) )!=0 ) # for those who changed state
{
j = which(diff(est)!=0) + 1 # j is when the change occured
dur[j] = 1
k0 = which( c(1:8) < j )[-c(1)]
k1 = which( c(1:8) > j )
if(length(j) > 1)
{
for( i in 1:(length(j)-1) )
k2 = c(1:8)[c(1:8)> j[i] & c(1:8)< j[i+1]]
k = unique( c(k0,k1,k2) )
}
k = unique( c(k0,k1) )
k = k[!k%in%j]
if(5%in%k)
{
k = k[k != 5]
for(i in k[k<5])
dur[i] = dur[i-1] + 1
dur[5] = dur[4] + 8
for(i in k[k>5])
dur[i] = dur[i-1] + 1
} else
{
for(i in k)
dur[i] = dur[i-1] + 1
}
}
dur
}
# Now, if a do
d(dur1, est1)
# and
d(dur2,est2)
# I get what I want, except from the fact that I couldn't do this for a
large dataset.
# So I decide to use tapply. But this gives me
new.durat <- tapply(mydata$durat, IND=mydata$id, FUN=d,
est=mydata$estat)
mydata$new.durat <- unlist(new.durat)
> mydata
id estat durat new.durat
1 1 1 17 17
2 1 1 18 18
3 1 1 19 19
4 1 1 18 20
5 1 1 24 28
6 1 2 19 29
7 1 2 24 30
8 1 2 24 31
9 2 1 1 1
10 2 1 1 2
11 2 1 3 3
12 2 1 4 4
13 2 1 8 12
14 2 1 12 13
15 2 1 13 14
16 2 1 14 15
# what is not what I want. I can't figure it out why, but when I use tapply,
# the logical expression "sum( diff(est) )==0" turns out to be true
for both
individuals
# (whereas we know this is true only for individual #2).
# I am sorry for the long message. I will be very grateful for any help with
this problem.
[[alternative HTML version deleted]]
On 4/26/06, Dimitri Szerman <dimitrijoe at ipea.gov.br> wrote:> Dear R-mates, > > # Here's what I am trying to do. I have a dataset like this: > > id = c(rep(1,8), rep(2,8)) > dur1 <- c( 17,18,19,18,24,19,24,24 ) > est1 <- c( rep(1,5), rep(2,3) ) > dur2 <- c(1,1,3,4,8,12,13,14) > est2 <- rep(1,8) > > mydata = data.frame(id, > estat=c(est1, est2), > durat=c(dur1, dur2)) > > > # I want to one have this: > > id = c(rep(1,8), rep(2,8)) > dur1 <- c( 17,18,19,20,28,1,2,3 ) > est1 <- c( rep(1,5), rep(2,3) ) > dur2 <- c(1,2,3,4,12,13,14,15) > est2 <- rep(1,8) > > mydata2 = = data.frame(id, > estat=c(est1, est2), > durat=c(dur1, dur2)) > > > # What is happening here? I have a longitudinal dataset. > # Individuals are observed 8 times, and each time each of them are in a > certain state J (here, J={1,2}). > # Each observation is one unit of time away from the following one, except > observations 4 and 5, which are 8 units of time away from each other. > # So here we have individual 1 migrating from state 1 to state 2 at > observation #6, > # while individual 2 stays in state 1 as long as we can observe her. > # I am interested in the spell (duration) of each state. > # However, the durations are clearly mismesuared, and now I am trying to > give some consistency to the data. > # I am assuming that the first duration is correct. Departing from this, I > wrote the following function: > > d <- function(dur,est) > { > if ( sum( diff(est) )==0 ) # for those who didn't change state > { > for( i in c(2:4)) > dur[i] <- dur[i-1] + 1 > > dur[5] <- dur[4] + 8 > > for( i in c(6:8) ) > dur[i] <- dur[i-1] + 1 > } > if ( sum( diff(est) )!=0 ) # for those who changed state > { > j = which(diff(est)!=0) + 1 # j is when the change occured > dur[j] = 1 > > k0 = which( c(1:8) < j )[-c(1)] > k1 = which( c(1:8) > j ) > if(length(j) > 1) > { > for( i in 1:(length(j)-1) ) > k2 = c(1:8)[c(1:8)> j[i] & c(1:8)< j[i+1]] > k = unique( c(k0,k1,k2) ) > } > k = unique( c(k0,k1) ) > k = k[!k%in%j] > if(5%in%k) > { > k = k[k != 5] > for(i in k[k<5]) > dur[i] = dur[i-1] + 1 > > dur[5] = dur[4] + 8 > > for(i in k[k>5]) > dur[i] = dur[i-1] + 1 > } else > { > for(i in k) > dur[i] = dur[i-1] + 1 > } > } > dur > > } > > # Now, if a do > > d(dur1, est1) > # and > d(dur2,est2) > # I get what I want, except from the fact that I couldn't do this for a > large dataset. > # So I decide to use tapply. But this gives me > > new.durat <- tapply(mydata$durat, IND=mydata$id, FUN=d, > est=mydata$estat) > mydata$new.durat <- unlist(new.durat) > > > mydata > id estat durat new.durat > 1 1 1 17 17 > 2 1 1 18 18 > 3 1 1 19 19 > 4 1 1 18 20 > 5 1 1 24 28 > 6 1 2 19 29 > 7 1 2 24 30 > 8 1 2 24 31 > 9 2 1 1 1 > 10 2 1 1 2 > 11 2 1 3 3 > 12 2 1 4 4 > 13 2 1 8 12 > 14 2 1 12 13 > 15 2 1 13 14 > 16 2 1 14 15 > > # what is not what I want. I can't figure it out why, but when I use tapply, > # the logical expression "sum( diff(est) )==0" turns out to be true for both > individuals > # (whereas we know this is true only for individual #2). > # I am sorry for the long message. I will be very grateful for any help with > this problem.I didn't try to read all this carefully but I think you want to tapply over the indices so you can use them in both columns: with(mydata, unlist(tapply(seq(id), id, function(i) d(durat[i], estat[i]))) ) or use by: unlist(by(mydata, mydata$id, function(x) d(x$durat, x$estat)))