arun
2014-Jun-06 03:09 UTC
[R] How do I do a conditional sum which only looks between certain date criteria
Hi, The expected output is confusing. dat1 <- read.table(text="date, user, items_bought 2013-01-01, x, 2 2013-01-02, x, 1 2013-01-03, x, 0 2013-01-04, x, 0 2013-01-05, x, 3 2013-01-06, x, 1 2013-01-01, y, 1 2013-01-02, y, 1 2013-01-03, y, 0 2013-01-04, y, 5 2013-01-05, y, 6 2013-01-06, y, 1",sep=",",header=TRUE,stringsAsFactors=FALSE) ##Assuming that the data is ordered by date and no gaps in date res1 <- unsplit(lapply(split(dat1, dat1$user), function(x) { indx <- (seq(nrow(x)) - 1)%/%3 x$cum_items_bought_3_days <- ave(x$items_bought, indx, FUN = cumsum) x }), dat1$user) ##expected output res2 <- unsplit(lapply(split(dat1, dat1$user), function(x) { indx <- (seq(nrow(x)) - 1)%/%3 x$cum_items_bought_3_days <- ave(x$items_bought, indx, FUN = cumsum) indx2 <- seq(0, length(indx), by = 4) x[indx2, 4] <- x[indx2, 4] + indx[indx2] x }), dat1$user) A.K. Say I have data that looks like date, user, items_bought 2013-01-01, x, 2 2013-01-02, x, 1 2013-01-03, x, 0 2013-01-04, x, 0 2013-01-05, x, 3 2013-01-06, x, 1 2013-01-01, y, 1 2013-01-02, y, 1 2013-01-03, y, 0 2013-01-04, y, 5 2013-01-05, y, 6 2013-01-06, y, 1 to get the cumulative sum per user per data point I was doing data.frame(cum_items_bought=unlist(tapply(as.numeric(data$items_bought), data$user, FUN = cumsum))) output from this looks like date, user, items_bought 2013-01-01, x, 2 2013-01-02, x, 3 2013-01-03, x, 3 2013-01-04, x, 3 2013-01-05, x, 6 2013-01-06, x, 7 2013-01-01, y, 1 2013-01-02, y, 2 2013-01-03, y, 2 2013-01-04, y, 7 2013-01-05, y, 13 2013-01-06, y, 14 However I want to restrict my sum to only add up those that happened within 3 days of each row (relative to the user). i.e. the output needs to look like this: date, user, cum_items_bought_3_days 2013-01-01, x, 2 2013-01-02, x, 3 2013-01-03, x, 3 2013-01-04, x, 1 2013-01-05, x, 3 2013-01-06, x, 4 2013-01-01, y, 1 2013-01-02, y, 2 2013-01-03, y, 2 2013-01-04, y, 6 2013-01-05, y, 11 2013-01-06, y, 12
arun
2014-Jun-06 14:25 UTC
[R] How do I do a conditional sum which only looks between certain date criteria
If the data is like: dat1 <- read.table(text="date, user, items_bought 2013-01-01, x, 2 2013-01-02, x, 1 2013-01-05, x, 1 2013-01-06, x, 5 2013-01-11, x, 3 2013-01-14, x, 1 2013-01-01, y, 1 2013-01-02, y, 1 2013-01-03, y, 0 2013-01-04, y, 5 2013-01-05, y, 6 2013-01-06, y, 1",sep=",",header=TRUE,stringsAsFactors=FALSE) dat1$date <- as.Date(dat1$date) res3 <- unsplit(lapply(split(dat1, dat1$user), function(x) { ??? date1 <- seq(min(x$date), max(x$date), by = "1 day") ??? indx <- (seq_along(date1) - 1)%/%3 ??? indx1 <- date1 %in% x$date ??? x$cum_times_bought_3_days <- ave(x$items_bought, indx[indx1], FUN = cumsum) ??? indx2 <- seq(0, length(indx), 4) ??? indx3 <- c(FALSE, diff(indx[indx1]) > 0) ??? x[indx3, 4] <- x[indx3, 4] + indx[indx2] ??? x }), dat1$user) ##Here also, the same confusion persists. A.K. On Thursday, June 5, 2014 11:09 PM, arun <smartpink111 at yahoo.com> wrote: Hi, The expected output is confusing. dat1 <- read.table(text="date, user, items_bought 2013-01-01, x, 2 2013-01-02, x, 1 2013-01-03, x, 0 2013-01-04, x, 0 2013-01-05, x, 3 2013-01-06, x, 1 2013-01-01, y, 1 2013-01-02, y, 1 2013-01-03, y, 0 2013-01-04, y, 5 2013-01-05, y, 6 2013-01-06, y, 1",sep=",",header=TRUE,stringsAsFactors=FALSE) ##Assuming that the data is ordered by date and no gaps in date res1 <- unsplit(lapply(split(dat1, dat1$user), function(x) { ? ? indx <- (seq(nrow(x)) - 1)%/%3 ? ? x$cum_items_bought_3_days <- ave(x$items_bought, indx, FUN = cumsum) ? ? x }), dat1$user) ##expected output res2 <- unsplit(lapply(split(dat1, dat1$user), function(x) { ? ? indx <- (seq(nrow(x)) - 1)%/%3 ? ? x$cum_items_bought_3_days <- ave(x$items_bought, indx, FUN = cumsum) ? ? indx2 <- seq(0, length(indx), by = 4) ? ? x[indx2, 4] <- x[indx2, 4] + indx[indx2] ? ? x }), dat1$user) A.K. Say I have data that looks like date, user, items_bought 2013-01-01, x, 2 2013-01-02, x, 1 2013-01-03, x, 0 2013-01-04, x, 0 2013-01-05, x, 3 2013-01-06, x, 1 2013-01-01, y, 1 2013-01-02, y, 1 2013-01-03, y, 0 2013-01-04, y, 5 2013-01-05, y, 6 2013-01-06, y, 1 to get the cumulative sum per user per data point I was doing data.frame(cum_items_bought=unlist(tapply(as.numeric(data$items_bought), data$user, FUN = cumsum))) output from this looks like date, user, items_bought 2013-01-01, x, 2 2013-01-02, x, 3 2013-01-03, x, 3 2013-01-04, x, 3 2013-01-05, x, 6 2013-01-06, x, 7 2013-01-01, y, 1 2013-01-02, y, 2 2013-01-03, y, 2 2013-01-04, y, 7 2013-01-05, y, 13 2013-01-06, y, 14 However I want to restrict my sum to only add up those that happened within 3 days of each row (relative to the user). i.e. the output needs to look like this: date, user, cum_items_bought_3_days 2013-01-01, x, 2 2013-01-02, x, 3 2013-01-03, x, 3 2013-01-04, x, 1 2013-01-05, x, 3 2013-01-06, x, 4 2013-01-01, y, 1 2013-01-02, y, 2 2013-01-03, y, 2 2013-01-04, y, 6 2013-01-05, y, 11 2013-01-06, y, 12