arun
2013-Oct-18 20:19 UTC
[R] Loop for taking sum of rows based on proximity to other non-NA rows
Hi, May be this helps: dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L, 14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L, 29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA, NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L )), .Names = c("Position", "Count"), class = "data.frame", row.names = c(NA, -23L)) #There might be simple solutions. fun1 <- function(dat,n) { ?rl <- rle(is.na(dat[,"Count"])) indx <- which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n] ?lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) { ??? ??? ??? ??? ???????? x1 <- dat[c(min(x)-1L,x,max(x)+1L),] ??? ??? ??? ??? ??? ?x2 <- x1[!is.na(x1$Count),] ??? ??? ??? ??? ??? ?datN <- data.frame(Position=max(x2$Position),Count=sum(x2$Count)) ??? ??? ??? ??? ??? ?rowN <- row.names(x2)[x2$Position %in% max(x2$Position)]??? ??? ??? ??? ??? ??? ?row.names(datN) <- if(length(rowN)>1) rowN[1] else rowN ??? ??? ??? ??? ??? ?datN ??? ??? ??? ??? ??? }) names(lst1) <- NULL dat2 <- do.call(rbind,lst1) indx2 <-? sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE)) dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],] dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2 row.names(dat1New) <- 1:nrow(dat1New) dat1New } dat1N <- fun1(dat1,1) dat1N ?? Position Count 1??????? 15??? 15 2??????? 22??? NA 3??????? 38??? NA 4??????? 61??? 22 5??????? 62??? 18 6??????? 14??? NA 7??????? 29??? NA 8??????? 63??? NA 9??????? 46??? 28 10?????? 24??? NA 11?????? 22??? NA 12?????? 49??? 16 13?????? 42??? 19 14?????? 38??? NA 15?????? 29??? NA 16?????? 22??? NA 17?????? 42??? 46 dat2N <- fun1(dat1N,2) dat2N ?? Position Count 1??????? 61??? 37 2??????? 62??? 18 3??????? 14??? NA 4??????? 29??? NA 5??????? 63??? NA 6??????? 49??? 44 7??????? 42??? 19 8??????? 38??? NA 9??????? 29??? NA 10?????? 22??? NA 11?????? 42??? 46 dat3N <- fun1(dat2N,3) dat3N ? Position Count 1?????? 61??? 37 2?????? 62??? 62 3?????? 42??? 65 A.K. Hi all, I have a dataset with 2 important columns, "Position" and "Count". There are a total of 34,532 rows, but only 457 non-NA values in the "Count" column (every cell in "Position" column has a value). I need to write a loop to march down the rows, and if there are 2 rows in "Count" where there is only 1 NA row between them, sum the two values up and print only one row with the summed Count value and the Position value that corresponds to the larger Count value, thus making the three rows into one. For example: Position Count 15 15 22 NA 38 NA 49 5 55 NA 61 17 would become Position Count 15 15 22 NA 38 NA 61 22 After this step, I also need to write another script to march down the rows and look for rows with only two NA's between non-NA rows in Count. This would make the previous data become Position Count 61 37 Ideally I would like a loop that can be flexibly adjusted to the number of NA's in between adjacent non-NA values that can be freely changed. I would greatly appreciate any insight for this.
arun
2013-Oct-19 04:31 UTC
[R] Loop for taking sum of rows based on proximity to other non-NA rows
Hi, Found a bug in the function when tested.? So, try this (added one more line): #Modified function fun1 <- function(dat,n) { ?rl <- rle(is.na(dat[,"Count"])) indx <- which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n] ?lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) { ???????????????????????? x1 <- dat[c(min(x)-1L,x,max(x)+1L),] ???????????????????? x2 <- x1[!is.na(x1$Count),] ???????????????????? datN <- data.frame(Position=max(x2$Position),Count=sum(x2$Count)) ???????????????????? rowN <- row.names(x2)[x2$Position %in% max(x2$Position)]?? ???????????????????? row.names(datN) <- if(length(rowN)>1) rowN[1] else rowN ???????????????????? datN ??????????????????? }) names(lst1) <- NULL lst1 <- lst1[!duplicated(sapply(lst1,row.names))] ######added dat2 <- do.call(rbind,lst1) indx2 <-? sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE)) dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],] dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2 row.names(dat1New) <- 1:nrow(dat1New) dat1New } #Another function fun2 <- function(dat,n){ ?indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"]))))) ?indx1 <- indx[is.na(dat[,"Count"])] ?names(indx1) <- which(is.na(dat[,"Count"])) indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]] lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) { ??? ??? ??? ??? ??? ??? ??? x1 <- indx2[i] ??? ??? ??? ??? ??? ??? ??? ?x2 <- as.numeric(names(x1)) ??? ??? ??? ??? ??? ??? ??? ?x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),] ??? ??? ??? ??? ??? ??? ??? ?x4 <- subset(x3, !is.na(Count)) ??? ??? ??? ??? ??? ??? ??? ?x5 <- data.frame(Position=max(x4$Position),Count=sum(x4$Count)) ??? ??? ??? ??? ??? ??? ??? ind <- x4$Position %in% max(x4$Position) ??? ??? ??? ??? ??? ??? ??? ?row.names(x5) <- if(sum(ind)>1) row.names(x4)[ind][1] else row.names(x4)[ind] ??? ??? ??? ??? ??? ??? ??? x5 ??? ??? ??? ??? ??? ??? }) attr(lst1,"dimnames") <- NULL ?dat2 <- do.call(rbind,lst1) indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i) {x1 <- indx2[i] ??? ??? ??? ??? ??? ??? ??? ??? ??? ?x2 <- as.numeric(names(x1)) ??? ??? ??? ??? ??? ??? ??? ??? ??? ?c(min(x2)-1L, x2, max(x2)+1L)}),use.names=FALSE)) dat$id <- 1:nrow(dat) dat2$id <- as.numeric(row.names(dat2)) library(plyr) res <- join(dat,dat2[,-1],by="id",type="left") res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),] res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])] res2 <- res1[,1:2] row.names(res2) <- 1:nrow(res2) res2 } identical(fun1(dat1,1),fun2(dat1,1)) #[1] TRUE identical(fun1(fun1(dat1,1),2),fun2(fun2(dat1,1),2)) #[1] TRUE identical(fun1(fun1(fun1(dat1,1),2),3),fun2(fun2(fun2(dat1,1),2),3)) #[1] TRUE #Speed set.seed(185) datT <- data.frame(Position = sample(10:80,1e5,replace=TRUE),Count= sample(c(NA, 10:100),1e5, replace=TRUE)) ?system.time(res <- fun1(datT,1)) ?#? user? system elapsed ?# 0.676?? 0.000?? 0.676 ?system.time(res2 <- fun2(datT,1)) #?? user? system elapsed #? 1.240?? 0.000?? 1.237 ?identical(res,res2) #[1] TRUE A.K. On Friday, October 18, 2013 4:19 PM, arun <smartpink111 at yahoo.com> wrote: Hi, May be this helps: dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L, 14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L, 29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA, NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L )), .Names = c("Position", "Count"), class = "data.frame", row.names = c(NA, -23L)) #There might be simple solutions. fun1 <- function(dat,n) { ?rl <- rle(is.na(dat[,"Count"])) indx <- which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n] ?lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) { ??? ??? ??? ??? ???????? x1 <- dat[c(min(x)-1L,x,max(x)+1L),] ??? ??? ??? ??? ??? ?x2 <- x1[!is.na(x1$Count),] ??? ??? ??? ??? ??? ?datN <- data.frame(Position=max(x2$Position),Count=sum(x2$Count)) ??? ??? ??? ??? ??? ?rowN <- row.names(x2)[x2$Position %in% max(x2$Position)]??? ??? ??? ??? ??? ??? ?row.names(datN) <- if(length(rowN)>1) rowN[1] else rowN ??? ??? ??? ??? ??? ?datN ??? ??? ??? ??? ??? }) names(lst1) <- NULL dat2 <- do.call(rbind,lst1) indx2 <-? sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE)) dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],] dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2 row.names(dat1New) <- 1:nrow(dat1New) dat1New } dat1N <- fun1(dat1,1) dat1N ?? Position Count 1??????? 15??? 15 2??????? 22??? NA 3??????? 38??? NA 4??????? 61??? 22 5??????? 62??? 18 6??????? 14??? NA 7??????? 29??? NA 8??????? 63??? NA 9??????? 46??? 28 10?????? 24??? NA 11?????? 22??? NA 12?????? 49??? 16 13?????? 42??? 19 14?????? 38??? NA 15?????? 29??? NA 16?????? 22??? NA 17?????? 42??? 46 dat2N <- fun1(dat1N,2) dat2N ?? Position Count 1??????? 61??? 37 2??????? 62??? 18 3??????? 14??? NA 4??????? 29??? NA 5??????? 63??? NA 6??????? 49??? 44 7??????? 42??? 19 8??????? 38??? NA 9??????? 29??? NA 10?????? 22??? NA 11?????? 42??? 46 dat3N <- fun1(dat2N,3) dat3N ? Position Count 1?????? 61??? 37 2?????? 62??? 62 3?????? 42??? 65 A.K. Hi all, I have a dataset with 2 important columns, "Position" and "Count". There are a total of 34,532 rows, but only 457 non-NA values in the "Count" column (every cell in "Position" column has a value). I need to write a loop to march down the rows, and if there are 2 rows in "Count" where there is only 1 NA row between them, sum the two values up and print only one row with the summed Count value and the Position value that corresponds to the larger Count value, thus making the three rows into one. For example: Position Count 15 15 22 NA 38 NA 49 5 55 NA 61 17 would become Position Count 15 15 22 NA 38 NA 61 22 After this step, I also need to write another script to march down the rows and look for rows with only two NA's between non-NA rows in Count. This would make the previous data become Position Count 61 37 Ideally I would like a loop that can be flexibly adjusted to the number of NA's in between adjacent non-NA values that can be freely changed. I would greatly appreciate any insight for this.