thr3ads.net - R help - [R] Loop for taking sum of rows based on proximity to other non-NA rows [Oct 2013]

If this information is useful, please help other people find it:
Share via:

arun

2013-Oct-18 20:19 UTC

[R] Loop for taking sum of rows based on proximity to other non-NA rows

Hi,

May be this helps:

dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L, 
14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L, 
29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA, 
NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L
)), .Names = c("Position", "Count"), class =
"data.frame", row.names = c(NA,
-23L))


#There might be simple solutions.

fun1 <- function(dat,n) {
?rl <- rle(is.na(dat[,"Count"]))
indx <-
which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
?lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
??? ??? ??? ??? ???????? x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
??? ??? ??? ??? ??? ?x2 <- x1[!is.na(x1$Count),]
??? ??? ??? ??? ??? ?datN <-
data.frame(Position=max(x2$Position),Count=sum(x2$Count))
??? ??? ??? ??? ??? ?rowN <- row.names(x2)[x2$Position %in%
max(x2$Position)]???
??? ??? ??? ??? ??? ?row.names(datN) <- if(length(rowN)>1) rowN[1] else
rowN
??? ??? ??? ??? ??? ?datN
??? ??? ??? ??? ??? })
names(lst1) <- NULL
dat2 <- do.call(rbind,lst1)
indx2 <-?
sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x)
c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))

dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
row.names(dat1New) <- 1:nrow(dat1New)
dat1New
}

dat1N <- fun1(dat1,1)
dat1N
?? Position Count
1??????? 15??? 15
2??????? 22??? NA
3??????? 38??? NA
4??????? 61??? 22
5??????? 62??? 18
6??????? 14??? NA
7??????? 29??? NA
8??????? 63??? NA
9??????? 46??? 28
10?????? 24??? NA
11?????? 22??? NA
12?????? 49??? 16
13?????? 42??? 19
14?????? 38??? NA
15?????? 29??? NA
16?????? 22??? NA
17?????? 42??? 46

dat2N <- fun1(dat1N,2)
dat2N
?? Position Count
1??????? 61??? 37
2??????? 62??? 18
3??????? 14??? NA
4??????? 29??? NA
5??????? 63??? NA
6??????? 49??? 44
7??????? 42??? 19
8??????? 38??? NA
9??????? 29??? NA
10?????? 22??? NA
11?????? 42??? 46
dat3N <- fun1(dat2N,3)
dat3N
? Position Count
1?????? 61??? 37
2?????? 62??? 62
3?????? 42??? 65

A.K.









Hi all, I have a dataset with 2 important columns, "Position" and 
"Count". There are a total of 34,532 rows, but only 457 non-NA values
in the "Count" column (every cell in "Position" column has a
value). I
need to write a loop to march down the rows, and if there are 2 rows in 
"Count" where there is only 1 NA row between them, sum the two values
up and print only one row with the summed Count value and the Position
value that corresponds to the larger Count value, thus making the three 
rows into one. For example: 

Position Count 
15 15 
22 NA 
38 NA 
49 5 
55 NA 
61 17 

would become 

Position Count 
15 15 
22 NA 
38 NA 
61 22 

After this step, I also need to write another script to march 
down the rows and look for rows with only two NA's between non-NA rows 
in Count. This would make the previous data become 

Position Count 
61 37 

Ideally I would like a loop that can be flexibly adjusted to the
 number of NA's in between adjacent non-NA values that can be freely 
changed. I would greatly appreciate any insight for this.

arun

2013-Oct-19 04:31 UTC

head link

[R] Loop for taking sum of rows based on proximity to other non-NA rows

Hi,

Found a bug in the function when tested.? So, try this (added one more line):

#Modified function
fun1 <- function(dat,n) {
?rl <- rle(is.na(dat[,"Count"]))
indx <-
which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
?lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
???????????????????????? x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
???????????????????? x2 <- x1[!is.na(x1$Count),]
???????????????????? datN <-
data.frame(Position=max(x2$Position),Count=sum(x2$Count))
???????????????????? rowN <- row.names(x2)[x2$Position %in%
max(x2$Position)]??
???????????????????? row.names(datN) <- if(length(rowN)>1) rowN[1] else
rowN
???????????????????? datN
??????????????????? })
names(lst1) <- NULL
lst1 <- lst1[!duplicated(sapply(lst1,row.names))] ######added
dat2 <- do.call(rbind,lst1)
indx2 <-?
sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x)
c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))

dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
row.names(dat1New) <- 1:nrow(dat1New)
dat1New
}



#Another function
fun2 <- function(dat,n){
?indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"])))))
?indx1 <- indx[is.na(dat[,"Count"])]
?names(indx1) <- which(is.na(dat[,"Count"]))
indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]]
lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) {
??? ??? ??? ??? ??? ??? ??? x1 <- indx2[i]
??? ??? ??? ??? ??? ??? ??? ?x2 <- as.numeric(names(x1))
??? ??? ??? ??? ??? ??? ??? ?x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),]
??? ??? ??? ??? ??? ??? ??? ?x4 <- subset(x3, !is.na(Count))
??? ??? ??? ??? ??? ??? ??? ?x5 <-
data.frame(Position=max(x4$Position),Count=sum(x4$Count))
??? ??? ??? ??? ??? ??? ??? ind <- x4$Position %in% max(x4$Position)
??? ??? ??? ??? ??? ??? ??? ?row.names(x5) <- if(sum(ind)>1)
row.names(x4)[ind][1] else row.names(x4)[ind]
??? ??? ??? ??? ??? ??? ??? x5
??? ??? ??? ??? ??? ??? })
attr(lst1,"dimnames") <- NULL
?dat2 <- do.call(rbind,lst1)
indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i) {x1
<- indx2[i]
??? ??? ??? ??? ??? ??? ??? ??? ??? ?x2 <- as.numeric(names(x1))
??? ??? ??? ??? ??? ??? ??? ??? ??? ?c(min(x2)-1L, x2,
max(x2)+1L)}),use.names=FALSE))

dat$id <- 1:nrow(dat)
dat2$id <- as.numeric(row.names(dat2))
library(plyr)
res <- join(dat,dat2[,-1],by="id",type="left")
res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),]
res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])]
res2 <- res1[,1:2]
row.names(res2) <- 1:nrow(res2)
res2
}


identical(fun1(dat1,1),fun2(dat1,1))
#[1] TRUE
identical(fun1(fun1(dat1,1),2),fun2(fun2(dat1,1),2))
#[1] TRUE

identical(fun1(fun1(fun1(dat1,1),2),3),fun2(fun2(fun2(dat1,1),2),3))
#[1] TRUE


#Speed
set.seed(185)
datT <- data.frame(Position = sample(10:80,1e5,replace=TRUE),Count=
sample(c(NA, 10:100),1e5, replace=TRUE))
?system.time(res <- fun1(datT,1))
?#? user? system elapsed 
?# 0.676?? 0.000?? 0.676 
?system.time(res2 <- fun2(datT,1))
#?? user? system elapsed 
#? 1.240?? 0.000?? 1.237 
?identical(res,res2)
#[1] TRUE

A.K.









On Friday, October 18, 2013 4:19 PM, arun <smartpink111 at yahoo.com>
wrote:
Hi,

May be this helps:

dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L, 
14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L, 
29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA, 
NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L
)), .Names = c("Position", "Count"), class =
"data.frame", row.names = c(NA,
-23L))


#There might be simple solutions.

fun1 <- function(dat,n) {
?rl <- rle(is.na(dat[,"Count"]))
indx <-
which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
?lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
??? ??? ??? ??? ???????? x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
??? ??? ??? ??? ??? ?x2 <- x1[!is.na(x1$Count),]
??? ??? ??? ??? ??? ?datN <-
data.frame(Position=max(x2$Position),Count=sum(x2$Count))
??? ??? ??? ??? ??? ?rowN <- row.names(x2)[x2$Position %in%
max(x2$Position)]???
??? ??? ??? ??? ??? ?row.names(datN) <- if(length(rowN)>1) rowN[1] else
rowN
??? ??? ??? ??? ??? ?datN
??? ??? ??? ??? ??? })
names(lst1) <- NULL
dat2 <- do.call(rbind,lst1)
indx2 <-?
sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x)
c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))

dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
row.names(dat1New) <- 1:nrow(dat1New)
dat1New
}

dat1N <- fun1(dat1,1)
dat1N
?? Position Count
1??????? 15??? 15
2??????? 22??? NA
3??????? 38??? NA
4??????? 61??? 22
5??????? 62??? 18
6??????? 14??? NA
7??????? 29??? NA
8??????? 63??? NA
9??????? 46??? 28
10?????? 24??? NA
11?????? 22??? NA
12?????? 49??? 16
13?????? 42??? 19
14?????? 38??? NA
15?????? 29??? NA
16?????? 22??? NA
17?????? 42??? 46

dat2N <- fun1(dat1N,2)
dat2N
?? Position Count
1??????? 61??? 37
2??????? 62??? 18
3??????? 14??? NA
4??????? 29??? NA
5??????? 63??? NA
6??????? 49??? 44
7??????? 42??? 19
8??????? 38??? NA
9??????? 29??? NA
10?????? 22??? NA
11?????? 42??? 46
dat3N <- fun1(dat2N,3)
dat3N
? Position Count
1?????? 61??? 37
2?????? 62??? 62
3?????? 42??? 65

A.K.









Hi all, I have a dataset with 2 important columns, "Position" and 
"Count". There are a total of 34,532 rows, but only 457 non-NA values
in the "Count" column (every cell in "Position" column has a
value). I
need to write a loop to march down the rows, and if there are 2 rows in 
"Count" where there is only 1 NA row between them, sum the two values
up and print only one row with the summed Count value and the Position
value that corresponds to the larger Count value, thus making the three 
rows into one. For example: 

Position Count 
15 15 
22 NA 
38 NA 
49 5 
55 NA 
61 17 

would become 

Position Count 
15 15 
22 NA 
38 NA 
61 22 

After this step, I also need to write another script to march 
down the rows and look for rows with only two NA's between non-NA rows 
in Count. This would make the previous data become 

Position Count 
61 37 

Ideally I would like a loop that can be flexibly adjusted to the
number of NA's in between adjacent non-NA values that can be freely 
changed. I would greatly appreciate any insight for this.

R help - Oct 2013 - Loop for taking sum of rows based on proximity to other non-NA rows

[R] Loop for taking sum of rows based on proximity to other non-NA rows

[R] Loop for taking sum of rows based on proximity to other non-NA rows