I have a large data frame with 2,000 rows and 600 columns. I can write
loops to solve a smaller problem, but I need a better strategy for this
data frame.
Below is a simple example with just two stocks.
In the data frame, each row represents a trading day. The first column
is dates. The next group of columns represents the prices of the stocks
on the specified dates. The next group of columns represents how many
trading days I wish to offset. So if the first trading day is 2006-01-03
and OF1 == 3, then I need to go to row 1+3 and get the price in column
P1. The result is placed in row 1 of column 6.
df1 = data.frame(matrix(rep(NA, 10*7), nrow = 10))
Dates =as.Date(c("2006-01-03", "2006-01-04",
"2006-01-05", "2006-01-06",
"2006-01-09", "2006-01-10", "2006-01-11",
"2006-01-12", "2006-01-13", "2006-01-16"), format
= "%Y-%m-%d")
P1 = 10:19
P2 = 100:109
OF1 = c(3,3,4,5,2,2,2,1,1,0)
OF2 = c(5,3,4,2,1,2,2,1,1,0)
df1 = data.frame(Dates = Dates, P1 = P1, P2 = P2, OF1 = OF1, OF2 = OF2)
df1$newPrice1 = rep(NA, 10)
df1$newPrice2 = rep(NA, 10)
for(j in 6:7) {
for(i in 1:10 ) {
rowNumber = i + df1[i,j-2]
#print(rowNumber)
df1[i,j] = df1[rowNumber, j-4]
} # end i loop
} # end j loop
df1
> df1
Dates P1P2 OF1 OF2 newPrice1 newPrice2
1 2006-01-03 10 1003513105
22006-01-04 11 1013314104
32006-01-05 12 1024416106
42006-01-06 13 1035218105
52006-01-09 14 1042116105
62006-01-10 15 1052217107
72006-01-11 16 1062218108
82006-01-12 17 1071118108
92006-01-13 18 1081119109
10 2006-01-16 19 1090019109
[[alternative HTML version deleted]]
William Dunlap
2013-Sep-21 18:26 UTC
[R] Obtaining data from a different row of data frame
Note that your inner loop> for(i in 1:10 ) { > rowNumber = i + df1[i,j-2] > df1[i,j] = df1[rowNumber, j-4] > } # end i loopis equivalent to the much quicker i <- 1:10 rowNumbers <- i + df1[i, j-2] df1[i, j] <- df1[rowNumbers, j-4] Bill Dunlap Spotfire, TIBCO Software wdunlap tibco.com> -----Original Message----- > From: r-help-bounces at r-project.org [mailto:r-help-bounces at r-project.org] On Behalf > Of Ira Sharenow > Sent: Saturday, September 21, 2013 8:31 AM > To: r-help at r-project.org > Subject: [R] Obtaining data from a different row of data frame > > I have a large data frame with 2,000 rows and 600 columns. I can write > loops to solve a smaller problem, but I need a better strategy for this > data frame. > > Below is a simple example with just two stocks. > > In the data frame, each row represents a trading day. The first column > is dates. The next group of columns represents the prices of the stocks > on the specified dates. The next group of columns represents how many > trading days I wish to offset. So if the first trading day is 2006-01-03 > and OF1 == 3, then I need to go to row 1+3 and get the price in column > P1. The result is placed in row 1 of column 6. > > > df1 = data.frame(matrix(rep(NA, 10*7), nrow = 10)) > > Dates =as.Date(c("2006-01-03", "2006-01-04", "2006-01-05", "2006-01-06", > "2006-01-09", "2006-01-10", "2006-01-11", > > "2006-01-12", "2006-01-13", "2006-01-16"), format = "%Y-%m-%d") > > P1 = 10:19 > > P2 = 100:109 > > OF1 = c(3,3,4,5,2,2,2,1,1,0) > > OF2 = c(5,3,4,2,1,2,2,1,1,0) > > df1 = data.frame(Dates = Dates, P1 = P1, P2 = P2, OF1 = OF1, OF2 = OF2) > > df1$newPrice1 = rep(NA, 10) > > df1$newPrice2 = rep(NA, 10) > > for(j in 6:7) { > > for(i in 1:10 ) { > > rowNumber = i + df1[i,j-2] > > #print(rowNumber) > > df1[i,j] = df1[rowNumber, j-4] > > } # end i loop > > } # end j loop > > df1 > > > df1 > > Dates P1P2 OF1 OF2 newPrice1 newPrice2 > > 1 2006-01-03 10 1003513105 > > 22006-01-04 11 1013314104 > > 32006-01-05 12 1024416106 > > 42006-01-06 13 1035218105 > > 52006-01-09 14 1042116105 > > 62006-01-10 15 1052217107 > > 72006-01-11 16 1062218108 > > 82006-01-12 17 1071118108 > > 92006-01-13 18 1081119109 > > 10 2006-01-16 19 1090019109 > > > > [[alternative HTML version deleted]] > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.
Hi Ira,
Some suggestions:
1. Not sure why you created a dataframe with NA's in the beginning.? It
seemed to be not required here.
df1 = data.frame(matrix(rep(NA, 10*7), nrow = 10))
2. The below code:
df1 = data.frame(Dates = Dates, P1 = P1, P2 = P2, OF1 = OF1, OF2 = OF2)
can be simplifed as:
df1 = data.frame(Dates,P1,P2, P3, P4, OF1, OF2,OF3, OF4)
?head(df1,2)
#?????? Dates P1? P2 P3 P4 OF1 OF2 OF3 OF4
#1 2006-01-03 10 100 90 70?? 3?? 5?? 4?? 3
#2 2006-01-04 13 102 94 75?? 3?? 3?? 3?? 5
3.? If you have multiple columns of NA's to create
df1$newPrice1 = rep(NA, 10)
?df1$newPrice2 = rep(NA, 10)
This could be also done by:
df1[,6:7]<- NA
?colnames(df1)[6:7]<- paste0("newPrice",1:2)
#Regarding the question:
#Created another dataset with couple more columns:
?Dates =? as.Date(c("2006-01-03", "2006-01-04",
"2006-01-05", "2006-01-06", "2006-01-09",
"2006-01-10", "2006-01-11",
??????????????????? "2006-01-12", "2006-01-13",
"2006-01-16"), format = "%Y-%m-%d")
?P1 = seq(from = 10, by = 3, length.out = 10)
?P2 = seq(from = 100, by = 2, length.out = 10)
P3= seq(from= 90, by=4,length.out=10)
?P4= seq(from=70,by=5,length.out=10)
?OF1 = c(3,3,4,5,2,2,2,1,1,0)
?OF2 = c(5,3,4,2,1,2,2,1,1,0)
?OF3 <- c(4,3,4,1,3,2,2,1,1,0)
?OF4<- c(3,5,4,2,3,1,2,1,1,0)
df1 = data.frame(Dates,P1,P2, P3, P4, OF1, OF2,OF3, OF4)
df2<- df1
?df2[,10:13]<- NA
colnames(df2)[10:13]<- paste0("newPrice",1:4)
##your code
for(j in 2:5) {
?df2[j+8] = df2[df2[,j+4] + row(df2)[,j], j]
?}
#modified code #didn't check the speed.
indx1<- unlist(df1[,grep("OF",colnames(df1))],use.names=FALSE)
val1<- unlist(df1[,grep("P",colnames(df1))],use.names=FALSE)
?df1[,10:13]<- val1[indx1+seq_along(indx1)]
?colnames(df1)[10:13]<- colnames(df2)[10:13]
?identical(df1,df2)
#[1] TRUE
The "average" part is not clear.
For ex:
df2$OF1
# [1] 3 3 4 5 2 2 2 1 1 0
df2$P1
# [1] 10 13 16 19 22 25 28 31 34 37
Could you explain it in terms of the above values?
A.K.
________________________________
From: Ira Sharenow <irasharenow100 at yahoo.com>
To: arun <smartpink111 at yahoo.com>
Sent: Saturday, September 21, 2013 5:09 PM
Subject: Re: [R] Obtaining data from a different row of data frame
Arun,
Thanks for helping me improve my question.
I made a slight change to the data, so that the coincidence will not occur.
Please note that eventually I will have to write a function for a user. Also
after my rewrite, I have better code, but I do not know how to get rid of the
loop. Also eventually I will need to generalize and take the average of r rows
above and below the row that this algorithm is attempting to retrieve. For
example if r = 5 and the row 1 offset gets me to row 100, I will eventually need
the price in rows 95 through 105. Yes, I realize that I need to deal with a
number of annoying details such as possibly winding up with row -3.
> df1 = data.frame(matrix(rep(NA, 10*7), nrow = 10))
> Dates =? as.Date(c("2006-01-03", "2006-01-04",
"2006-01-05", "2006-01-06", "2006-01-09",
"2006-01-10", "2006-01-11",
+??????????????????? "2006-01-12", "2006-01-13",
"2006-01-16"), format = "%Y-%m-%d")> P1 = seq(from = 10, by = 3, length.out = 10)
> P2 = seq(from = 100, by = 2, length.out = 10)
> OF1 = c(3,3,4,5,2,2,2,1,1,0)
> OF2 = c(5,3,4,2,1,2,2,1,1,0)
> df1 = data.frame(Dates = Dates, P1 = P1, P2 = P2, OF1 = OF1, OF2 = OF2)
> df1$newPrice1 = rep(NA, 10)
> df1$newPrice2 = rep(NA, 10)
>
> df1
??????? Dates P1? P2 OF1 OF2 newPrice1 newPrice2
1? 2006-01-03 10 100?? 3?? 5??????? NA??????? NA
2? 2006-01-04 13 102?? 3?? 3??????? NA??????? NA
3? 2006-01-05 16 104?? 4?? 4??????? NA??????? NA
4? 2006-01-06 19 106?? 5?? 2??????? NA??????? NA
5? 2006-01-09 22 108?? 2?? 1??????? NA???? ???NA
6? 2006-01-10 25 110?? 2?? 2??????? NA??????? NA
7? 2006-01-11 28 112?? 2?? 2??????? NA??????? NA
8? 2006-01-12 31 114?? 1?? 1??????? NA??????? NA
9? 2006-01-13 34 116?? 1?? 1??????? NA??????? NA
10 2006-01-16 37 118?? 0?? 0??????? NA??????? NA>
> for(j in 2:3) {
+ df1[j+4] = df1[df1[,j+2] + row(df1)[,j], j]
+ }> df1
??????? Dates P1? P2 OF1 OF2 newPrice1 newPrice2
1? 2006-01-03 10 100?? 3?? 5??????? 19?????? 110
2? 2006-01-04 13 102?? 3?? 3??????? 22?????? 108
3? 2006-01-05 16 104?? 4?? 4?????? ?28?????? 112
4? 2006-01-06 19 106?? 5?? 2??????? 34?????? 110
5? 2006-01-09 22 108?? 2?? 1??????? 28?????? 110
6? 2006-01-10 25 110?? 2?? 2??????? 31?????? 114
7? 2006-01-11 28 112?? 2?? 2??????? 34?????? 116
8? 2006-01-12 31 114?? 1?? 1??????? 34?????? 116
9? 2006-01-13 34 116?? 1?? 1??????? 37?????? 118
10 2006-01-16 37 118?? 0?? 0??????? 37?????? 118> ?
> # Better code. Produces exactly the same results as above.
> for(j in 2:3) {
+ df1[j+4] = df1[df1[,j+2] + row(df1)[,j], j]
+ }> df1
On 9/21/2013 1:39 PM, arun wrote:
Hi,
Your example dataset could be confusing as: with(df1,P1+OF1)
# [1] 13 14 16 18 16 17 18 18 19 19
?with(df1,P2+OF2)
# [1] 105 104 106 105 105 107 108 108 109 109 which is the same as:
?df1$newPrice1
# [1] 13 14 16 18 16 17 18 18 19 19
?df1$newPrice2
# [1] 105 104 106 105 105 107 108 108 109 109 ----- Original Message -----
From: Ira Sharenow <irasharenow100 at yahoo.com> To: r-help at
r-project.org Cc:
Sent: Saturday, September 21, 2013 11:30 AM
Subject: [R] Obtaining data from a different row of data frame I have a large
data frame with 2,000 rows and 600 columns. I can write
loops to solve a smaller problem, but I need a better strategy for this
data frame. Below is a simple example with just two stocks. In the data frame,
each row represents a trading day. The first column
is dates. The next group of columns represents the prices of the stocks
on the specified dates. The next group of columns represents how many
trading days I wish to offset. So if the first trading day is 2006-01-03
and OF1 == 3, then I need to go to row 1+3 and get the price in column
P1. The result is placed in row 1 of column 6. df1 = data.frame(matrix(rep(NA,
10*7), nrow = 10)) Dates =as.Date(c("2006-01-03",
"2006-01-04", "2006-01-05", "2006-01-06",
"2006-01-09", "2006-01-10", "2006-01-11",
"2006-01-12", "2006-01-13", "2006-01-16"), format
= "%Y-%m-%d") P1 = 10:19 P2 = 100:109 OF1 = c(3,3,4,5,2,2,2,1,1,0) OF2
= c(5,3,4,2,1,2,2,1,1,0) df1 = data.frame(Dates = Dates, P1 = P1, P2 = P2, OF1 =
OF1, OF2 = OF2) df1$newPrice1 = rep(NA, 10) df1$newPrice2 = rep(NA, 10) for(j in
6:7) { for(i in 1:10 ) { rowNumber = i + df1[i,j-2] #print(rowNumber) df1[i,j] =
df1[rowNumber, j-4] } # end i loop } # end j loop df1 >df1
>Dates P1P2 OF1 OF2 newPrice1 newPrice2 1 2006-01-03 10 1003513105
22006-01-04 11 1013314104 32006-01-05 12 1024416106 42006-01-06 13 1035218105
52006-01-09 14 1042116105 62006-01-10 15 1052217107 72006-01-11 16 1062218108
82006-01-12 17 1071118108 92006-01-13 18 1081119109 10 2006-01-16 19 1090019109
??? [[alternative HTML version deleted]]
______________________________________________ R-help at r-project.org mailing
list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting
guide http://www.R-project.org/posting-guide.html and provide commented,
minimal, self-contained, reproducible code. ???????
Hi Ira,
#Speed comparison
set.seed(29)
?df2<-
data.frame(Dates=seq(as.Date("2006-01-03"),length.out=2000,by="1
day"),cbind(matrix(sample(10:120,2000*300,replace=TRUE),ncol=300),matrix(sample(0:6,2000*300,replace=TRUE),ncol=300)))
?colnames(df2)[2:301]<- paste0("P",1:300)
?colnames(df2)[302:601]<- paste0("OF",1:300)
df2[2000,grep("OF",colnames(df2))]<-0
?df2[1995:1999,grep("OF",colnames(df2))]<-1
df3<- df2
df2[,602:901]<-NA
colnames(df2)[602:901]<- paste0("newPrice",1:300)
dim(df2)
#[1] 2000? 901
system.time({
for(j in grep("^P",colnames(df2))) {
?df2[j+600] = df2[df2[,j+300] + row(df2)[,j], j]
?}
})
#? user? system elapsed
# 12.292?? 0.256? 12.576
system.time({
indx1<- unlist(df3[,grep("OF",colnames(df3))],use.names=FALSE)
val1<- unlist(df3[,grep("P",colnames(df3))],use.names=FALSE)
?df3[,602:901]<- val1[indx1+seq_along(indx1)]
?colnames(df3)[602:901]<- colnames(df2)[602:901]
})
#? user? system elapsed
#? 0.600?? 0.016?? 0.616
identical(df2,df3)
#[1] TRUE
A.K.
________________________________
From: Ira Sharenow <irasharenow100 at yahoo.com>
To: arun <smartpink111 at yahoo.com>
Sent: Saturday, September 21, 2013 5:09 PM
Subject: Re: [R] Obtaining data from a different row of data frame
Arun,
Thanks for helping me improve my question.
I made a slight change to the data, so that the coincidence will not occur.
Please note that eventually I will have to write a function for a user. Also
after my rewrite, I have better code, but I do not know how to get rid of the
loop. Also eventually I will need to generalize and take the average of r rows
above and below the row that this algorithm is attempting to retrieve. For
example if r = 5 and the row 1 offset gets me to row 100, I will eventually need
the price in rows 95 through 105. Yes, I realize that I need to deal with a
number of annoying details such as possibly winding up with row -3.
> df1 = data.frame(matrix(rep(NA, 10*7), nrow = 10))
> Dates =? as.Date(c("2006-01-03", "2006-01-04",
"2006-01-05", "2006-01-06", "2006-01-09",
"2006-01-10", "2006-01-11",
+??????????????????? "2006-01-12", "2006-01-13",
"2006-01-16"), format = "%Y-%m-%d")> P1 = seq(from = 10, by = 3, length.out = 10)
> P2 = seq(from = 100, by = 2, length.out = 10)
> OF1 = c(3,3,4,5,2,2,2,1,1,0)
> OF2 = c(5,3,4,2,1,2,2,1,1,0)
> df1 = data.frame(Dates = Dates, P1 = P1, P2 = P2, OF1 = OF1, OF2 = OF2)
> df1$newPrice1 = rep(NA, 10)
> df1$newPrice2 = rep(NA, 10)
>
> df1
??????? Dates P1? P2 OF1 OF2 newPrice1 newPrice2
1? 2006-01-03 10 100?? 3?? 5??????? NA??????? NA
2? 2006-01-04 13 102?? 3?? 3??????? NA??????? NA
3? 2006-01-05 16 104?? 4?? 4??????? NA??????? NA
4? 2006-01-06 19 106?? 5?? 2??????? NA??????? NA
5? 2006-01-09 22 108?? 2?? 1??????? NA???? ???NA
6? 2006-01-10 25 110?? 2?? 2??????? NA??????? NA
7? 2006-01-11 28 112?? 2?? 2??????? NA??????? NA
8? 2006-01-12 31 114?? 1?? 1??????? NA??????? NA
9? 2006-01-13 34 116?? 1?? 1??????? NA??????? NA
10 2006-01-16 37 118?? 0?? 0??????? NA??????? NA>
> for(j in 2:3) {
+ df1[j+4] = df1[df1[,j+2] + row(df1)[,j], j]
+ }> df1
??????? Dates P1? P2 OF1 OF2 newPrice1 newPrice2
1? 2006-01-03 10 100?? 3?? 5??????? 19?????? 110
2? 2006-01-04 13 102?? 3?? 3??????? 22?????? 108
3? 2006-01-05 16 104?? 4?? 4?????? ?28?????? 112
4? 2006-01-06 19 106?? 5?? 2??????? 34?????? 110
5? 2006-01-09 22 108?? 2?? 1??????? 28?????? 110
6? 2006-01-10 25 110?? 2?? 2??????? 31?????? 114
7? 2006-01-11 28 112?? 2?? 2??????? 34?????? 116
8? 2006-01-12 31 114?? 1?? 1??????? 34?????? 116
9? 2006-01-13 34 116?? 1?? 1??????? 37?????? 118
10 2006-01-16 37 118?? 0?? 0??????? 37?????? 118> ?
> # Better code. Produces exactly the same results as above.
> for(j in 2:3) {
+ df1[j+4] = df1[df1[,j+2] + row(df1)[,j], j]
+ }> df1
On 9/21/2013 1:39 PM, arun wrote:
Hi,
Your example dataset could be confusing as: with(df1,P1+OF1)
# [1] 13 14 16 18 16 17 18 18 19 19
?with(df1,P2+OF2)
# [1] 105 104 106 105 105 107 108 108 109 109 which is the same as:
?df1$newPrice1
# [1] 13 14 16 18 16 17 18 18 19 19
?df1$newPrice2
# [1] 105 104 106 105 105 107 108 108 109 109 ----- Original Message -----
From: Ira Sharenow <irasharenow100 at yahoo.com> To: r-help at
r-project.org Cc:
Sent: Saturday, September 21, 2013 11:30 AM
Subject: [R] Obtaining data from a different row of data frame I have a large
data frame with 2,000 rows and 600 columns. I can write
loops to solve a smaller problem, but I need a better strategy for this
data frame. Below is a simple example with just two stocks. In the data frame,
each row represents a trading day. The first column
is dates. The next group of columns represents the prices of the stocks
on the specified dates. The next group of columns represents how many
trading days I wish to offset. So if the first trading day is 2006-01-03
and OF1 == 3, then I need to go to row 1+3 and get the price in column
P1. The result is placed in row 1 of column 6. df1 = data.frame(matrix(rep(NA,
10*7), nrow = 10)) Dates =as.Date(c("2006-01-03",
"2006-01-04", "2006-01-05", "2006-01-06",
"2006-01-09", "2006-01-10", "2006-01-11",
"2006-01-12", "2006-01-13", "2006-01-16"), format
= "%Y-%m-%d") P1 = 10:19 P2 = 100:109 OF1 = c(3,3,4,5,2,2,2,1,1,0) OF2
= c(5,3,4,2,1,2,2,1,1,0) df1 = data.frame(Dates = Dates, P1 = P1, P2 = P2, OF1 =
OF1, OF2 = OF2) df1$newPrice1 = rep(NA, 10) df1$newPrice2 = rep(NA, 10) for(j in
6:7) { for(i in 1:10 ) { rowNumber = i + df1[i,j-2] #print(rowNumber) df1[i,j] =
df1[rowNumber, j-4] } # end i loop } # end j loop df1 >df1
>Dates P1P2 OF1 OF2 newPrice1 newPrice2 1 2006-01-03 10 1003513105
22006-01-04 11 1013314104 32006-01-05 12 1024416106 42006-01-06 13 1035218105
52006-01-09 14 1042116105 62006-01-10 15 1052217107 72006-01-11 16 1062218108
82006-01-12 17 1071118108 92006-01-13 18 1081119109 10 2006-01-16 19 1090019109
??? [[alternative HTML version deleted]]
______________________________________________ R-help at r-project.org mailing
list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting
guide http://www.R-project.org/posting-guide.html and provide commented,
minimal, self-contained, reproducible code. ???????
Hi,
May be you can try this:
Change the
df1<- structure(list(Dates = structure(c(13151, 13152, 13153, 13154,
?13157, 13158, 13159, 13160, 13161, 13164), class = "Date"), P1 =
c(10,
?13, 16, 19, 22, 25, 28, 31, 34, 37), P2 = c(100, 102, 104, 106,
?108, 110, 112, 114, 116, 118), P3 = c(90, 94, 98, 102, 106, 110,
?114, 118, 122, 126), P4 = c(70, 75, 80, 85, 90, 95, 100, 105,
?110, 115), OF1 = c(3, 3, 4, 5, 2, 2, 2, 1, 1, 5), OF2 = c(5,
?3, 4, 2, 1, 2, 2, 1, 1, 0), OF3 = c(4, 3, 4, 1, 3, 2, 2, 1, 1,
?0), OF4 = c(3, 5, 4, 2, 3, 1, 2, 1, 1, 0)), .Names = c("Dates",
?"P1", "P2", "P3", "P4",
"OF1", "OF2", "OF3", "OF4"), row.names =
c(NA,
?-10L), class = "data.frame")
df1$OF2[9]<-4
df2<- df1
?df2[,10:13]<- NA
colnames(df2)[10:13]<- paste0("newPrice",1:4)
##your code
for(j in 2:5) {
?df2[j+8] = df2[df2[,j+4] + row(df2)[,j], j]
?}
?vec1<- 5:1 ##change values according to the range of actual values in your
rows
?vec2<- 6:10 ##change accordingly
df1[vec2,grep("OF",colnames(df1))]<-
t(sapply(seq_along(vec1),function(i)
{x1<-as.matrix(df1[vec2[i],grep("OF",colnames(df1))]);
x1[x1>vec1[i]]<-NA; x1}))
indx1<- unlist(df1[,grep("OF",colnames(df1))],use.names=FALSE)
val1<- unlist(df1[,grep("P",colnames(df1))],use.names=FALSE)
?df1[,10:13]<- val1[indx1+seq_along(indx1)]
?colnames(df1)[10:13]<- colnames(df2)[10:13]
identical(df1[,10:13],df2[,10:13])
#[1] TRUE
A.K.
________________________________
From: Ira Sharenow <irasharenow100 at yahoo.com>
To: arun <smartpink111 at yahoo.com>
Sent: Saturday, September 21, 2013 11:36 PM
Subject: Re: [R] Obtaining data from a different row of data frame
Arun,
Yes, I definitely want NA, and I certainly do not want to pick off values from a
different stock.
I set up the example the way I did because I was more concerned about finding
the values than dealing with these bad cases. I figured I could work them out
later and with my strategy I get the NA values.
I had not considered your approach.
Ira
On 9/21/2013 8:30 PM, arun wrote:
Hi Ira, This info was not provided before.? In fact, I wanted to ask about this
in cases where the last row entries are not 0.? So, you wanted the newPrices to
be NA's for the corresponding rows, right?? ________________________________
From: Ira Sharenow <irasharenow100 at yahoo.com> To: arun <smartpink111
at yahoo.com> Sent: Saturday, September 21, 2013 11:24 PM
Subject: Re: [R] Obtaining data from a different row of data frame Arun, Thanks.
I appreciate the details. I am learning a lot. But there may be a bug in your
code. This information is gathered on an ongoing basis. So on September 1, 2013
a prediction for A1 stock may have been made and the prediction date is December
15, 2013, which has not yet occurred. My method produces NA. For example go to?
OF4 and change the last entry from 0 to 6. Ira ># change OF1[10] = 5 <------
df1<- structure(list(Dates = structure(c(13151, 13152,
>????13153, 13154,
+ 13157, 13158, 13159, 13160, 13161, 13164), class = "Date"), P1
????c(10,
+ 13, 16, 19, 22, 25, 28, 31, 34, 37), P2 = c(100, 102, 104, 106,
+ 108, 110, 112, 114, 116, 118), P3 = c(90, 94, 98, 102, 106, 110,
+ 114, 118, 122, 126), P4 = c(70, 75, 80, 85, 90, 95, 100, 105,
+ 110, 115), OF1 = c(3, 3, 4, 5, 2, 2, 2, 1, 1, 5), OF2 = c(5,
+ 3, 4, 2, 1, 2, 2, 1, 1, 0), OF3 = c(4, 3, 4, 1, 3, 2, 2, 1, 1,
+ 0), OF4 = c(3, 5, 4, 2, 3, 1, 2, 1, 1, 0)), .Names = c("Dates",
+ "P1", "P2", "P3", "P4",
"OF1", "OF2", "OF3", "OF4"), row.names
????c(NA,
+ -10L), class = "data.frame") >#Splitting the code grep("OF",colnames(df1)) #gives the column
numbers? having "OF"
>????as column names
[1] 6 7 8 9 >[1] 6 7 8 9
>Error: unexpected '[' in "["
>#Subset those columns ? df1[,grep("OF",colnames(df1))]
>?? OF1 OF2 OF3 OF4
1??? 3?? 5?? 4?? 3
2??? 3?? 3?? 3?? 5
3??? 4?? 4?? 4?? 4
4??? 5?? 2?? 1?? 2
5??? 2?? 1?? 3?? 3
6??? 2?? 2?? 2?? 1
7??? 2?? 2?? 2?? 2
8??? 1?? 1?? 1?? 1
9??? 1?? 1?? 1?? 1
10??5?? 0?? 0?? 0 >#unlist those columns to create a vector
? indx1<- >????unlist(df1[,grep("OF",colnames(df1))],use.names=FALSE)
>indx1
>?[1] 3 3 4 5 2 2 2 1 1 5 5 3 4 2 1 2 2 1 1 0 4 3 4 1 3 2 2 1 1 0 3 5
????4 2 3 1 2 1 1 0 >#Same steps done with columns having names "P"
val1<- unlist(df1[,grep("P",colnames(df1))],use.names=FALSE)
val1 >?[1]? 10? 13? 16? 19? 22? 25? 28? 31? 34? 37 100 102 104 106 108 110
????112 114 116 118? 90? 94? 98 102 106
[26] 110 114 118 122 126? 70? 75? 80? 85? 90? 95 100 105 110 115
>? val1[indx1+seq_along(indx1)]
>?[1]? 19? 22? 28? 34? 28? 31? 34? 34? 37 108 110 108 112 110 110 114
????116 116 118 118 106 106 114 106 118
[26] 118 122 122 126 126? 85 100 100? 95 105 100 110 110 115 115
>df1[,10:13]<- val1[indx1+seq_along(indx1)]
? df1 >??????? Dates P1? P2? P3? P4 OF1 OF2 OF3 OF4 V10 V11 V12 V13
1? 2006-01-03 10 100? 90? 70?? 3?? 5?? 4?? 3? 19 110 106? 85
2? 2006-01-04 13 102? 94? 75?? 3?? 3?? 3?? 5? 22 108 106 100
3? 2006-01-05 16 104? 98? 80?? 4?? 4?? 4?? 4? 28 112 114 100
4? 2006-01-06 19 106 102? 85?? 5?? 2?? 1?? 2? 34 110 106? 95
5? 2006-01-09 22 108 106? 90?? 2?? 1?? 3?? 3? 28 110 118 105
6? 2006-01-10 25 110 110? 95?? 2?? 2?? 2?? 1? 31 114 118 100
7? 2006-01-11 28 112 114 100?? 2?? 2?? 2?? 2? 34 116 122 110
8? 2006-01-12 31 114 118 105?? 1?? 1?? 1?? 1? 34 116 122 110
9? 2006-01-13 34 116 122 110?? 1?? 1?? 1?? 1? 37 118 126 115
10 2006-01-16 37 118 126 115?? 5?? 0?? 0?? 0 108 118 126 115 On 9/21/2013 7:47
PM, arun wrote: Hi Ira, The code I used was based on how you calculated the
values from your previous explanation. Let us, take ##dput??df1<-
structure(list(Dates = structure(c(13151, 13152, 13153, 13154,
13157, 13158, 13159, 13160, 13161, 13164), class = "Date"), P1 = c(10,
13, 16, 19, 22, 25, 28, 31, 34, 37), P2 = c(100, 102, 104, 106,
108, 110, 112, 114, 116, 118), P3 = c(90, 94, 98, 102, 106, 110,
114, 118, 122, 126), P4 = c(70, 75, 80, 85, 90, 95, 100, 105,
110, 115), OF1 = c(3, 3, 4, 5, 2, 2, 2, 1, 1, 0), OF2 = c(5,
3, 4, 2, 1, 2, 2, 1, 1, 0), OF3 = c(4, 3, 4, 1, 3, 2, 2, 1, 1,
0), OF4 = c(3, 5, 4, 2, 3, 1, 2, 1, 1, 0)), .Names = c("Dates",
"P1", "P2", "P3", "P4", "OF1",
"OF2", "OF3", "OF4"), row.names = c(NA,
-10L), class = "data.frame") #Splitting the code
grep("OF",colnames(df1)) #gives the column numbers? having
"OF" as column names
[1] 6 7 8 9 #Subset those columns ?df1[,grep("OF",colnames(df1))]
?? OF1 OF2 OF3 OF4
1??? 3?? 5?? 4?? 3
2??? 3?? 3?? 3?? 5
3??? 4?? 4?? 4?? 4
4??? 5?? 2?? 1?? 2
5??? 2?? 1?? 3?? 3
6??? 2?? 2?? 2?? 1
7??? 2?? 2?? 2?? 2
8??? 1?? 1?? 1?? 1
9??? 1?? 1?? 1?? 1
10?? 0?? 0?? 0?? 0 #unlist those columns to create a vector
?indx1<- unlist(df1[,grep("OF",colnames(df1))],use.names=FALSE)
indx1
# [1] 3 3 4 5 2 2 2 1 1 0 5 3 4 2 1 2 2 1 1 0 4 3 4 1 3 2 2 1 1 0 3 5 4 2 3 1 2
1
#[39] 1 0 #Same steps done with columns having names "P"
val1<- unlist(df1[,grep("P",colnames(df1))],use.names=FALSE)
val1
# [1]? 10? 13? 16? 19? 22? 25? 28? 31? 34? 37 100 102 104 106 108 110 112 114
116
#[20] 118? 90? 94? 98 102 106 110 114 118 122 126? 70? 75? 80? 85? 90? 95 100
105
#[39] 110 115
indx1+seq_along(indx1)? #get the index
# [1]? 4? 5? 7? 9? 7? 8? 9? 9 10 10 16 15 17 16 16 18 19 19 20 20 25 25 27 25 28
#[26] 28 29 29 30 30 34 37 37 36 38 37 39 39 40 40
?val1[indx1+seq_along(indx1)]
# [1]? 19? 22? 28? 34? 28? 31? 34? 34? 37? 37 110 108 112 110 110 114 116 116
118
#[20] 118 106 106 114 106 118 118 122 122 126 126? 85 100 100? 95 105 100 110
110
#[39] 115 115 df1[,10:13]<- val1[indx1+seq_along(indx1)]
?df1
??????? Dates P1? P2? P3? P4 OF1 OF2 OF3 OF4 V10 V11 V12 V13
1? 2006-01-03 10 100? 90? 70?? 3?? 5?? 4?? 3? 19 110 106? 85
2? 2006-01-04 13 102? 94? 75?? 3?? 3?? 3?? 5? 22 108 106 100
3? 2006-01-05 16 104? 98? 80?? 4?? 4?? 4?? 4? 28 112 114 100
4? 2006-01-06 19 106 102? 85?? 5?? 2?? 1?? 2? 34 110 106? 95
5? 2006-01-09 22 108 106? 90?? 2?? 1?? 3?? 3? 28 110 118 105
6? 2006-01-10 25 110 110? 95?? 2?? 2?? 2?? 1? 31 114 118 100
7? 2006-01-11 28 112 114 100?? 2?? 2?? 2?? 2? 34 116 122 110
8? 2006-01-12 31 114 118 105?? 1?? 1?? 1?? 1? 34 116 122 110
9? 2006-01-13 34 116 122 110?? 1?? 1?? 1?? 1? 37 118 126 115
10 2006-01-16 37 118 126 115?? 0?? 0?? 0?? 0? 37 118 126 115 Hope it helps.
A.K.????? ????
HI,
A modified code to avoid the ?sapply()
df1<- structure(list(Dates = structure(c(13151, 13152, 13153, 13154,
?13157, 13158, 13159, 13160, 13161, 13164), class = "Date"), P1 =
c(10,
?13, 16, 19, 22, 25, 28, 31, 34, 37), P2 = c(100, 102, 104, 106,
?108, 110, 112, 114, 116, 118), P3 = c(90, 94, 98, 102, 106, 110,
?114, 118, 122, 126), P4 = c(70, 75, 80, 85, 90, 95, 100, 105,
?110, 115), OF1 = c(3, 3, 4, 5, 2, 2, 2, 1, 1, 5), OF2 = c(5,
?3, 4, 2, 1, 2, 2, 1, 1, 0), OF3 = c(4, 3, 4, 1, 3, 2, 2, 1, 1,
?0), OF4 = c(3, 5, 4, 2, 3, 1, 2, 1, 1, 0)), .Names = c("Dates",
?"P1", "P2", "P3", "P4",
"OF1", "OF2", "OF3", "OF4"), row.names =
c(NA,
?-10L), class = "data.frame")
df1$OF2[9]<-4
df2<- df1
?df2[,10:13]<- NA
colnames(df2)[10:13]<- paste0("newPrice",1:4)
##your code
for(j in 2:5) {
?df2[j+8] = df2[df2[,j+4] + row(df2)[,j], j]
?}
indx1<- unlist(df1[,grep("OF",colnames(df1))],use.names=FALSE)
?indx1[rep(seq(nrow(df1)),4)%in% 6:10][indx1[rep(seq(nrow(df1)),4)%in% 6:10]-
rep(5:1,4)>=0]<- NA
val1<- unlist(df1[,grep("P",colnames(df1))],use.names=FALSE)
?df1[,10:13]<- val1[indx1+seq_along(indx1)]
?colnames(df1)[10:13]<- colnames(df2)[10:13]
identical(df1[,10:13],df2[,10:13])
#[1] TRUE
###On a bigger dataset:
set.seed(29)
?df2<-
data.frame(Dates=seq(as.Date("2006-01-03"),length.out=2000,by="1
day"),cbind(matrix(sample(10:120,2000*300,replace=TRUE),ncol=300),matrix(sample(0:6,2000*300,replace=TRUE),ncol=300)))
?colnames(df2)[2:301]<- paste0("P",1:300)
?colnames(df2)[302:601]<- paste0("OF",1:300)
?df3<- df2
df2[,602:901]<-NA
?colnames(df2)[602:901]<- paste0("newPrice",1:300)
?system.time({
?for(j in grep("^P",colnames(df2))) {
? df2[j+600] = df2[df2[,j+300] + row(df2)[,j], j]
? }
?})
#?? user? system elapsed
?#? 8.508?? 0.000?? 8.523
colN_OF<- ncol(df3[,grep("OF",colnames(df3))])
system.time({
?indx1<- unlist(df3[,grep("OF",colnames(df3))],use.names=FALSE)
?indx1[rep(seq(nrow(df3)),colN_OF) %in%
1995:2000][indx1[rep(seq(nrow(df3)),colN_OF) %in% 1995:2000] -
rep(6:1,colN_OF)>=0] <-NA
? val1<- unlist(df3[,grep("P",colnames(df3))],use.names=FALSE)
? df3[,602:901]<- val1[indx1+seq_along(indx1)]
? colnames(df3)[602:901]<- colnames(df2)[602:901]
?})
#? user? system elapsed
#? 0.568?? 0.000?? 0.569
?identical(df2,df3)
#[1] TRUE
A.K.
----- Original Message -----
From: arun <smartpink111 at yahoo.com>
To: Ira Sharenow <irasharenow100 at yahoo.com>
Cc:
Sent: Sunday, September 22, 2013 1:28 AM
Subject: Re: [R] Obtaining data from a different row of data frame
Ira,
I tried with a bigger dataset to look for any errors in the code:
set.seed(29)
?df2<-
data.frame(Dates=seq(as.Date("2006-01-03"),length.out=2000,by="1
day"),cbind(matrix(sample(10:120,2000*300,replace=TRUE),ncol=300),matrix(sample(0:6,2000*300,replace=TRUE),ncol=300)))
?colnames(df2)[2:301]<- paste0("P",1:300)
?colnames(df2)[302:601]<- paste0("OF",1:300)
?df3<- df2
df2[,602:901]<-NA
?colnames(df2)[602:901]<- paste0("newPrice",1:300)
?system.time({
?for(j in grep("^P",colnames(df2))) {
? df2[j+600] = df2[df2[,j+300] + row(df2)[,j], j]
? }
?})
#?? user? system elapsed
?# 9.584?? 0.000?? 9.601
vec1<- 6:1 ##change values according to the range of actual values in your
rows.
?vec2<- 1995:2000 ##change accordingly. If the maximum value is say 100, take
100 rows from the tail end.? Change the vec1 also so that both are of the same
length
system.time({
?df3[vec2,grep("OF",colnames(df3))]<-
t(sapply(seq_along(vec1),function(i)
{x1<-as.matrix(df3[vec2[i],grep("OF",colnames(df3))]);
x1[x1>=vec1[i]]<-NA; x1}))
?indx1<- unlist(df3[,grep("OF",colnames(df3))],use.names=FALSE)
?val1<- unlist(df3[,grep("P",colnames(df3))],use.names=FALSE)
? df3[,602:901]<- val1[indx1+seq_along(indx1)]
? colnames(df3)[602:901]<- colnames(df2)[602:901]
?})
#?? user? system elapsed
?# 0.552?? 0.000?? 0.553
identical(df2[,602:901],df3[,602:901])
#[1] TRUE
A.K.
Ira,
No problem.
If you change the ?for() loop to ?lapply, there should be increase in speed
(Usually, it is not the case.? Here it is does, but still not as good in terms
of speed as the method I showed).
df1<- structure(list(Dates = structure(c(13151, 13152, 13153, 13154,
?13157, 13158, 13159, 13160, 13161, 13164), class = "Date"), P1 =
c(10,
?13, 16, 19, 22, 25, 28, 31, 34, 37), P2 = c(100, 102, 104, 106,
?108, 110, 112, 114, 116, 118), P3 = c(90, 94, 98, 102, 106, 110,
?114, 118, 122, 126), P4 = c(70, 75, 80, 85, 90, 95, 100, 105,
?110, 115), OF1 = c(3, 3, 4, 5, 2, 2, 2, 1, 1, 5), OF2 = c(5,
?3, 4, 2, 1, 2, 2, 1, 1, 0), OF3 = c(4, 3, 4, 1, 3, 2, 2, 1, 1,
?0), OF4 = c(3, 5, 4, 2, 3, 1, 2, 1, 1, 0)), .Names = c("Dates",
?"P1", "P2", "P3", "P4",
"OF1", "OF2", "OF3", "OF4"), row.names =
c(NA,
?-10L), class = "data.frame")
df1$OF2[9]<-4
df2<- df1
?df2[,10:13]<- NA
colnames(df2)[10:13]<- paste0("newPrice",1:4)
##your code
for(j in 2:5) {
?df2[j+8] = df2[df2[,j+4] + row(df2)[,j], j]
?}
#using ?lapply()
?df1[,10:13]<-lapply(2:5,function(j) {df1[df1[,j+4]+row(df2)[,j],j]})
colnames(df1)[10:13]<- colnames(df2)[10:13]
?identical(df1,df2)
#[1] TRUE
#######Speed check:
set.seed(29)
?df2<-
data.frame(Dates=seq(as.Date("2006-01-03"),length.out=2000,by="1
day"),cbind(matrix(sample(10:120,2000*300,replace=TRUE),ncol=300),matrix(sample(0:6,2000*300,replace=TRUE),ncol=300)))
?colnames(df2)[2:301]<- paste0("P",1:300)
?colnames(df2)[302:601]<- paste0("OF",1:300)
?df3<- df2
df2[,602:901]<-NA
?colnames(df2)[602:901]<- paste0("newPrice",1:300)
system.time({
?for(j in grep("^P",colnames(df2))) {
? df2[j+600] = df2[df2[,j+300] + row(df2)[,j], j]
? }
?})
?# user? system elapsed
?#11.652?? 0.148? 11.822
system.time({df3[,602:901]<-lapply(2:301,function(j)
{df3[df3[,j+300]+row(df3)[,j],j]}) })
#? user? system elapsed
#? 2.960?? 0.000?? 2.962
colnames(df3)[602:901]<- colnames(df2)[602:901]
?identical(df2,df3)
#[1] TRUE
A.K.
?
________________________________
From: Ira Sharenow <irasharenow100 at yahoo.com>
To: arun <smartpink111 at yahoo.com>
Sent: Sunday, September 22, 2013 10:49 AM
Subject: Re: [R] Obtaining data from a different row of data frame
Arun,
Thanks for the time you spent helping me.
I always learned to use the apply family (but maybe your strategies are faster),
and now I think I am going to learn Hadley Wickham?s methods. Right now I need
to do other parts of the project. In a few days I will take another look at your
code to see if I can get more out of my code.
For my current project once I am finished my boss will use my code and possibly
modify it, so speed is just one factor. Transparency and his future coding time
is another consideration. I need to balance things off.
I need tolerable speed and relatively easy to understand code. It is an
interesting trade off.
Thank again for your help. I?ll get back to you when I take another look at the
details of what you wrote.
Ira
On 9/21/2013 11:27 PM, arun wrote:
HI, A modified code to avoid the ?sapply()
df1<- structure(list(Dates = structure(c(13151, 13152, 13153, 13154,
?13157, 13158, 13159, 13160, 13161, 13164), class = "Date"), P1 =
c(10,
?13, 16, 19, 22, 25, 28, 31, 34, 37), P2 = c(100, 102, 104, 106,
?108, 110, 112, 114, 116, 118), P3 = c(90, 94, 98, 102, 106, 110,
?114, 118, 122, 126), P4 = c(70, 75, 80, 85, 90, 95, 100, 105,
?110, 115), OF1 = c(3, 3, 4, 5, 2, 2, 2, 1, 1, 5), OF2 = c(5,
?3, 4, 2, 1, 2, 2, 1, 1, 0), OF3 = c(4, 3, 4, 1, 3, 2, 2, 1, 1,
?0), OF4 = c(3, 5, 4, 2, 3, 1, 2, 1, 1, 0)), .Names = c("Dates",
?"P1", "P2", "P3", "P4",
"OF1", "OF2", "OF3", "OF4"), row.names =
c(NA,
?-10L), class = "data.frame")
df1$OF2[9]<-4 df2<- df1
?df2[,10:13]<- NA
colnames(df2)[10:13]<- paste0("newPrice",1:4) ##your code for(j in
2:5) {
?df2[j+8] = df2[df2[,j+4] + row(df2)[,j], j]
?}
indx1<- unlist(df1[,grep("OF",colnames(df1))],use.names=FALSE)
?indx1[rep(seq(nrow(df1)),4)%in% 6:10][indx1[rep(seq(nrow(df1)),4)%in% 6:10]-
rep(5:1,4)>=0]<- NA val1<-
unlist(df1[,grep("P",colnames(df1))],use.names=FALSE)
?df1[,10:13]<- val1[indx1+seq_along(indx1)]
?colnames(df1)[10:13]<- colnames(df2)[10:13]
identical(df1[,10:13],df2[,10:13])
#[1] TRUE ###On a bigger dataset:
set.seed(29)
?df2<-
data.frame(Dates=seq(as.Date("2006-01-03"),length.out=2000,by="1
day"),cbind(matrix(sample(10:120,2000*300,replace=TRUE),ncol=300),matrix(sample(0:6,2000*300,replace=TRUE),ncol=300)))
?colnames(df2)[2:301]<- paste0("P",1:300)
?colnames(df2)[302:601]<- paste0("OF",1:300)
?df3<- df2 df2[,602:901]<-NA
?colnames(df2)[602:901]<- paste0("newPrice",1:300)
?system.time({
?for(j in grep("^P",colnames(df2))) {
? df2[j+600] = df2[df2[,j+300] + row(df2)[,j], j]
? }
?})
#?? user? system elapsed
?#? 8.508?? 0.000?? 8.523??colN_OF<-
ncol(df3[,grep("OF",colnames(df3))])
system.time({
?indx1<- unlist(df3[,grep("OF",colnames(df3))],use.names=FALSE)
?indx1[rep(seq(nrow(df3)),colN_OF) %in%
1995:2000][indx1[rep(seq(nrow(df3)),colN_OF) %in% 1995:2000] -
rep(6:1,colN_OF)>=0] <-NA
? val1<- unlist(df3[,grep("P",colnames(df3))],use.names=FALSE)
? df3[,602:901]<- val1[indx1+seq_along(indx1)]
? colnames(df3)[602:901]<- colnames(df2)[602:901]
?})
#? user? system elapsed
#? 0.568?? 0.000?? 0.569???identical(df2,df3)
#[1] TRUE A.K. ----- Original Message -----
From: arun <smartpink111 at yahoo.com> To: Ira Sharenow <irasharenow100
at yahoo.com> Cc:
Sent: Sunday, September 22, 2013 1:28 AM
Subject: Re: [R] Obtaining data from a different row of data frame Ira, I tried
with a bigger dataset to look for any errors in the code:
set.seed(29)
?df2<-
data.frame(Dates=seq(as.Date("2006-01-03"),length.out=2000,by="1
day"),cbind(matrix(sample(10:120,2000*300,replace=TRUE),ncol=300),matrix(sample(0:6,2000*300,replace=TRUE),ncol=300)))
?colnames(df2)[2:301]<- paste0("P",1:300)
?colnames(df2)[302:601]<- paste0("OF",1:300)
?df3<- df2 df2[,602:901]<-NA
?colnames(df2)[602:901]<- paste0("newPrice",1:300)
?system.time({
?for(j in grep("^P",colnames(df2))) {
? df2[j+600] = df2[df2[,j+300] + row(df2)[,j], j]
? }
?})
#?? user? system elapsed
?# 9.584?? 0.000?? 9.601??vec1<- 6:1 ##change values according to the range
of actual values in your rows.
?vec2<- 1995:2000 ##change accordingly. If the maximum value is say 100, take
100 rows from the tail end.? Change the vec1 also so that both are of the same
length system.time({
?df3[vec2,grep("OF",colnames(df3))]<-
t(sapply(seq_along(vec1),function(i)
{x1<-as.matrix(df3[vec2[i],grep("OF",colnames(df3))]);
x1[x1>=vec1[i]]<-NA; x1}))
?indx1<- unlist(df3[,grep("OF",colnames(df3))],use.names=FALSE)
?val1<- unlist(df3[,grep("P",colnames(df3))],use.names=FALSE)
? df3[,602:901]<- val1[indx1+seq_along(indx1)]
? colnames(df3)[602:901]<- colnames(df2)[602:901]
?})
#?? user? system elapsed
?# 0.552?? 0.000?? 0.553??identical(df2[,602:901],df3[,602:901])
#[1] TRUE A.K. ??????