It's great that you provided example data and code. It's helpful if you
provide data in a way that's easy for others to run. For example, you can
use the output from the call to the dput() function.
Then you could share the example data like this:
# TRAINING data...
# Six rows for two stocks (four for Stock.1, two for Stock.2): ten numeric
# indicator columns plus the factor response Action ("Buy" / "Notbuy").
dtot <- structure(list(
  STOCK.NAME = c("Stock.1", "Stock.1", "Stock.1",
    "Stock.1", "Stock.2", "Stock.2"),
  Indicator1 = c(0.53464, 0.907586, 0.682471, 0.156847, 0.177951, 0.525604),
  Indicator2 = c(0.809136, 0.421417, 0.501301, 0.057765, 0.506193, 0.152735),
  Indicator3 = c(0.090641, 0.292742, 0.160167, 0.345092, 0.075647, 0.033175),
  Indicator4 = c(0.212288, 0.78914, 0.753329, 0.148373, 0.719628, 0.780946),
  Indicator5 = c(0.817402, 0.263374, 0.426113, 0.79769, 0.52613, 0.037649),
  Indicator6 = c(0.976926, 0.597003, 0.874266, 0.927548, 0.131471, 0.733622),
  Indicator7 = c(0.383471, 0.420898, 0.752404, 0.797175, 0.140883, 0.128549),
  Indicator8 = c(0.119862, 0.582622, 0.535917, 0.4544, 0.926419, 0.763801),
  Indicator9 = c(0.369533, 0.666901, 0.26929, 0.135831, 0.393547, 0.493194),
  Indicator10 = c(0.374066, 0.71218, 0.30212, 0.767282, 0.292262, 0.008631),
  # Integer codes index into .Label: 1 = "Buy", 2 = "Notbuy".
  Action = structure(c(1L, 2L, 2L, 1L, 2L, 1L),
    .Label = c("Buy", "Notbuy"), class = "factor")),
  .Names = c("STOCK.NAME",
    "Indicator1", "Indicator2", "Indicator3", "Indicator4", "Indicator5",
    "Indicator6", "Indicator7", "Indicator8", "Indicator9", "Indicator10",
    "Action"),
  row.names = c(NA, -6L), class = "data.frame")
# NEW data...
# One row per stock to be scored; same columns as the training data, with
# Action left as NA because the outcome is unknown.
newtot <- structure(list(
  STOCK.NAME = c("Stock.1", "Stock.2"),
  Indicator1 = c(0.53464, 0.907586),
  Indicator2 = c(0.809136, 0.421417),
  Indicator3 = c(0.090641, 0.292742),
  Indicator4 = c(0.212288, 0.78914),
  Indicator5 = c(0.817402, 0.263374),
  Indicator6 = c(0.976926, 0.597003),
  Indicator7 = c(0.383471, 0.420898),
  Indicator8 = c(0.119862, 0.582622),
  Indicator9 = c(0.369533, 0.666901),
  Indicator10 = c(0.374066, 0.71218),
  Action = c(NA, NA)),
  .Names = c("STOCK.NAME",
    "Indicator1", "Indicator2", "Indicator3", "Indicator4", "Indicator5",
    "Indicator6", "Indicator7", "Indicator8", "Indicator9", "Indicator10",
    "Action"),
  class = "data.frame", row.names = c(NA, -2L))
Your code looks okay to me for the most part. There were a few things that
I simplified, but that wouldn't have any effect on the output. The only
sticking point I noticed was the line in your for loop that looks like this:
test <- rbind(dtot[nrow(dtot),-c(1,12)], newtot[i,-c(1,12)])
Why are you combining the last row of the training data with the test data?
In the case of i=1 this last row of dtot is not even of the same stock as
the one subsetted by the for loop. I changed the code so that only the
test data were used here.
I am not a regular user of randomForest() so I am not sure where you are
seeing the number of rows that are used, which is the issue you raised.
Where do you see the problem?
# Fit one random forest per stock, using only that stock's rows of dtot, and
# collect the predicted class probabilities for the stock's row in newtot.
library(randomForest)

stk <- unique(dtot[, 1])
# One row of probabilities per stock; a row stays NA if the stock has no
# matching row in newtot.
res <- matrix(NA, length(stk), 2, dimnames = list(NULL, c("Buy",
  "Not Buy")))
for (i in seq_along(stk)) {
  # Look the stock up by name so the result does not depend on newtot being
  # in the same order as stk.
  new_row <- match(stk[i], newtot[, 1])
  if (is.na(new_row)) {
    next
  }
  # Compare the stock column of dtot (not the unique names in stk) with the
  # current stock; a length(stk) logical vector would be recycled over the
  # rows of dtot and pull in rows from other stocks.
  d <- dtot[dtot[, 1] == stk[i], -1]
  # Build the Random Forest model
  rf <- randomForest(Action ~ ., data = d, ntree = 500, mtry = 3,
    importance = TRUE, na.action = na.roughfix, replace = FALSE)
  # Drop the stock name (column 1) and the unknown Action (column 12)
  test <- newtot[new_row, -c(1, 12)]
  # Obtain probability scores for the Random Forest model
  res[i, ] <- predict(rf, test, type = "prob")
}
data.frame(stk, res)
Hope this helps.
On Wed, Aug 7, 2013 at 4:59 AM, Rolf Edberg <> wrote:
> Hi
> I have a problem with getting stuck in the number of rows that the first
> data group has when looping.
> Let me explain the program:
> I want to run randomForest on 200 stocks and get scores of each of them.
> First I shall point at the training data set file(with data from 200
> stocks)
> then I shall point at the predict dataset(with 200 rows of data from the
> 200
> stocks with unknown target).
> At the end I shall point at a place to save the file with the score for
> each
> stock.
> This works almost. The problem is that the number of rows of data from the
> stocks in the training file differs on every stock. One time it is 50 rows
> for one stock and for another stock it can be 100 rows of data. When I run
> this code the number of rows from the first stock is used on all stocks.
> For
> example if stock 1 has 50 rows the calculation on stock2 also will use 50
> rows.
> So the score in the result file differs from if I do the calculations one
> stock at the time.
> What am I doing wrong?
> Kind regards
> Rolf
> The data in the files looks like this:
> Train file:
> Indicator1
> Indicator2
> Indicator3
> Indicator4
> Indicator5
> Indicator6
> Indicator7
> Indicator8
> Indicator9
> Indicator10
> Action
> Stock.1
> 0.53464
> 0.809136
> 0.090641
> 0.212288
> 0.817402
> 0.976926
> 0.383471
> 0.119862
> 0.369533
> 0.374066
> Buy
> Stock.1
> 0.907586
> 0.421417
> 0.292742
> 0.78914
> 0.263374
> 0.597003
> 0.420898
> 0.582622
> 0.666901
> 0.71218
> Notbuy
> Stock.1
> 0.682471
> 0.501301
> 0.160167
> 0.753329
> 0.426113
> 0.874266
> 0.752404
> 0.535917
> 0.26929
> 0.30212
> Notbuy
> Stock.1
> 0.156847
> 0.057765
> 0.345092
> 0.148373
> 0.79769
> 0.927548
> 0.797175
> 0.4544
> 0.135831
> 0.767282
> Buy
> Stock.2
> 0.177951
> 0.506193
> 0.075647
> 0.719628
> 0.52613
> 0.131471
> 0.140883
> 0.926419
> 0.393547
> 0.292262
> Notbuy
> Stock.2
> 0.525604
> 0.152735
> 0.033175
> 0.780946
> 0.037649
> 0.733622
> 0.128549
> 0.763801
> 0.493194
> 0.008631
> Buy
> Predict file:
> Indicator1
> Indicator2
> Indicator3
> Indicator4
> Indicator5
> Indicator6
> Indicator7
> Indicator8
> Indicator9
> Indicator10
> Action
> Stock.1
> 0.53464
> 0.809136
> 0.090641
> 0.212288
> 0.817402
> 0.976926
> 0.383471
> 0.119862
> 0.369533
> 0.374066
> Stock.2
> 0.907586
> 0.421417
> 0.292742
> 0.78914
> 0.263374
> 0.597003
> 0.420898
> 0.582622
> 0.666901
> 0.71218
> rm(list=ls())
> require(randomForest, quietly=TRUE)
> #Reading the TRAINING data...
> dtot=read.csv(choose.files(caption="Choose the TRAINING data..."))
> #Reading the NEW data...
> newtot=read.csv(choose.files(caption="Choose the NEW data..."))
> stk=names(table(dtot[,1]))
> date=paste(
> strsplit(as.character(Sys.Date()),"-")[[1]][1],
> strsplit(as.character(Sys.Date()),"-")[[1]][2],
> strsplit(as.character(Sys.Date()),"-")[[1]][3],
> sep="")
> res=matrix(0,length(stk),2)
> for (i in 1:length(stk))
> {
> d=dtot[which(dtot[,1]==stk[i]),-1]
> #write.csv(d,paste(stk[i],".csv",sep=""),row.names=F)
> #}
> # Build the Random Forest model.
> set.seed(42)
> rf <- randomForest(Action ~ .,
> data=d,
> ntree=500,
> mtry=3,
> importance=TRUE,
> na.action=na.roughfix,
> replace=FALSE)
> test=rbind(dtot[nrow(dtot),-c(1,12)],newtot[i,-c(1,12)])
> # Obtain probability scores for the Random Forest model
> res[i,2]=predict(rf, test, type="prob")[2,2]
> }
> res=data.frame(res)
> names(res)=c("Stockname","Score")
> res[,1]=stk
> # Output the combined data.
> setwd(choose.dir(caption="Choose the FOLDER where you want the send
> results..."))
> write.csv(res,
> row.names=FALSE)
> [[alternative HTML version deleted]]
> ______________________________________________
> mailing list
> PLEASE do read the posting guide
> and provide commented, minimal, self-contained, reproducible code.
[[alternative HTML version deleted]]