Rolf,
It's great that you provided example data and code. It's helpful if you
provide data in a way that's easy for others to run. For example, you can
use the output from the call to the dput() function.
dput(dtot)
dput(newtot)
Then you could share the example data like this:
# TRAINING data...
# dput()-style definition of the example training set: six rows (four for
# Stock.1, two for Stock.2), ten numeric indicator columns, and the factor
# response Action with levels "Buy" and "Notbuy".
dtot <- structure(list(
  STOCK.NAME = c("Stock.1", "Stock.1", "Stock.1", "Stock.1", "Stock.2",
    "Stock.2"),
  Indicator1 = c(0.53464, 0.907586, 0.682471, 0.156847, 0.177951, 0.525604),
  Indicator2 = c(0.809136, 0.421417, 0.501301, 0.057765, 0.506193, 0.152735),
  Indicator3 = c(0.090641, 0.292742, 0.160167, 0.345092, 0.075647, 0.033175),
  Indicator4 = c(0.212288, 0.78914, 0.753329, 0.148373, 0.719628, 0.780946),
  Indicator5 = c(0.817402, 0.263374, 0.426113, 0.79769, 0.52613, 0.037649),
  Indicator6 = c(0.976926, 0.597003, 0.874266, 0.927548, 0.131471, 0.733622),
  Indicator7 = c(0.383471, 0.420898, 0.752404, 0.797175, 0.140883, 0.128549),
  Indicator8 = c(0.119862, 0.582622, 0.535917, 0.4544, 0.926419, 0.763801),
  Indicator9 = c(0.369533, 0.666901, 0.26929, 0.135831, 0.393547, 0.493194),
  Indicator10 = c(0.374066, 0.71218, 0.30212, 0.767282, 0.292262, 0.008631),
  Action = structure(c(1L, 2L, 2L, 1L, 2L, 1L), .Label = c("Buy", "Notbuy"),
    class = "factor")),
  .Names = c("STOCK.NAME", "Indicator1", "Indicator2", "Indicator3",
    "Indicator4", "Indicator5", "Indicator6", "Indicator7", "Indicator8",
    "Indicator9", "Indicator10", "Action"),
  row.names = c(NA, -6L), class = "data.frame")
# NEW data...
# dput()-style definition of the example data to score: one row per stock,
# the same ten indicator columns as the training set, and an Action column
# that is NA because the outcome is unknown.
newtot <- structure(list(
  STOCK.NAME = c("Stock.1", "Stock.2"),
  Indicator1 = c(0.53464, 0.907586),
  Indicator2 = c(0.809136, 0.421417),
  Indicator3 = c(0.090641, 0.292742),
  Indicator4 = c(0.212288, 0.78914),
  Indicator5 = c(0.817402, 0.263374),
  Indicator6 = c(0.976926, 0.597003),
  Indicator7 = c(0.383471, 0.420898),
  Indicator8 = c(0.119862, 0.582622),
  Indicator9 = c(0.369533, 0.666901),
  Indicator10 = c(0.374066, 0.71218),
  Action = c(NA, NA)),
  .Names = c("STOCK.NAME", "Indicator1", "Indicator2", "Indicator3",
    "Indicator4", "Indicator5", "Indicator6", "Indicator7", "Indicator8",
    "Indicator9", "Indicator10", "Action"),
  class = "data.frame", row.names = c(NA, -2L))
Your code looks okay to me for the most part. There were a few things that
I simplified, but that wouldn't have any effect on the output. The only
sticking point I noticed was the line in your for loop that looks like this:
test <- rbind(dtot[nrow(dtot),-c(1,12)], newtot[i,-c(1,12)])
Why are you combining the last row of the training data with the test data?
In the case of i=1 this last row of dtot is not even of the same stock as
the one subsetted by the for loop. I changed the code so that only the
test data were used here.
I am not a regular user of randomForest() so I am not sure where you are
seeing the number of rows that are used, which is the issue you raised.
Where do you see the problem?
# Fit one random forest per stock, using only that stock's training rows,
# and score that stock's row of the new data. Expects the data frames dtot
# (training) and newtot (new data) to exist in the workspace, with the stock
# name in column 1 and Action in column 12.
library(randomForest)

stk <- unique(dtot[, 1])
# One row per stock; columns hold the predicted class probabilities.
res <- matrix(NA, length(stk), 2,
  dimnames = list(NULL, c("Buy", "Not Buy")))
for (i in seq_along(stk)) {
  # Select training rows by comparing the stock-name COLUMN with the current
  # stock. Comparing stk with itself gives a logical of length(stk) that R
  # silently recycles down the rows of dtot, picking the wrong rows.
  d <- dtot[dtot[, 1] == stk[i], -1]
  # Build the Random Forest model (same seed for every stock, so each fit is
  # reproducible on its own)
  set.seed(42)
  rf <- randomForest(Action ~ ., data = d, ntree = 500, mtry = 3,
    importance = TRUE, na.action = na.roughfix, replace = FALSE)
  # Look the new-data row up by stock name rather than by position, so the
  # result does not depend on newtot being in the same order as stk.
  j <- match(stk[i], newtot[, 1])
  if (is.na(j)) {
    stop("No row in newtot for stock ", stk[i], call. = FALSE)
  }
  test <- newtot[j, -c(1, 12)]
  # Obtain probability scores for the Random Forest model
  res[i, ] <- predict(rf, test, type = "prob")
}
data.frame(stk, res)
Hope this helps.
Jean
On Wed, Aug 7, 2013 at 4:59 AM, Rolf Edberg <rolfe@algonet.se> wrote:
> Hi
>
>
>
> I have a problem with getting stuck in the number of rows that the first
> data group has when looping.
>
>
>
> Let me explain the program:
>
> I want to run randomForest on 200 stocks and get scores of each of them.
> First I shall point at the training data set file(with data from 200
> stocks)
> then I shall point at the predict dataset(with 200 rows of data from the
> 200
> stocks with unknown target).
>
> At the end I shall point at a place to save the file with the score for
> each
> stock.
>
>
>
> This works almost. The problem is that the number of rows of data from the
> stocks in the training file differs on every stock. One time it is 50 rows
> for one stock and for another stock it can be 100 rows of data. When I run
> this code the number of rows from the first stock is used on all stocks.
> For
> example if stock 1 has 50 rows the calculation on stock2 also will use 50
> rows.
>
>
>
> So the score in the result file differs from if I do the calculations one
> stock at the time.
>
>
>
> What am I doing wrong?
>
>
>
> Kind regards
>
> Rolf
>
>
>
>
>
> The data in the files looks like this:
>
> Train file:
>
>
> STOCK.NAME
>
> Indicator1
>
> Indicator2
>
> Indicator3
>
> Indicator4
>
> Indicator5
>
> Indicator6
>
> Indicator7
>
> Indicator8
>
> Indicator9
>
> Indicator10
>
> Action
>
>
> Stock.1
>
> 0.53464
>
> 0.809136
>
> 0.090641
>
> 0.212288
>
> 0.817402
>
> 0.976926
>
> 0.383471
>
> 0.119862
>
> 0.369533
>
> 0.374066
>
> Buy
>
>
> Stock.1
>
> 0.907586
>
> 0.421417
>
> 0.292742
>
> 0.78914
>
> 0.263374
>
> 0.597003
>
> 0.420898
>
> 0.582622
>
> 0.666901
>
> 0.71218
>
> Notbuy
>
>
> Stock.1
>
> 0.682471
>
> 0.501301
>
> 0.160167
>
> 0.753329
>
> 0.426113
>
> 0.874266
>
> 0.752404
>
> 0.535917
>
> 0.26929
>
> 0.30212
>
> Notbuy
>
>
> Stock.1
>
> 0.156847
>
> 0.057765
>
> 0.345092
>
> 0.148373
>
> 0.79769
>
> 0.927548
>
> 0.797175
>
> 0.4544
>
> 0.135831
>
> 0.767282
>
> Buy
>
>
> Stock.2
>
> 0.177951
>
> 0.506193
>
> 0.075647
>
> 0.719628
>
> 0.52613
>
> 0.131471
>
> 0.140883
>
> 0.926419
>
> 0.393547
>
> 0.292262
>
> Notbuy
>
>
> Stock.2
>
> 0.525604
>
> 0.152735
>
> 0.033175
>
> 0.780946
>
> 0.037649
>
> 0.733622
>
> 0.128549
>
> 0.763801
>
> 0.493194
>
> 0.008631
>
> Buy
>
> Predict file:
>
>
> STOCK.NAME
>
> Indicator1
>
> Indicator2
>
> Indicator3
>
> Indicator4
>
> Indicator5
>
> Indicator6
>
> Indicator7
>
> Indicator8
>
> Indicator9
>
> Indicator10
>
> Action
>
>
> Stock.1
>
> 0.53464
>
> 0.809136
>
> 0.090641
>
> 0.212288
>
> 0.817402
>
> 0.976926
>
> 0.383471
>
> 0.119862
>
> 0.369533
>
> 0.374066
>
>
>
> Stock.2
>
> 0.907586
>
> 0.421417
>
> 0.292742
>
> 0.78914
>
> 0.263374
>
> 0.597003
>
> 0.420898
>
> 0.582622
>
> 0.666901
>
> 0.71218
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
> rm(list=ls())
>
>
>
> require(randomForest, quietly=TRUE)
>
>
>
> #Reading the TRAINING data...
>
> dtot=read.csv(choose.files(caption="Choose the TRAINING data..."))
>
>
>
> #Reading the NEW data...
>
> newtot=read.csv(choose.files(caption="Choose the NEW data..."))
>
>
>
> stk=names(table(dtot[,1]))
>
>
>
> date=paste(
>
> strsplit(as.character(Sys.Date()),"-")[[1]][1],
>
> strsplit(as.character(Sys.Date()),"-")[[1]][2],
>
> strsplit(as.character(Sys.Date()),"-")[[1]][3],
>
> sep="")
>
>
>
> res=matrix(0,length(stk),2)
>
>
>
> for (i in 1:length(stk))
>
> {
>
>
>
> d=dtot[which(dtot[,1]==stk[i]),-1]
>
> #write.csv(d,paste(stk[i],".csv",sep=""),row.names=F)
>
>
> #write.csv(newtot[i,-c(1,12)],paste(stk[i],"_PRED.csv",sep=""),row.names=F)
>
> #}
>
>
>
> # Build the Random Forest model.
>
>
>
> set.seed(42)
>
> rf <- randomForest(Action ~ .,
>
> data=d,
>
> ntree=500,
>
> mtry=3,
>
> importance=TRUE,
>
> na.action=na.roughfix,
>
> replace=FALSE)
>
>
>
> test=rbind(dtot[nrow(dtot),-c(1,12)],newtot[i,-c(1,12)])
>
>
>
> # Obtain probability scores for the Random Forest model
>
> res[i,2]=predict(rf, test, type="prob")[2,2]
>
>
>
>
>
> }
>
>
>
> res=data.frame(res)
>
> names(res)=c("Stockname","Score")
>
> res[,1]=stk
>
>
>
> # Output the combined data.
>
>
>
> setwd(choose.dir(caption="Choose the FOLDER where you want the send the results..."))
>
> write.csv(res, file=paste("Stock1_newdata_score_all_",date,".csv",sep=""), row.names=FALSE)
>
>
>
>
> [[alternative HTML version deleted]]
>
> ______________________________________________
> R-help@r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide
> http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>
[[alternative HTML version deleted]]