Mojca ZELNIKAR
2012-Aug-07 09:21 UTC
[R] predicting test dataset response from training dataset with randomForest
Hi, I am new to R, so I apologize if this is trivial. I am trying to predict the resistance or susceptibility of my sequences to a certain drug with the randomForest function, using a file that lists the amino acid at each position in the protein. I ran the following: > library(randomForest) > > path <- "C:\\..." > path2 <- "..." > name <- "..." > > actualFileName <- paste(path, path2, name, ".txt", sep="") > > # reading in the training dataset > dat1 <- read.table(actualFileName, header=TRUE, sep="\t", > colClasses="character") > > head(dat1)X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 ... SR 1 M K V K L L V L L C T F T A T Y A ... suscep 2 M K V K L L V L L C T F A A T Y A ... suscep 3 M K V K L L V L L C T F T A T Y A ... resist ...> # some of the important sites identified by fisher test > dat1$X13 <- as.factor(dat1$X13) > dat1$X52 <- as.factor(dat1$X52) > dat1$X53 <- as.factor(dat1$X53) > dat1$X64 <- as.factor(dat1$X64) > dat1$X85 <- as.factor(dat1$X85) > dat1$X99 <- as.factor(dat1$X99) > dat1$X111 <- as.factor(dat1$X111) > dat1$X142 <- as.factor(dat1$X142) > dat1$X157 <- as.factor(dat1$X157) > dat1$X158 <- as.factor(dat1$X158) > dat1$X162 <- as.factor(dat1$X162) > dat1$X169 <- as.factor(dat1$X169) > dat1$X200 <- as.factor(dat1$X200) > dat1$X202 <- as.factor(dat1$X202) > dat1$X203 <- as.factor(dat1$X203) > dat1$X205 <- as.factor(dat1$X205) > dat1$X206 <- as.factor(dat1$X206) > dat1$X209 <- as.factor(dat1$X209) > dat1$X210 <- as.factor(dat1$X210) > dat1$X225 <- as.factor(dat1$X225) > dat1$X269 <- as.factor(dat1$X269) > dat1$X283 <- as.factor(dat1$X283) > dat1$X290 <- as.factor(dat1$X290) > dat1$X432 <- as.factor(dat1$X432) > dat1$X434 <- as.factor(dat1$X434) > dat1$X455 <- as.factor(dat1$X455) > dat1$X467 <- as.factor(dat1$X467) > dat1$X512 <- as.factor(dat1$X512) > dat1$SR <- as.factor(dat1$SR) > > > > dat1.rf <-randomForest(SR ~ X13+ X52+ X53+ X64+ X85+ X99+ X111+ > X142+ X157+ X158+ X162+ X169+ X200++ X202+ X203+ X205+ X206+ X209+ X210+ X225+ X269+ 
X283+ X290+ X432+ X434+ X455+ X467+ X512, data=dat1, importance=TRUE, + proximity=TRUE, varUsed=TRUE, ntree=5000, varImpPlot=TRUE)> > > print(dat1.rf)> varImpPlot(dat1.rf) > > varUsed(dat1.rf, by.tree=FALSE, count=TRUE)> > MDSplot(dat1.rf, dat1$SR, palette=rep(1, 2),+ pch=as.numeric(dat1$SR))> > > path3 <- "C:\\Users..." > path4 <- "..." > name2 <- "..." > > # reading in the test dataset > actualFileName2 <- paste(path3, path4, name2, ".txt", sep="") > > dat2 <- read.table(actualFileName2, header=TRUE, sep="\t", > colClasses="character") >> > dat2$X13 <- as.factor(dat2$X13) > dat2$X52 <- as.factor(dat2$X52) > dat2$X53 <- as.factor(dat2$X53) > dat2$X64 <- as.factor(dat2$X64) > dat2$X85 <- as.factor(dat2$X85) > dat2$X99 <- as.factor(dat2$X99) > dat2$X111 <- as.factor(dat2$X111) > dat2$X142 <- as.factor(dat2$X142) > dat2$X157 <- as.factor(dat2$X157) > dat2$X158 <- as.factor(dat2$X158) > dat2$X162 <- as.factor(dat2$X162) > dat2$X169 <- as.factor(dat2$X169) > dat2$X200 <- as.factor(dat2$X200) > dat2$X202 <- as.factor(dat2$X202) > dat2$X203 <- as.factor(dat2$X203) > dat2$X205 <- as.factor(dat2$X205) > dat2$X206 <- as.factor(dat2$X206) > dat2$X209 <- as.factor(dat2$X209) > dat2$X210 <- as.factor(dat2$X210) > dat2$X225 <- as.factor(dat2$X225) > dat2$X269 <- as.factor(dat2$X269) > dat2$X283 <- as.factor(dat2$X283) > dat2$X290 <- as.factor(dat2$X290) > dat2$X432 <- as.factor(dat2$X432) > dat2$X434 <- as.factor(dat2$X434) > dat2$X455 <- as.factor(dat2$X455) > dat2$X467 <- as.factor(dat2$X467) > dat2$X512 <- as.factor(dat2$X512) > dat2$SR <- as.factor(dat2$SR) > > > > dat2.pred<-predict(dat1.rf, dat2, type="response", norm.votes=TRUE, > predict.all=FALSE, proximity=FALSE, nodes=FALSE)Error in predict.randomForest(dat1.rf, dat2, type = "response", norm.votes = TRUE, : New factor levels not present in the training data> The thing is that each of the amino acid positions in the test dataset is also present in the training dataset. So I don't know how to deal with the error. 
Thank you very much. Kind regards, Mojca Zelnikar -- The University of Edinburgh is a charitable body, registered in Scotland, with registration number SC005336.