Hi I'm trying to set up a simulation problem without resorting to (m)any loops. I want to set entries in a data frame of zeros ('starts' in the code below) to 1 at certain points and the points have been randomly generated and stored in a separate data.frame ('sl'), which has the same number of columns. An example of the procedure is as follows: ml <- data.frame(matrix(sample(1:50,80, replace=TRUE),20,4)) mm <- apply(ml, 2, cumsum) starts<- data.frame(matrix(0,600,4)) I can achieve the result I want with a loop: for (i in 1:4){ lstarts[,i][mm[,i]] <-1 } But as I want to use a large number of columns I would like to do away with the loop Can anyone suggest how this might be done? Thanks in advance Brett Robinson ______________________________________________________________________ This email has been scanned by the Symantec Email Security.cloud service. For more information please visit http://www.symanteccloud.com ______________________________________________________________________ [[alternative HTML version deleted]]
Hi, Not sure this helps: ml <- data.frame(matrix(sample(1:50,80, replace=TRUE),20,4)) mm <- apply(ml, 2, cumsum) starts<- data.frame(matrix(0,600,4)) starts1<- data.frame(matrix(0,600,4)) for (i in 1:4){ starts1[,i][mm[,i]] <-1 } starts2<-as.data.frame(do.call(cbind,lapply(1:4,function(i) {starts[,i][mm[,i]]<-1;starts[,i]}))) colnames(starts2)<- colnames(starts1) identical(starts1,starts2) #[1] TRUE #loop should be fast compared to lapply ml <- data.frame(matrix(sample(1:1e4,1e3, replace=TRUE),100,4)) mm <- apply(ml, 2, cumsum) starts<- data.frame(matrix(0,1e6,4)) starts1<- data.frame(matrix(0,1e6,4)) system.time({for (i in 1:4){ starts1[,i][mm[,i]] <-1 }}) ?#user ?system elapsed? ? # 0.63 ? ?0.30 ? ?0.92? system.time(starts2<-as.data.frame(do.call(cbind,lapply(1:4,function(i) {starts[,i][mm[,i]]<-1;starts[,i]})))) # user ?system elapsed? ?# ?0.74 ? ?0.26 ? ?1.03? colnames(starts2)<- colnames(starts1) identical(starts1,starts2) #[1] TRUE A.K. ----- Original Message ----- From: Brett Robinson <brett.robinson at 7dials.com> To: "r-help at r-project.org" <r-help at r-project.org> Cc: Sent: Saturday, February 2, 2013 11:38 AM Subject: [R] vectorisation Hi I'm trying to set up a simulation problem without resorting to (m)any loops. I want to set entries in a data frame of zeros ('starts' in the code below) to 1 at certain points and the points have been randomly generated and stored in a separate data.frame ('sl'), which has the same number of columns. An example of the procedure is as follows: ml <- data.frame(matrix(sample(1:50,80, replace=TRUE),20,4)) mm <- apply(ml, 2, cumsum) starts<- data.frame(matrix(0,600,4)) I can achieve the result I want with a loop: for (i in 1:4){ lstarts[,i][mm[,i]] <-1 } But as I want to use a large number of columns I would like to do away with the loop Can anyone suggest how this might be done? Thanks in advance Brett Robinson ______________________________________________________________________ This email has been scanned by the Symantec Email Security.cloud service. For more information please visit http://www.symanteccloud.com ______________________________________________________________________ ??? [[alternative HTML version deleted]] ______________________________________________ R-help at r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.
Hello, Try the following. set.seed(4315) ml <- data.frame(matrix(sample(1:50,80, replace=TRUE),20,4)) mm <- apply(ml, 2, cumsum) s2 <- starts <- data.frame(matrix(0,600,4)) for (i in 1:4){ starts[,i][mm[,i]] <- 1 } s2[] <- lapply(seq_len(ncol(mm)), function(i) {s2[,i][mm[,i]] <- 1; s2[,i]}) identical(s2, starts) # TRUE Note that lapply is a loop in disguise. Hope this helps, Rui Barradas Em 02-02-2013 16:38, Brett Robinson escreveu:> Hi > I'm trying to set up a simulation problem without resorting to (m)any loops. I want to set entries in a data frame of zeros ('starts' in the code below) to 1 at certain points and the points have been randomly generated and stored in a separate data.frame ('sl'), which has the same number of columns. > > An example of the procedure is as follows: > ml <- data.frame(matrix(sample(1:50,80, replace=TRUE),20,4)) > mm <- apply(ml, 2, cumsum) > starts<- data.frame(matrix(0,600,4)) > > I can achieve the result I want with a loop: > for (i in 1:4){ > lstarts[,i][mm[,i]] <-1 > } > > But as I want to use a large number of columns I would like to do away with the loop > > Can anyone suggest how this might be done? > > Thanks in advance > > Brett Robinson > > ______________________________________________________________________ > This email has been scanned by the Symantec Email Security.cloud service. > For more information please visit http://www.symanteccloud.com > ______________________________________________________________________ > [[alternative HTML version deleted]] > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code. >
On 02-02-2013, at 17:38, Brett Robinson <brett.robinson at 7dials.com> wrote:> Hi > I'm trying to set up a simulation problem without resorting to (m)any loops. I want to set entries in a data frame of zeros ('starts' in the code below) to 1 at certain points and the points have been randomly generated and stored in a separate data.frame ('sl'), which has the same number of columns. > > An example of the procedure is as follows: > ml <- data.frame(matrix(sample(1:50,80, replace=TRUE),20,4)) > mm <- apply(ml, 2, cumsum) > starts<- data.frame(matrix(0,600,4)) > > I can achieve the result I want with a loop: > for (i in 1:4){ > lstarts[,i][mm[,i]] <-1 > } > > But as I want to use a large number of columns I would like to do away with the loop > > Can anyone suggest how this might be done?Another way is this f2 <- function(starts, mm) { mn <- cbind(as.vector(mm),rep(1:ncol(mm),each=nrow(mm))) x <- as.matrix(starts) x[mn] <- 1 as.data.frame(x) } starts2 <- f2(starts,mm) #> identical(starts2,starts1) # [1] TRUE Collect all the options presented so far in functions, use the compiler package to see if that helps and do some speed tests with Arun's parameters. # Brett f1 <- function(starts, mm) { for (i in 1:ncol(mm)){ starts[,i][mm[,i]] <-1 } starts } # Berend f2 <- function(starts, mm) { mn <- cbind(as.vector(mm),rep(1:ncol(mm),each=nrow(mm))) x <- as.matrix(starts) x[mn] <- 1 as.data.frame(x) } # Rui f3 <- function(s2,mm) { s2[] <- lapply(seq_len(ncol(mm)), function(i) {s2[,i][mm[,i]] <- 1; s2[,i]}) s2 } # Arun f4 <- function(starts,mm) { starts2 <- as.data.frame(do.call(cbind,lapply(1:ncol(mm),function(i) {starts[,i][mm[,i]]<-1;starts[,i]}))) colnames(starts2)<- colnames(starts) starts2 } library(compiler) f1c <- cmpfun(f1) f2c <- cmpfun(f2) f3c <- cmpfun(f3) f4c <- cmpfun(f4) library(rbenchmark) # Arun's test set.seed(11) starts <- data.frame(matrix(0,1e6,4)) ml <- data.frame(matrix(sample(1:1e4,1e3, replace=TRUE),100,4)) mm <- apply(ml, 2, cumsum) z1 <- f1(starts,mm) z2 <- f2(starts,mm) z3 <- f3(starts,mm) z4 <- f4(starts,mm) z1c <- f1c(starts,mm) z2c <- f2c(starts,mm) z3c <- f3c(starts,mm) z4c <- f4c(starts,mm) identical(z2,z1) identical(z3,z1) identical(z4,z1) identical(z1c,z1) identical(z2c,z1) identical(z3c,z1) identical(z4c,z1) benchmark( f1(starts,mm) , f2(starts,mm), f1c(starts,mm), f2c(starts,mm), f3(starts,mm) , f4(starts,mm), f3c(starts,mm), f4c(starts,mm), replications=1,order="relative", columns=c("test","relative","elapsed","replications")) Result: # > identical(z2,z1) # [1] TRUE # > identical(z3,z1) # [1] TRUE # > identical(z4,z1) # [1] TRUE # > identical(z1c,z1) # [1] TRUE # > identical(z2c,z1) # [1] TRUE # > identical(z3c,z1) # [1] TRUE # > identical(z4c,z1) # [1] TRUE # > # > benchmark( f1(starts,mm) , f2(starts,mm), # + f1c(starts,mm), f2c(starts,mm), # + f3(starts,mm) , f4(starts,mm), # + f3c(starts,mm), f4c(starts,mm), # + replications=1,order="relative", columns=c("test","relative","elapsed","replications")) # test relative elapsed replications # 2 f2(starts, mm) 1.000 0.195 1 # 4 f2c(starts, mm) 1.005 0.196 1 # 1 f1(starts, mm) 2.990 0.583 1 # 3 f1c(starts, mm) 3.082 0.601 1 # 7 f3c(starts, mm) 3.903 0.761 1 # 5 f3(starts, mm) 3.949 0.770 1 # 8 f4c(starts, mm) 4.436 0.865 1 # 6 f4(starts, mm) 4.462 0.870 1 Compiling doesn't deliver significant speed gains in this case. Function f2 is the quickest. Berend