Benjamin Caldwell
2013-Feb-12 05:22 UTC
[R] improving/speeding up a very large, slow simulation
Dear R help; I'll preface this by saying that the example I've provided below is pretty long, turgid, and otherwise a deep dive into a series of functions I wrote for a simulation study. It is, however, reproducible and self-contained. I'm trying to do my first simulation study that's quite big, and so I'll say that the output of this simulation as I'd like it to be is 9.375 million rows (9375 combinations of the inputs * 1000). So, first thing, maybe I should try to cut down on the number of dimensions and do it a piece at a time. If that's a conclusion people come to here, I'll look into paring down/doing it a chunk at a time. I'm hoping, though, that this is an opportunity for me to learn some more efficient, elegant ways to a. vectorize and b. use the apply functions, which still seem to elude me when it comes to their use with more complex, custom functions that have multiple inputs. If having looked the below over you can give me suggestions to help make this and future code I write run more quickly/efficiently, I will be most grateful, as as this is currently written it would take what looks like days to run a thousand simulations of each possible combination of variables of interest. Best Ben Caldwell ----------------------------------------------- #unpaired verification.plots<-rnorm(30, 100, 10) # writeClipboard(as.character(verification.plots)) unpaired.test<- function(verification.plots, project.n, project.mean, project.sd, allowed.deviation, project.acres, alpha=.05){ verification.plots <-as.numeric(as.character(verification.plots)) a <- qnorm(alpha/2) d <- alpha*project.mean # verifier plot number n<-length(verification.plots) verifier.plot.number <- c(1:n) #all plots (verifier and project) all.plots.n <- rep(0,n) for(i in 1:n){ all.plots.n[i] <- project.n + verifier.plot.number[i] } #running mean X_bar <- rep(1,n) for(i in 1:n){ X_bar[i]<- mean(verification.plots[1:i]) } #running sd sd2 <- NULL for(i in 2:n){ sd2[i]<-sd(verification.plots[1:i]) } #Tn Tn<-NULL for(i in 2:n){ Tn[i] <- project.mean-X_bar[i] } #n_Threshold n_thresh<-NULL for(i in 2:n){ n_thresh[i] <- (((a^2)/(d^2))*((project.sd^2)+(sd2[i]^2))) } #Upper upper <- NULL for (i in 2:n){ upper[i] <- Tn[i]-d } #Lower lower<- NULL for(i in 2:n){ lower[i] <- Tn[i]+d } #Success.fail success.fail <- NULL success.fail.num <- rep(0,n) if(abs(verification.plots[1]-project.mean)<(allowed.deviation*project.mean)){ success.fail[1] <- "Pass" success.fail.num[1] <- 1 } else { success.fail[1] <- "Fail" success.fail.num[1] <-0 } for(i in 2:n){ if(all.plots.n[i]>=n_thresh[i]){ if(upper[i] <= 0){ if(lower[i] >= 0){ success.fail[i]<-"Pass" success.fail.num[i]<-1 } else { success.fail[i] <- "Fail" success.fail.num[i] <-0} } else { success.fail[i] <- "Fail" success.fail.num[i] <- 0 } } else { success.fail[i] <- "Inconclusive" success.fail.num[i] <- 0 } } out.unpaired.test <- data.frame(success.fail, success.fail.num) } out.unpaired.test<-unpaired.test(verification.plots=verification.plots, project.n=30, project.mean=100, project.sd=10, allowed.deviation=.05, project.acres=99) out.unpaired.test # sequential.unpaired<- function(number.strata, project.n, project.mean, project.sd, verification.plots,allowed.deviation, project.acres, min.plots="default", alpha=.05){ if(min.plots=="default"){ min.plots<-NULL # minimum passing if(number.strata == 1 & project.acres <= 100){ min.plots = 8 } else if(number.strata == 1 & project.acres > 100 & project.acres < 500){ min.plots = 12 } else if(number.strata == 1 & project.acres > 500 & project.acres < 5000){ min.plots = 16 } else if(number.strata == 1 & project.acres > 5000 & project.acres < 10000){ min.plots = 20 } else if(number.strata == 1 & project.acres > 10000){ min.plots = 24 } else if(number.strata == 2 & project.acres < 100){ min.plots = 4 } else if(number.strata == 2 & project.acres > 100 & project.acres < 500){ min.plots = 6 } else if(number.strata == 2 & project.acres > 501 & project.acres < 5000){ min.plots = 8 } else if(number.strata == 2 & project.acres > 5001 & project.acres < 10000){ min.plots = 10 } else if(number.strata == 2 & project.acres > 10000){ min.plots = 12 } else if(number.strata == 3 & project.acres < 100){ min.plots = 2 } else if(number.strata == 3 & project.acres > 100 & project.acres < 500){ min.plots = 3 } else if(number.strata == 3 & project.acres > 501 & project.acres < 5000){ min.plots = 4 } else if(number.strata == 3 & project.acres > 5001 & project.acres < 10000){ min.plots = 5 } else if(number.strata == 3 & project.acres > 10000){ min.plots = 6 } else {min.plots=NULL} } else (min.plots = min.plots) if(number.strata > 3){print("max three strata")} if(number.strata > 3){break} p <- length(verification.plots) mp <- min.plots run <- unpaired.test(verification.plots, project.n, project.mean, project.sd, allowed.deviation, project.acres, alpha=.05) number.passing.plots <- function(verification.plots, success.fail.num){ n<-length(verification.plots) number.passing<-rep(0,n) number.passing[1]<-ifelse(sum(success.fail.num[1]> mp),1,0) for(i in 2:n){ if(success.fail.num[i] == 1){ v <- i-1 number.passing[i]<-number.passing[v]+1 } else{number.passing[i] <- 0} } number.passing } number.passing<-number.passing.plots(verification.plots=verification.plots, success.fail.num=run$success.fail.num) sucessful.unsucessful<-rep("blank",p) one.zero <- rep(0,p) result <- "blank" resultL <- 0 n <-length(verification.plots) for(i in 1:n){ if(number.passing[i] >= mp) { sucessful.unsucessful[i] <- "successful" one.zero[i] <- 1 } else { sucessful.unsucessful[i]<- "unsuccessful" one.zero[i] <- 0 } } if(sum(one.zero) > 0 ) { result <- "successful" resultL <- 1 } plots.success <- function(one.zero){ plots.to.success<-NULL for(i in 1:n){ if(sum(one.zero)==0){plots.to.success= NA } else if(one.zero[i]==1){plots.to.success<-append(plots.to.success, i)} } plots.to.success<-min(plots.to.success) plots.to.success } plots.to.success <- plots.success(one.zero=one.zero) complete <-data.frame (min.plots, number.passing, sucessful.unsucessful, one.zero) collapsed <- data.frame(result, resultL, plots.to.success) out <- c(as.list(complete),as.list(collapsed)) } unpaired.out<-sequential.unpaired(number.strata=2, verification.plots=verification.plots, project.n=15, project.mean=100, project.sd=10, allowed.deviation=.05, project.acres=99, min.plots="default") str(unpaired.out) #unpaired.out[[7]][1] ## simulation.unpaired <- function(reps, project.acreage.range = c(99, 110,510,5100,11000), project.mean, project.n.min, project.n.max, project.sd.min, project.sd.max, verification.mean.min, verification.mean.max, number.verification.plots.min, number.verification.plots.max, allowed.deviation=.1, alpha=.05, length.out = 5, min.plots="default") { number.strata.range<-c(1:3) # set by CAR project.n.range <- seq(project.n.min, project.n.max, length.out=length.out) project.sd.range <- seq(project.sd.min, project.sd.max, length.out=length.out) # assume verification sd is the same as the project sd to simplify - seems a reasonable simpification number.verification.plots<- seq(number.verification.plots.min, number.verification.plots.max, length.out=length.out) verification.range <- seq(verification.mean.min, verification.mean.max, length.out=length.out) permut.grid<-expand.grid(number.strata.range, project.n.range, project.acreage.range, project.mean, project.sd.range, number.verification.plots, verification.range, allowed.deviation) # create a matrix with all combinations of the supplied vectors #assign names to the colums of the grid of combinations names.permut<-c("number.strata", "project.n.plots", "project.acreage", "project.mean", "project.sd", "number.verification.plots", "verification.mean", "allowed.deviation") names(permut.grid)<-names.permut # done combinations<-length(permut.grid[,1]) size <-reps*combinations #need to know the size of the master matrix, which is the the number of replications * each combination of the supplied factors # we want a df from which to read all the data into the simulation, and record the results permut.col<-ncol(permut.grid) col.base<-ncol(permut.grid)+2 base <- matrix(nrow=size, ncol=col.base) base <-data.frame(base) # supply the names names.base<-c("number.strata", "project.n.plots", "project.acreage", "project.mean", "project.sd", "number.verification.plots", "verification.mean", "allowed.deviation","success.fail", "plots.to.success") names(base)<-names.base # need to create index vectors for the base, master df ends <- seq(reps+1, size+1, by=reps) begins <- ends-reps index <- cbind(begins, ends-1) #done # next, need to assign the first 6 columns and number of rows = to the number of reps in the simulation to be the given row in the permut.grid matrix pb <- winProgressBar(title="Create base, big loop 1 of 2", label="0% done", min=0, max=100, initial=0) for (i in 1:combinations) { base[index[i,1]:index[i,2],1:permut.col] <- permut.grid[i,] #progress bar info <- sprintf("%d%% done", round((i/combinations)*100)) setWinProgressBar(pb, (i/combinations)*100, label=info) } close(pb) # now, simply feed the values replicated the number of times we want to run the simulation into the sequential.unpaired function, and assign the values to the appropriate columns out.index1<-ncol(permut.grid)+1 out.index2<-ncol(permut.grid)+2 #progress bar pb <- winProgressBar(title="fill values, big loop 2 of 2", label="0% done", min=0, max=100, initial=0) for (i in 1:size){ scalar.base <- base[i,] verification.plots <- rnorm(scalar.base$number.verification.plots, scalar.base$verification.mean, scalar.base$project.sd) result<- sequential.unpaired(scalar.base$number.strata, scalar.base$project.n.plots, scalar.base$project.mean, scalar.base$ project.sd, verification.plots, scalar.base$allowed.deviation, scalar.base$project.acreage, min.plots='default', alpha) base[i,out.index1] <- result[[6]][1] base[i,out.index2] <- result[[7]][1] info <- sprintf("%d%% done", round((i/size)*100)) setWinProgressBar(pb, (i/size)*100, label=info) } close(pb) #return the results return(base) } # I would like reps to = 1000, but that takes a really long time right now test5 <- simulation.unpaired(reps=5, project.acreage.range = c(99, 110,510,5100,11000), project.mean=100, project.n.min=10, project.n.max=100, project.sd.min=.01, project.sd.max=.2, verification.mean.min=100, verification.mean.max=130, number.verification.plots.min=10, number.verification.plots.max=50, length.out = 5) [[alternative HTML version deleted]]
Anindya Sankar Dey
2013-Feb-12 06:01 UTC
[R] improving/speeding up a very large, slow simulation
Hi, I've just read your intro, not the full code. But one way first divide your input data into parts and make a list where each element of the list is one part of your input data. Rest of code put in a function, which will work on a part of your data. Build the function such that it takes list as input. At the beginning of the function unlist your function variable, and then convert to required data frame. Then use mclapply of multicore/parallel package and you'll get results much faster. Well this is the way I'd have tried, but there are other experts who can give a better way to do it. Regards, Anindya On Tue, Feb 12, 2013 at 10:52 AM, Benjamin Caldwell <btcaldwell@berkeley.edu> wrote:> Dear R help; > > I'll preface this by saying that the example I've provided below is pretty > long, turgid, and otherwise a deep dive into a series of functions I wrote > for a simulation study. It is, however, reproducible and self-contained. > > I'm trying to do my first simulation study that's quite big, and so I'll > say that the output of this simulation as I'd like it to be is 9.375 > million rows (9375 combinations of the inputs * 1000). So, first thing, > maybe I should try to cut down on the number of dimensions and do it a > piece at a time. If that's a conclusion people come to here, I'll look into > paring down/doing it a chunk at a time. > > I'm hoping, though, that this is an opportunity for me to learn some more > efficient, elegant ways to a. vectorize and b. use the apply functions, > which still seem to elude me when it comes to their use with more complex, > custom functions that have multiple inputs. > > If having looked the below over you can give me suggestions to help make > this and future code I write run more quickly/efficiently, I will be > most grateful, as as this is currently written it would take what looks > like days to run a thousand simulations of each possible combination of > variables of interest. > > Best > > Ben Caldwell > > ----------------------------------------------- > #unpaired > verification.plots<-rnorm(30, 100, 10) > # writeClipboard(as.character(verification.plots)) > > unpaired.test<- function(verification.plots, project.n, project.mean, > project.sd, allowed.deviation, project.acres, alpha=.05){ > > verification.plots <-as.numeric(as.character(verification.plots)) > a <- qnorm(alpha/2) > d <- alpha*project.mean > > # verifier plot number > > n<-length(verification.plots) > verifier.plot.number <- c(1:n) > > > #all plots (verifier and project) > > all.plots.n <- rep(0,n) > for(i in 1:n){ > all.plots.n[i] <- project.n + verifier.plot.number[i] > } > > #running mean > X_bar <- rep(1,n) > > for(i in 1:n){ > X_bar[i]<- mean(verification.plots[1:i]) > } > #running sd > sd2 <- NULL > > for(i in 2:n){ > sd2[i]<-sd(verification.plots[1:i]) > } > #Tn > Tn<-NULL > > for(i in 2:n){ > Tn[i] <- project.mean-X_bar[i] > } > #n_Threshold > n_thresh<-NULL > > for(i in 2:n){ > n_thresh[i] <- (((a^2)/(d^2))*((project.sd^2)+(sd2[i]^2))) > } > #Upper > upper <- NULL > for (i in 2:n){ > upper[i] <- Tn[i]-d > } > #Lower > lower<- NULL > for(i in 2:n){ > lower[i] <- Tn[i]+d > } > > #Success.fail > success.fail <- NULL > > > > success.fail.num <- rep(0,n) > > if(abs(verification.plots[1]-project.mean)<(allowed.deviation*project.mean)){ > success.fail[1] <- "Pass" > success.fail.num[1] <- 1 > } else { > success.fail[1] <- "Fail" > success.fail.num[1] <-0 > } > for(i in 2:n){ > if(all.plots.n[i]>=n_thresh[i]){ > if(upper[i] <= 0){ > if(lower[i] >= 0){ > success.fail[i]<-"Pass" > success.fail.num[i]<-1 > } else { > success.fail[i] <- "Fail" > success.fail.num[i] <-0} > } else { > success.fail[i] <- "Fail" > success.fail.num[i] <- 0 > } > > } else { > success.fail[i] <- "Inconclusive" > success.fail.num[i] <- 0 > } > } > > out.unpaired.test <- data.frame(success.fail, success.fail.num) > } > > > out.unpaired.test<-unpaired.test(verification.plots=verification.plots, > project.n=30, project.mean=100, project.sd=10, allowed.deviation=.05, > project.acres=99) > out.unpaired.test > # > sequential.unpaired<- function(number.strata, project.n, project.mean, > project.sd, verification.plots,allowed.deviation, project.acres, > min.plots="default", alpha=.05){ > > if(min.plots=="default"){ > min.plots<-NULL # minimum passing > if(number.strata == 1 & project.acres <= 100){ > min.plots = 8 > } else if(number.strata == 1 & project.acres > 100 & project.acres < 500){ > min.plots = 12 > } else if(number.strata == 1 & project.acres > 500 & project.acres < > 5000){ > min.plots = 16 > } else if(number.strata == 1 & project.acres > 5000 & project.acres < > 10000){ > min.plots = 20 > } else if(number.strata == 1 & project.acres > 10000){ > min.plots = 24 > } else if(number.strata == 2 & project.acres < 100){ > min.plots = 4 > } else if(number.strata == 2 & project.acres > 100 & project.acres < 500){ > min.plots = 6 > } else if(number.strata == 2 & project.acres > 501 & project.acres < 5000){ > min.plots = 8 > } else if(number.strata == 2 & project.acres > 5001 & project.acres < > 10000){ > min.plots = 10 > } else if(number.strata == 2 & project.acres > 10000){ > min.plots = 12 > } else if(number.strata == 3 & project.acres < 100){ > min.plots = 2 > } else if(number.strata == 3 & project.acres > 100 & project.acres < > 500){ > min.plots = 3 > } else if(number.strata == 3 & project.acres > 501 & project.acres < 5000){ > min.plots = 4 > } else if(number.strata == 3 & project.acres > 5001 & project.acres < > 10000){ > min.plots = 5 > } else if(number.strata == 3 & project.acres > 10000){ > min.plots = 6 > } else {min.plots=NULL} > } else (min.plots = min.plots) > > > if(number.strata > 3){print("max three strata")} > if(number.strata > 3){break} > > > p <- length(verification.plots) > mp <- min.plots > run <- unpaired.test(verification.plots, project.n, project.mean, > project.sd, > allowed.deviation, project.acres, alpha=.05) > number.passing.plots <- function(verification.plots, success.fail.num){ > n<-length(verification.plots) > number.passing<-rep(0,n) > number.passing[1]<-ifelse(sum(success.fail.num[1]> mp),1,0) > for(i in 2:n){ > if(success.fail.num[i] == 1){ > v <- i-1 > number.passing[i]<-number.passing[v]+1 > } else{number.passing[i] <- 0} > } > number.passing > } > number.passing<-number.passing.plots(verification.plots=verification.plots, > success.fail.num=run$success.fail.num) > sucessful.unsucessful<-rep("blank",p) > one.zero <- rep(0,p) > result <- "blank" > resultL <- 0 > n <-length(verification.plots) > for(i in 1:n){ > if(number.passing[i] >= mp) { > sucessful.unsucessful[i] <- "successful" > one.zero[i] <- 1 > } else { > sucessful.unsucessful[i]<- "unsuccessful" > one.zero[i] <- 0 > } > } > > if(sum(one.zero) > 0 ) { > result <- "successful" > resultL <- 1 > } > > plots.success <- function(one.zero){ > > plots.to.success<-NULL > > for(i in 1:n){ > if(sum(one.zero)==0){plots.to.success= NA > } else if(one.zero[i]==1){plots.to.success<-append(plots.to.success, i)} > } > plots.to.success<-min(plots.to.success) > plots.to.success > } > > plots.to.success <- plots.success(one.zero=one.zero) > > complete <-data.frame (min.plots, number.passing, sucessful.unsucessful, > one.zero) > collapsed <- data.frame(result, resultL, plots.to.success) > out <- c(as.list(complete),as.list(collapsed)) > } > > > unpaired.out<-sequential.unpaired(number.strata=2, > verification.plots=verification.plots, project.n=15, project.mean=100, > project.sd=10, allowed.deviation=.05, project.acres=99, > min.plots="default") > > str(unpaired.out) > #unpaired.out[[7]][1] > > ## > > simulation.unpaired <- function(reps, project.acreage.range = c(99, > 110,510,5100,11000), project.mean, project.n.min, project.n.max, > project.sd.min, project.sd.max, verification.mean.min, > verification.mean.max, number.verification.plots.min, > number.verification.plots.max, allowed.deviation=.1, alpha=.05, length.out > = 5, min.plots="default") { > > number.strata.range<-c(1:3) # set by CAR > > project.n.range <- seq(project.n.min, project.n.max, length.out=length.out) > > project.sd.range <- seq(project.sd.min, project.sd.max, > length.out=length.out) # assume verification sd is the same as the project > sd to simplify - seems a reasonable simpification > > number.verification.plots<- seq(number.verification.plots.min, > number.verification.plots.max, length.out=length.out) > > verification.range <- seq(verification.mean.min, verification.mean.max, > length.out=length.out) > > permut.grid<-expand.grid(number.strata.range, project.n.range, > project.acreage.range, project.mean, project.sd.range, > number.verification.plots, verification.range, allowed.deviation) # create > a matrix with all combinations of the supplied vectors > > #assign names to the colums of the grid of combinations > names.permut<-c("number.strata", "project.n.plots", "project.acreage", > "project.mean", "project.sd", "number.verification.plots", > "verification.mean", "allowed.deviation") > > names(permut.grid)<-names.permut # done > > combinations<-length(permut.grid[,1]) > > size <-reps*combinations #need to know the size of the master matrix, which > is the the number of replications * each combination of the supplied > factors > > # we want a df from which to read all the data into the simulation, and > record the results > permut.col<-ncol(permut.grid) > col.base<-ncol(permut.grid)+2 > base <- matrix(nrow=size, ncol=col.base) > base <-data.frame(base) > > # supply the names > names.base<-c("number.strata", "project.n.plots", "project.acreage", > "project.mean", "project.sd", "number.verification.plots", > "verification.mean", "allowed.deviation","success.fail", > "plots.to.success") > > names(base)<-names.base > > # need to create index vectors for the base, master df > ends <- seq(reps+1, size+1, by=reps) > begins <- ends-reps > index <- cbind(begins, ends-1) > #done > > # next, need to assign the first 6 columns and number of rows = to the > number of reps in the simulation to be the given row in the permut.grid > matrix > > pb <- winProgressBar(title="Create base, big loop 1 of 2", label="0% done", > min=0, max=100, initial=0) > > for (i in 1:combinations) { > > base[index[i,1]:index[i,2],1:permut.col] <- permut.grid[i,] > #progress bar > info <- sprintf("%d%% done", round((i/combinations)*100)) > setWinProgressBar(pb, (i/combinations)*100, label=info) > } > > close(pb) > > # now, simply feed the values replicated the number of times we want to run > the simulation into the sequential.unpaired function, and assign the values > to the appropriate columns > > out.index1<-ncol(permut.grid)+1 > out.index2<-ncol(permut.grid)+2 > > #progress bar > pb <- winProgressBar(title="fill values, big loop 2 of 2", label="0% done", > min=0, max=100, initial=0) > > for (i in 1:size){ > > scalar.base <- base[i,] > verification.plots <- rnorm(scalar.base$number.verification.plots, > scalar.base$verification.mean, scalar.base$project.sd) > result<- sequential.unpaired(scalar.base$number.strata, > scalar.base$project.n.plots, scalar.base$project.mean, scalar.base$ > project.sd, verification.plots, scalar.base$allowed.deviation, > scalar.base$project.acreage, min.plots='default', alpha) > > base[i,out.index1] <- result[[6]][1] > base[i,out.index2] <- result[[7]][1] > info <- sprintf("%d%% done", round((i/size)*100)) > setWinProgressBar(pb, (i/size)*100, label=info) > } > > close(pb) > #return the results > return(base) > > } > > # I would like reps to = 1000, but that takes a really long time right now > test5 <- simulation.unpaired(reps=5, project.acreage.range = c(99, > 110,510,5100,11000), project.mean=100, project.n.min=10, project.n.max=100, > project.sd.min=.01, project.sd.max=.2, verification.mean.min=100, > verification.mean.max=130, number.verification.plots.min=10, > number.verification.plots.max=50, length.out = 5) > > [[alternative HTML version deleted]] > > ______________________________________________ > R-help@r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide > http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code. >-- Anindya Sankar Dey [[alternative HTML version deleted]]
Joshua Wiley
2013-Feb-12 06:10 UTC
[R] improving/speeding up a very large, slow simulation
Hi Ben, I appreciate that the example is reproducible, and I understand why you shared the entire project. At the same time, that is an awful lot of code for volunteers to go through and try to optimize. I would try to think of the structure of your task, see what the individual pieces are---figure out what can be reused and optimized. Look at loops and try to think whether some operation performed on every element of the vector at once could do the same thing. Sometimes the next iteration of a loop depends on the previous, so it is difficult to vectorize, but often that is not the case. Make use of the compiler package, especially cmpfun(). It can give fairly substantial speedups, particularly with for loops in R. Finally if you want 1000 reps, do you actually need to repeat all the steps 1000 times, or could you just simulate 1000x the data? If the latter, do the steps once but make more data. If the former, you ought to be able to parallelize it fairly easily, see the parallel package. Good luck, Josh On Mon, Feb 11, 2013 at 9:22 PM, Benjamin Caldwell <btcaldwell at berkeley.edu> wrote:> Dear R help; > > I'll preface this by saying that the example I've provided below is pretty > long, turgid, and otherwise a deep dive into a series of functions I wrote > for a simulation study. It is, however, reproducible and self-contained. > > I'm trying to do my first simulation study that's quite big, and so I'll > say that the output of this simulation as I'd like it to be is 9.375 > million rows (9375 combinations of the inputs * 1000). So, first thing, > maybe I should try to cut down on the number of dimensions and do it a > piece at a time. If that's a conclusion people come to here, I'll look into > paring down/doing it a chunk at a time. > > I'm hoping, though, that this is an opportunity for me to learn some more > efficient, elegant ways to a. vectorize and b. use the apply functions, > which still seem to elude me when it comes to their use with more complex, > custom functions that have multiple inputs. > > If having looked the below over you can give me suggestions to help make > this and future code I write run more quickly/efficiently, I will be > most grateful, as as this is currently written it would take what looks > like days to run a thousand simulations of each possible combination of > variables of interest. > > Best > > Ben Caldwell > > ----------------------------------------------- > #unpaired > verification.plots<-rnorm(30, 100, 10) > # writeClipboard(as.character(verification.plots)) > > unpaired.test<- function(verification.plots, project.n, project.mean, > project.sd, allowed.deviation, project.acres, alpha=.05){ > > verification.plots <-as.numeric(as.character(verification.plots)) > a <- qnorm(alpha/2) > d <- alpha*project.mean > > # verifier plot number > > n<-length(verification.plots) > verifier.plot.number <- c(1:n) > > > #all plots (verifier and project) > > all.plots.n <- rep(0,n) > for(i in 1:n){ > all.plots.n[i] <- project.n + verifier.plot.number[i] > } > > #running mean > X_bar <- rep(1,n) > > for(i in 1:n){ > X_bar[i]<- mean(verification.plots[1:i]) > } > #running sd > sd2 <- NULL > > for(i in 2:n){ > sd2[i]<-sd(verification.plots[1:i]) > } > #Tn > Tn<-NULL > > for(i in 2:n){ > Tn[i] <- project.mean-X_bar[i] > } > #n_Threshold > n_thresh<-NULL > > for(i in 2:n){ > n_thresh[i] <- (((a^2)/(d^2))*((project.sd^2)+(sd2[i]^2))) > } > #Upper > upper <- NULL > for (i in 2:n){ > upper[i] <- Tn[i]-d > } > #Lower > lower<- NULL > for(i in 2:n){ > lower[i] <- Tn[i]+d > } > > #Success.fail > success.fail <- NULL > > > > success.fail.num <- rep(0,n) > if(abs(verification.plots[1]-project.mean)<(allowed.deviation*project.mean)){ > success.fail[1] <- "Pass" > success.fail.num[1] <- 1 > } else { > success.fail[1] <- "Fail" > success.fail.num[1] <-0 > } > for(i in 2:n){ > if(all.plots.n[i]>=n_thresh[i]){ > if(upper[i] <= 0){ > if(lower[i] >= 0){ > success.fail[i]<-"Pass" > success.fail.num[i]<-1 > } else { > success.fail[i] <- "Fail" > success.fail.num[i] <-0} > } else { > success.fail[i] <- "Fail" > success.fail.num[i] <- 0 > } > > } else { > success.fail[i] <- "Inconclusive" > success.fail.num[i] <- 0 > } > } > > out.unpaired.test <- data.frame(success.fail, success.fail.num) > } > > > out.unpaired.test<-unpaired.test(verification.plots=verification.plots, > project.n=30, project.mean=100, project.sd=10, allowed.deviation=.05, > project.acres=99) > out.unpaired.test > # > sequential.unpaired<- function(number.strata, project.n, project.mean, > project.sd, verification.plots,allowed.deviation, project.acres, > min.plots="default", alpha=.05){ > > if(min.plots=="default"){ > min.plots<-NULL # minimum passing > if(number.strata == 1 & project.acres <= 100){ > min.plots = 8 > } else if(number.strata == 1 & project.acres > 100 & project.acres < 500){ > min.plots = 12 > } else if(number.strata == 1 & project.acres > 500 & project.acres < 5000){ > min.plots = 16 > } else if(number.strata == 1 & project.acres > 5000 & project.acres < > 10000){ > min.plots = 20 > } else if(number.strata == 1 & project.acres > 10000){ > min.plots = 24 > } else if(number.strata == 2 & project.acres < 100){ > min.plots = 4 > } else if(number.strata == 2 & project.acres > 100 & project.acres < 500){ > min.plots = 6 > } else if(number.strata == 2 & project.acres > 501 & project.acres < 5000){ > min.plots = 8 > } else if(number.strata == 2 & project.acres > 5001 & project.acres < > 10000){ > min.plots = 10 > } else if(number.strata == 2 & project.acres > 10000){ > min.plots = 12 > } else if(number.strata == 3 & project.acres < 100){ > min.plots = 2 > } else if(number.strata == 3 & project.acres > 100 & project.acres < > 500){ > min.plots = 3 > } else if(number.strata == 3 & project.acres > 501 & project.acres < 5000){ > min.plots = 4 > } else if(number.strata == 3 & project.acres > 5001 & project.acres < > 10000){ > min.plots = 5 > } else if(number.strata == 3 & project.acres > 10000){ > min.plots = 6 > } else {min.plots=NULL} > } else (min.plots = min.plots) > > > if(number.strata > 3){print("max three strata")} > if(number.strata > 3){break} > > > p <- length(verification.plots) > mp <- min.plots > run <- unpaired.test(verification.plots, project.n, project.mean, project.sd, > allowed.deviation, project.acres, alpha=.05) > number.passing.plots <- function(verification.plots, success.fail.num){ > n<-length(verification.plots) > number.passing<-rep(0,n) > number.passing[1]<-ifelse(sum(success.fail.num[1]> mp),1,0) > for(i in 2:n){ > if(success.fail.num[i] == 1){ > v <- i-1 > number.passing[i]<-number.passing[v]+1 > } else{number.passing[i] <- 0} > } > number.passing > } > number.passing<-number.passing.plots(verification.plots=verification.plots, > success.fail.num=run$success.fail.num) > sucessful.unsucessful<-rep("blank",p) > one.zero <- rep(0,p) > result <- "blank" > resultL <- 0 > n <-length(verification.plots) > for(i in 1:n){ > if(number.passing[i] >= mp) { > sucessful.unsucessful[i] <- "successful" > one.zero[i] <- 1 > } else { > sucessful.unsucessful[i]<- "unsuccessful" > one.zero[i] <- 0 > } > } > > if(sum(one.zero) > 0 ) { > result <- "successful" > resultL <- 1 > } > > plots.success <- function(one.zero){ > > plots.to.success<-NULL > > for(i in 1:n){ > if(sum(one.zero)==0){plots.to.success= NA > } else if(one.zero[i]==1){plots.to.success<-append(plots.to.success, i)} > } > plots.to.success<-min(plots.to.success) > plots.to.success > } > > plots.to.success <- plots.success(one.zero=one.zero) > > complete <-data.frame (min.plots, number.passing, sucessful.unsucessful, > one.zero) > collapsed <- data.frame(result, resultL, plots.to.success) > out <- c(as.list(complete),as.list(collapsed)) > } > > > unpaired.out<-sequential.unpaired(number.strata=2, > verification.plots=verification.plots, project.n=15, project.mean=100, > project.sd=10, allowed.deviation=.05, project.acres=99, min.plots="default") > > str(unpaired.out) > #unpaired.out[[7]][1] > > ## > > simulation.unpaired <- function(reps, project.acreage.range = c(99, > 110,510,5100,11000), project.mean, project.n.min, project.n.max, > project.sd.min, project.sd.max, verification.mean.min, > verification.mean.max, number.verification.plots.min, > number.verification.plots.max, allowed.deviation=.1, alpha=.05, length.out > = 5, min.plots="default") { > > number.strata.range<-c(1:3) # set by CAR > > project.n.range <- seq(project.n.min, project.n.max, length.out=length.out) > > project.sd.range <- seq(project.sd.min, project.sd.max, > length.out=length.out) # assume verification sd is the same as the project > sd to simplify - seems a reasonable simpification > > number.verification.plots<- seq(number.verification.plots.min, > number.verification.plots.max, length.out=length.out) > > verification.range <- seq(verification.mean.min, verification.mean.max, > length.out=length.out) > > permut.grid<-expand.grid(number.strata.range, project.n.range, > project.acreage.range, project.mean, project.sd.range, > number.verification.plots, verification.range, allowed.deviation) # create > a matrix with all combinations of the supplied vectors > > #assign names to the colums of the grid of combinations > names.permut<-c("number.strata", "project.n.plots", "project.acreage", > "project.mean", "project.sd", "number.verification.plots", > "verification.mean", "allowed.deviation") > > names(permut.grid)<-names.permut # done > > combinations<-length(permut.grid[,1]) > > size <-reps*combinations #need to know the size of the master matrix, which > is the the number of replications * each combination of the supplied factors > > # we want a df from which to read all the data into the simulation, and > record the results > permut.col<-ncol(permut.grid) > col.base<-ncol(permut.grid)+2 > base <- matrix(nrow=size, ncol=col.base) > base <-data.frame(base) > > # supply the names > names.base<-c("number.strata", "project.n.plots", "project.acreage", > "project.mean", "project.sd", "number.verification.plots", > "verification.mean", "allowed.deviation","success.fail", "plots.to.success") > > names(base)<-names.base > > # need to create index vectors for the base, master df > ends <- seq(reps+1, size+1, by=reps) > begins <- ends-reps > index <- cbind(begins, ends-1) > #done > > # next, need to assign the first 6 columns and number of rows = to the > number of reps in the simulation to be the given row in the permut.grid > matrix > > pb <- winProgressBar(title="Create base, big loop 1 of 2", label="0% done", > min=0, max=100, initial=0) > > for (i in 1:combinations) { > > base[index[i,1]:index[i,2],1:permut.col] <- permut.grid[i,] > #progress bar > info <- sprintf("%d%% done", round((i/combinations)*100)) > setWinProgressBar(pb, (i/combinations)*100, label=info) > } > > close(pb) > > # now, simply feed the values replicated the number of times we want to run > the simulation into the sequential.unpaired function, and assign the values > to the appropriate columns > > out.index1<-ncol(permut.grid)+1 > out.index2<-ncol(permut.grid)+2 > > #progress bar > pb <- winProgressBar(title="fill values, big loop 2 of 2", label="0% done", > min=0, max=100, initial=0) > > for (i in 1:size){ > > scalar.base <- base[i,] > verification.plots <- rnorm(scalar.base$number.verification.plots, > scalar.base$verification.mean, scalar.base$project.sd) > result<- sequential.unpaired(scalar.base$number.strata, > scalar.base$project.n.plots, scalar.base$project.mean, scalar.base$ > project.sd, verification.plots, scalar.base$allowed.deviation, > scalar.base$project.acreage, min.plots='default', alpha) > > base[i,out.index1] <- result[[6]][1] > base[i,out.index2] <- result[[7]][1] > info <- sprintf("%d%% done", round((i/size)*100)) > setWinProgressBar(pb, (i/size)*100, label=info) > } > > close(pb) > #return the results > return(base) > > } > > # I would like reps to = 1000, but that takes a really long time right now > test5 <- simulation.unpaired(reps=5, project.acreage.range = c(99, > 110,510,5100,11000), project.mean=100, project.n.min=10, project.n.max=100, > project.sd.min=.01, project.sd.max=.2, verification.mean.min=100, > verification.mean.max=130, number.verification.plots.min=10, > number.verification.plots.max=50, length.out = 5) > > [[alternative HTML version deleted]] > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.-- Joshua Wiley Ph.D. Student, Health Psychology Programmer Analyst II, Statistical Consulting Group University of California, Los Angeles https://joshuawiley.com/
Benjamin Caldwell
2013-Feb-13 23:33 UTC
[R] improving/speeding up a very large, slow simulation
Joshua, Thanks for the example, and the explanation. Nice article that you wrote - the figures alone deserve a deeper dive, I think. As side note, I found out about profiling on another post and did it - it appears that my ignorance about how slow dataframes can be was the main problem. After turning the dfs into lists and matrixes, the improvement was huge! Wish I had known about this issue ahead of time - it must be somewhere in the R help books in big red letters, but if it isn't it should be. Seems most of the discussions about speeding up code talks about vectorization and avoiding fragmentation. The below was for five iterations of the simulation, and I imagine was the reason it was hanging up and stalling out for large simulations: Original: $by.total total.time total.pct self.time self.pct simulation.unpaired.c 2857.54 99.97 111.90 3.91 [<- 2555.74 89.41 2.78 0.10 [<-.data.frame 2552.96 89.32 2543.84 89.00 sequential.unpaired.c 149.48 5.23 2.66 0.09 unpaired.test.c 92.30 3.23 18.80 0.66 data.frame 70.72 2.47 9.68 0.34 as.data.frame 42.36 1.48 3.74 0.132 only using dataframe as the output: $by.total total.time total.pct self.time self.pct simulation.unpaired.c 109.14 100.00 0.90 0.82 sequential.unpaired.c 92.16 84.44 3.28 3.01 unpaired.test.c 84.48 77.41 21.08 19.31 sd 43.68 40.02 5.42 4.97 Thanks again folks. On Tue, Feb 12, 2013 at 9:45 PM, Joshua Wiley <jwiley.psych@gmail.com>wrote:> Hi Ben, > > That is correct about vectorizing----to some speed tests to see the > effects. They can be quite dramatic (like I have easily seen 300fold > speedups from doing that). The apply functions are essentially > loops---they tend to be about the same speed as loops, though slightly > less code. > > Compiler is nice---not it will not help already vectorized code > (because vectorized code really just means code that is linked to C > code that uses a compiled loop, but C is much faster than R. > > Sure, check out the bootstrapping (midway down) section on this page I > wrote: http://www.ats.ucla.edu/stat/r/dae/melogit.htm > > Cheers, > > Josh > > On Tue, Feb 12, 2013 at 12:57 PM, Benjamin Caldwell > <btcaldwell@berkeley.edu> wrote: > > Joshua, > > > > So, to your first suggestion - try to figure out whether some operation > > performed on every element of the vector at once could do the same thing > - > > the answer is yes! Where can I see examples of that implemented? > > > > e.g. would it just be > > > > a <- 1:10000 > > b <- 1:10000 > > c <- 1:10000 > > d <- data.frame(b,c) > > > > vectorize<-function(a, d) { > > > > f <- d[,1]+d[,2]+a > > f > > } > > > > out<-vectorize(a=a,d=d) > > out > > > > vs > > > > loop<- function(a,d){ > > > > for(i in 1:length(d[,1])){ > > a[i]<-d[i,1]+d[i,2]+a[i] > > } > > a > > } > > > > out.l<-loop(a,d) > > > > out.l > > > > If so, it's vectorization that's the big advantage, not something > mysterious > > that's happening under the hood with the apply functions? > > > > To your second : Compiler - thanks for the tip, that's great to know! > > > > To your last, could you please give an example or point me to one that > was > > useful for you? I don't understand that well enough to use it. > > > > Thanks! > > > > Ben Caldwell > > > > Graduate Fellow > > University of California, Berkeley > > 130 Mulford Hall #3114 > > Berkeley, CA 94720 > > Office 223 Mulford Hall > > (510)859-3358 > > > > > > On Mon, Feb 11, 2013 at 10:10 PM, Joshua Wiley <jwiley.psych@gmail.com> > > wrote: > >> > >> Hi Ben, > >> > >> I appreciate that the example is reproducible, and I understand why > >> you shared the entire project. At the same time, that is an awful lot > >> of code for volunteers to go through and try to optimize. > >> > >> I would try to think of the structure of your task, see what the > >> individual pieces are---figure out what can be reused and optimized. > >> Look at loops and try to think whether some operation performed on > >> every element of the vector at once could do the same thing. > >> Sometimes the next iteration of a loop depends on the previous, so it > >> is difficult to vectorize, but often that is not the case. > >> > >> Make use of the compiler package, especially cmpfun(). It can give > >> fairly substantial speedups, particularly with for loops in R. > >> > >> Finally if you want 1000 reps, do you actually need to repeat all the > >> steps 1000 times, or could you just simulate 1000x the data? If the > >> latter, do the steps once but make more data. If the former, you > >> ought to be able to parallelize it fairly easily, see the parallel > >> package. > >> > >> Good luck, > >> > >> Josh > >> > >> > >> On Mon, Feb 11, 2013 at 9:22 PM, Benjamin Caldwell > >> <btcaldwell@berkeley.edu> wrote: > >> > Dear R help; > >> > > >> > I'll preface this by saying that the example I've provided below is > >> > pretty > >> > long, turgid, and otherwise a deep dive into a series of functions I > >> > wrote > >> > for a simulation study. It is, however, reproducible and > self-contained. > >> > > >> > I'm trying to do my first simulation study that's quite big, and so > I'll > >> > say that the output of this simulation as I'd like it to be is 9.375 > >> > million rows (9375 combinations of the inputs * 1000). So, first > thing, > >> > maybe I should try to cut down on the number of dimensions and do it a > >> > piece at a time. If that's a conclusion people come to here, I'll look > >> > into > >> > paring down/doing it a chunk at a time. > >> > > >> > I'm hoping, though, that this is an opportunity for me to learn some > >> > more > >> > efficient, elegant ways to a. vectorize and b. use the apply > functions, > >> > which still seem to elude me when it comes to their use with more > >> > complex, > >> > custom functions that have multiple inputs. > >> > > >> > If having looked the below over you can give me suggestions to help > make > >> > this and future code I write run more quickly/efficiently, I will be > >> > most grateful, as as this is currently written it would take what > looks > >> > like days to run a thousand simulations of each possible combination > of > >> > variables of interest. > >> > > >> > Best > >> > > >> > Ben Caldwell > >> > > >> > ----------------------------------------------- > >> > #unpaired > >> > verification.plots<-rnorm(30, 100, 10) > >> > # writeClipboard(as.character(verification.plots)) > >> > > >> > unpaired.test<- function(verification.plots, project.n, project.mean, > >> > project.sd, allowed.deviation, project.acres, alpha=.05){ > >> > > >> > verification.plots <-as.numeric(as.character(verification.plots)) > >> > a <- qnorm(alpha/2) > >> > d <- alpha*project.mean > >> > > >> > # verifier plot number > >> > > >> > n<-length(verification.plots) > >> > verifier.plot.number <- c(1:n) > >> > > >> > > >> > #all plots (verifier and project) > >> > > >> > all.plots.n <- rep(0,n) > >> > for(i in 1:n){ > >> > all.plots.n[i] <- project.n + verifier.plot.number[i] > >> > } > >> > > >> > #running mean > >> > X_bar <- rep(1,n) > >> > > >> > for(i in 1:n){ > >> > X_bar[i]<- mean(verification.plots[1:i]) > >> > } > >> > #running sd > >> > sd2 <- NULL > >> > > >> > for(i in 2:n){ > >> > sd2[i]<-sd(verification.plots[1:i]) > >> > } > >> > #Tn > >> > Tn<-NULL > >> > > >> > for(i in 2:n){ > >> > Tn[i] <- project.mean-X_bar[i] > >> > } > >> > #n_Threshold > >> > n_thresh<-NULL > >> > > >> > for(i in 2:n){ > >> > n_thresh[i] <- (((a^2)/(d^2))*((project.sd^2)+(sd2[i]^2))) > >> > } > >> > #Upper > >> > upper <- NULL > >> > for (i in 2:n){ > >> > upper[i] <- Tn[i]-d > >> > } > >> > #Lower > >> > lower<- NULL > >> > for(i in 2:n){ > >> > lower[i] <- Tn[i]+d > >> > } > >> > > >> > #Success.fail > >> > success.fail <- NULL > >> > > >> > > >> > > >> > success.fail.num <- rep(0,n) > >> > > >> > > if(abs(verification.plots[1]-project.mean)<(allowed.deviation*project.mean)){ > >> > success.fail[1] <- "Pass" > >> > success.fail.num[1] <- 1 > >> > } else { > >> > success.fail[1] <- "Fail" > >> > success.fail.num[1] <-0 > >> > } > >> > for(i in 2:n){ > >> > if(all.plots.n[i]>=n_thresh[i]){ > >> > if(upper[i] <= 0){ > >> > if(lower[i] >= 0){ > >> > success.fail[i]<-"Pass" > >> > success.fail.num[i]<-1 > >> > } else { > >> > success.fail[i] <- "Fail" > >> > success.fail.num[i] <-0} > >> > } else { > >> > success.fail[i] <- "Fail" > >> > success.fail.num[i] <- 0 > >> > } > >> > > >> > } else { > >> > success.fail[i] <- "Inconclusive" > >> > success.fail.num[i] <- 0 > >> > } > >> > } > >> > > >> > out.unpaired.test <- data.frame(success.fail, success.fail.num) > >> > } > >> > > >> > > >> > > out.unpaired.test<-unpaired.test(verification.plots=verification.plots, > >> > project.n=30, project.mean=100, project.sd=10, allowed.deviation=.05, > >> > project.acres=99) > >> > out.unpaired.test > >> > # > >> > sequential.unpaired<- function(number.strata, project.n, project.mean, > >> > project.sd, verification.plots,allowed.deviation, project.acres, > >> > min.plots="default", alpha=.05){ > >> > > >> > if(min.plots=="default"){ > >> > min.plots<-NULL # minimum passing > >> > if(number.strata == 1 & project.acres <= 100){ > >> > min.plots = 8 > >> > } else if(number.strata == 1 & project.acres > 100 & project.acres < > >> > 500){ > >> > min.plots = 12 > >> > } else if(number.strata == 1 & project.acres > 500 & project.acres < > >> > 5000){ > >> > min.plots = 16 > >> > } else if(number.strata == 1 & project.acres > 5000 & > project.acres > >> > < > >> > 10000){ > >> > min.plots = 20 > >> > } else if(number.strata == 1 & project.acres > 10000){ > >> > min.plots = 24 > >> > } else if(number.strata == 2 & project.acres < 100){ > >> > min.plots = 4 > >> > } else if(number.strata == 2 & project.acres > 100 & project.acres < > >> > 500){ > >> > min.plots = 6 > >> > } else if(number.strata == 2 & project.acres > 501 & project.acres < > >> > 5000){ > >> > min.plots = 8 > >> > } else if(number.strata == 2 & project.acres > 5001 & project.acres < > >> > 10000){ > >> > min.plots = 10 > >> > } else if(number.strata == 2 & project.acres > 10000){ > >> > min.plots = 12 > >> > } else if(number.strata == 3 & project.acres < 100){ > >> > min.plots = 2 > >> > } else if(number.strata == 3 & project.acres > 100 & > project.acres < > >> > 500){ > >> > min.plots = 3 > >> > } else if(number.strata == 3 & project.acres > 501 & project.acres < > >> > 5000){ > >> > min.plots = 4 > >> > } else if(number.strata == 3 & project.acres > 5001 & project.acres < > >> > 10000){ > >> > min.plots = 5 > >> > } else if(number.strata == 3 & project.acres > 10000){ > >> > min.plots = 6 > >> > } else {min.plots=NULL} > >> > } else (min.plots = min.plots) > >> > > >> > > >> > if(number.strata > 3){print("max three strata")} > >> > if(number.strata > 3){break} > >> > > >> > > >> > p <- length(verification.plots) > >> > mp <- min.plots > >> > run <- unpaired.test(verification.plots, project.n, project.mean, > >> > project.sd, > >> > allowed.deviation, project.acres, alpha=.05) > >> > number.passing.plots <- function(verification.plots, > success.fail.num){ > >> > n<-length(verification.plots) > >> > number.passing<-rep(0,n) > >> > number.passing[1]<-ifelse(sum(success.fail.num[1]> mp),1,0) > >> > for(i in 2:n){ > >> > if(success.fail.num[i] == 1){ > >> > v <- i-1 > >> > number.passing[i]<-number.passing[v]+1 > >> > } else{number.passing[i] <- 0} > >> > } > >> > number.passing > >> > } > >> > > >> > > number.passing<-number.passing.plots(verification.plots=verification.plots, > >> > success.fail.num=run$success.fail.num) > >> > sucessful.unsucessful<-rep("blank",p) > >> > one.zero <- rep(0,p) > >> > result <- "blank" > >> > resultL <- 0 > >> > n <-length(verification.plots) > >> > for(i in 1:n){ > >> > if(number.passing[i] >= mp) { > >> > sucessful.unsucessful[i] <- "successful" > >> > one.zero[i] <- 1 > >> > } else { > >> > sucessful.unsucessful[i]<- "unsuccessful" > >> > one.zero[i] <- 0 > >> > } > >> > } > >> > > >> > if(sum(one.zero) > 0 ) { > >> > result <- "successful" > >> > resultL <- 1 > >> > } > >> > > >> > plots.success <- function(one.zero){ > >> > > >> > plots.to.success<-NULL > >> > > >> > for(i in 1:n){ > >> > if(sum(one.zero)==0){plots.to.success= NA > >> > } else if(one.zero[i]==1){plots.to.success<-append(plots.to.success, > >> > i)} > >> > } > >> > plots.to.success<-min(plots.to.success) > >> > plots.to.success > >> > } > >> > > >> > plots.to.success <- plots.success(one.zero=one.zero) > >> > > >> > complete <-data.frame (min.plots, number.passing, > sucessful.unsucessful, > >> > one.zero) > >> > collapsed <- data.frame(result, resultL, plots.to.success) > >> > out <- c(as.list(complete),as.list(collapsed)) > >> > } > >> > > >> > > >> > unpaired.out<-sequential.unpaired(number.strata=2, > >> > verification.plots=verification.plots, project.n=15, project.mean=100, > >> > project.sd=10, allowed.deviation=.05, project.acres=99, > >> > min.plots="default") > >> > > >> > str(unpaired.out) > >> > #unpaired.out[[7]][1] > >> > > >> > ## > >> > > >> > simulation.unpaired <- function(reps, project.acreage.range = c(99, > >> > 110,510,5100,11000), project.mean, project.n.min, project.n.max, > >> > project.sd.min, project.sd.max, verification.mean.min, > >> > verification.mean.max, number.verification.plots.min, > >> > number.verification.plots.max, allowed.deviation=.1, alpha=.05, > >> > length.out > >> > = 5, min.plots="default") { > >> > > >> > number.strata.range<-c(1:3) # set by CAR > >> > > >> > project.n.range <- seq(project.n.min, project.n.max, > >> > length.out=length.out) > >> > > >> > project.sd.range <- seq(project.sd.min, project.sd.max, > >> > length.out=length.out) # assume verification sd is the same as the > >> > project > >> > sd to simplify - seems a reasonable simpification > >> > > >> > number.verification.plots<- seq(number.verification.plots.min, > >> > number.verification.plots.max, length.out=length.out) > >> > > >> > verification.range <- seq(verification.mean.min, > verification.mean.max, > >> > length.out=length.out) > >> > > >> > permut.grid<-expand.grid(number.strata.range, project.n.range, > >> > project.acreage.range, project.mean, project.sd.range, > >> > number.verification.plots, verification.range, allowed.deviation) # > >> > create > >> > a matrix with all combinations of the supplied vectors > >> > > >> > #assign names to the colums of the grid of combinations > >> > names.permut<-c("number.strata", "project.n.plots", "project.acreage", > >> > "project.mean", "project.sd", "number.verification.plots", > >> > "verification.mean", "allowed.deviation") > >> > > >> > names(permut.grid)<-names.permut # done > >> > > >> > combinations<-length(permut.grid[,1]) > >> > > >> > size <-reps*combinations #need to know the size of the master matrix, > >> > which > >> > is the the number of replications * each combination of the supplied > >> > factors > >> > > >> > # we want a df from which to read all the data into the simulation, > and > >> > record the results > >> > permut.col<-ncol(permut.grid) > >> > col.base<-ncol(permut.grid)+2 > >> > base <- matrix(nrow=size, ncol=col.base) > >> > base <-data.frame(base) > >> > > >> > # supply the names > >> > names.base<-c("number.strata", "project.n.plots", "project.acreage", > >> > "project.mean", "project.sd", "number.verification.plots", > >> > "verification.mean", "allowed.deviation","success.fail", > >> > "plots.to.success") > >> > > >> > names(base)<-names.base > >> > > >> > # need to create index vectors for the base, master df > >> > ends <- seq(reps+1, size+1, by=reps) > >> > begins <- ends-reps > >> > index <- cbind(begins, ends-1) > >> > #done > >> > > >> > # next, need to assign the first 6 columns and number of rows = to the > >> > number of reps in the simulation to be the given row in the > permut.grid > >> > matrix > >> > > >> > pb <- winProgressBar(title="Create base, big loop 1 of 2", label="0% > >> > done", > >> > min=0, max=100, initial=0) > >> > > >> > for (i in 1:combinations) { > >> > > >> > base[index[i,1]:index[i,2],1:permut.col] <- permut.grid[i,] > >> > #progress bar > >> > info <- sprintf("%d%% done", round((i/combinations)*100)) > >> > setWinProgressBar(pb, (i/combinations)*100, label=info) > >> > } > >> > > >> > close(pb) > >> > > >> > # now, simply feed the values replicated the number of times we want > to > >> > run > >> > the simulation into the sequential.unpaired function, and assign the > >> > values > >> > to the appropriate columns > >> > > >> > out.index1<-ncol(permut.grid)+1 > >> > out.index2<-ncol(permut.grid)+2 > >> > > >> > #progress bar > >> > pb <- winProgressBar(title="fill values, big loop 2 of 2", label="0% > >> > done", > >> > min=0, max=100, initial=0) > >> > > >> > for (i in 1:size){ > >> > > >> > scalar.base <- base[i,] > >> > verification.plots <- rnorm(scalar.base$number.verification.plots, > >> > scalar.base$verification.mean, scalar.base$project.sd) > >> > result<- sequential.unpaired(scalar.base$number.strata, > >> > scalar.base$project.n.plots, scalar.base$project.mean, scalar.base$ > >> > project.sd, verification.plots, scalar.base$allowed.deviation, > >> > scalar.base$project.acreage, min.plots='default', alpha) > >> > > >> > base[i,out.index1] <- result[[6]][1] > >> > base[i,out.index2] <- result[[7]][1] > >> > info <- sprintf("%d%% done", round((i/size)*100)) > >> > setWinProgressBar(pb, (i/size)*100, label=info) > >> > } > >> > > >> > close(pb) > >> > #return the results > >> > return(base) > >> > > >> > } > >> > > >> > # I would like reps to = 1000, but that takes a really long time right > >> > now > >> > test5 <- simulation.unpaired(reps=5, project.acreage.range = c(99, > >> > 110,510,5100,11000), project.mean=100, project.n.min=10, > >> > project.n.max=100, > >> > project.sd.min=.01, project.sd.max=.2, verification.mean.min=100, > >> > verification.mean.max=130, number.verification.plots.min=10, > >> > number.verification.plots.max=50, length.out = 5) > >> > > >> > [[alternative HTML version deleted]] > >> > > >> > ______________________________________________ > >> > R-help@r-project.org mailing list > >> > https://stat.ethz.ch/mailman/listinfo/r-help > >> > PLEASE do read the posting guide > >> > http://www.R-project.org/posting-guide.html > >> > and provide commented, minimal, self-contained, reproducible code. > >> > >> > >> > >> -- > >> Joshua Wiley > >> Ph.D. Student, Health Psychology > >> Programmer Analyst II, Statistical Consulting Group > >> University of California, Los Angeles > >> https://joshuawiley.com/ > > > > > > > > -- > Joshua Wiley > Ph.D. Student, Health Psychology > Programmer Analyst II, Statistical Consulting Group > University of California, Los Angeles > https://joshuawiley.com/ >[[alternative HTML version deleted]]