Benjamin Caldwell
2013-Feb-12 05:22 UTC
[R] improving/speeding up a very large, slow simulation
Dear R help;
I'll preface this by saying that the example I've provided below is
pretty
long, turgid, and otherwise a deep dive into a series of functions I wrote
for a simulation study. It is, however, reproducible and self-contained.
I'm trying to do my first simulation study that's quite big, and so
I'll
say that the output of this simulation as I'd like it to be is 9.375
million rows (9375 combinations of the inputs * 1000). So, first thing,
maybe I should try to cut down on the number of dimensions and do it a
piece at a time. If that's a conclusion people come to here, I'll look
into
paring down/doing it a chunk at a time.
I'm hoping, though, that this is an opportunity for me to learn some more
efficient, elegant ways to a. vectorize and b. use the apply functions,
which still seem to elude me when it comes to their use with more complex,
custom functions that have multiple inputs.
If having looked the below over you can give me suggestions to help make
this and future code I write run more quickly/efficiently, I will be
most grateful, as as this is currently written it would take what looks
like days to run a thousand simulations of each possible combination of
variables of interest.
Best
Ben Caldwell
-----------------------------------------------
#unpaired
verification.plots<-rnorm(30, 100, 10)
# writeClipboard(as.character(verification.plots))
unpaired.test<- function(verification.plots, project.n, project.mean,
project.sd, allowed.deviation, project.acres, alpha=.05){
verification.plots <-as.numeric(as.character(verification.plots))
a <- qnorm(alpha/2)
d <- alpha*project.mean
# verifier plot number
n<-length(verification.plots)
verifier.plot.number <- c(1:n)
#all plots (verifier and project)
all.plots.n <- rep(0,n)
for(i in 1:n){
all.plots.n[i] <- project.n + verifier.plot.number[i]
}
#running mean
X_bar <- rep(1,n)
for(i in 1:n){
X_bar[i]<- mean(verification.plots[1:i])
}
#running sd
sd2 <- NULL
for(i in 2:n){
sd2[i]<-sd(verification.plots[1:i])
}
#Tn
Tn<-NULL
for(i in 2:n){
Tn[i] <- project.mean-X_bar[i]
}
#n_Threshold
n_thresh<-NULL
for(i in 2:n){
n_thresh[i] <- (((a^2)/(d^2))*((project.sd^2)+(sd2[i]^2)))
}
#Upper
upper <- NULL
for (i in 2:n){
upper[i] <- Tn[i]-d
}
#Lower
lower<- NULL
for(i in 2:n){
lower[i] <- Tn[i]+d
}
#Success.fail
success.fail <- NULL
success.fail.num <- rep(0,n)
if(abs(verification.plots[1]-project.mean)<(allowed.deviation*project.mean)){
success.fail[1] <- "Pass"
success.fail.num[1] <- 1
} else {
success.fail[1] <- "Fail"
success.fail.num[1] <-0
}
for(i in 2:n){
if(all.plots.n[i]>=n_thresh[i]){
if(upper[i] <= 0){
if(lower[i] >= 0){
success.fail[i]<-"Pass"
success.fail.num[i]<-1
} else {
success.fail[i] <- "Fail"
success.fail.num[i] <-0}
} else {
success.fail[i] <- "Fail"
success.fail.num[i] <- 0
}
} else {
success.fail[i] <- "Inconclusive"
success.fail.num[i] <- 0
}
}
out.unpaired.test <- data.frame(success.fail, success.fail.num)
}
out.unpaired.test<-unpaired.test(verification.plots=verification.plots,
project.n=30, project.mean=100, project.sd=10, allowed.deviation=.05,
project.acres=99)
out.unpaired.test
#
sequential.unpaired<- function(number.strata, project.n, project.mean,
project.sd, verification.plots,allowed.deviation, project.acres,
min.plots="default", alpha=.05){
if(min.plots=="default"){
min.plots<-NULL # minimum passing
if(number.strata == 1 & project.acres <= 100){
min.plots = 8
} else if(number.strata == 1 & project.acres > 100 & project.acres
< 500){
min.plots = 12
} else if(number.strata == 1 & project.acres > 500 & project.acres
< 5000){
min.plots = 16
} else if(number.strata == 1 & project.acres > 5000 &
project.acres <
10000){
min.plots = 20
} else if(number.strata == 1 & project.acres > 10000){
min.plots = 24
} else if(number.strata == 2 & project.acres < 100){
min.plots = 4
} else if(number.strata == 2 & project.acres > 100 & project.acres
< 500){
min.plots = 6
} else if(number.strata == 2 & project.acres > 501 & project.acres
< 5000){
min.plots = 8
} else if(number.strata == 2 & project.acres > 5001 & project.acres
<
10000){
min.plots = 10
} else if(number.strata == 2 & project.acres > 10000){
min.plots = 12
} else if(number.strata == 3 & project.acres < 100){
min.plots = 2
} else if(number.strata == 3 & project.acres > 100 &
project.acres <
500){
min.plots = 3
} else if(number.strata == 3 & project.acres > 501 & project.acres
< 5000){
min.plots = 4
} else if(number.strata == 3 & project.acres > 5001 & project.acres
<
10000){
min.plots = 5
} else if(number.strata == 3 & project.acres > 10000){
min.plots = 6
} else {min.plots=NULL}
} else (min.plots = min.plots)
if(number.strata > 3){print("max three strata")}
if(number.strata > 3){break}
p <- length(verification.plots)
mp <- min.plots
run <- unpaired.test(verification.plots, project.n, project.mean, project.sd,
allowed.deviation, project.acres, alpha=.05)
number.passing.plots <- function(verification.plots, success.fail.num){
n<-length(verification.plots)
number.passing<-rep(0,n)
number.passing[1]<-ifelse(sum(success.fail.num[1]> mp),1,0)
for(i in 2:n){
if(success.fail.num[i] == 1){
v <- i-1
number.passing[i]<-number.passing[v]+1
} else{number.passing[i] <- 0}
}
number.passing
}
number.passing<-number.passing.plots(verification.plots=verification.plots,
success.fail.num=run$success.fail.num)
sucessful.unsucessful<-rep("blank",p)
one.zero <- rep(0,p)
result <- "blank"
resultL <- 0
n <-length(verification.plots)
for(i in 1:n){
if(number.passing[i] >= mp) {
sucessful.unsucessful[i] <- "successful"
one.zero[i] <- 1
} else {
sucessful.unsucessful[i]<- "unsuccessful"
one.zero[i] <- 0
}
}
if(sum(one.zero) > 0 ) {
result <- "successful"
resultL <- 1
}
plots.success <- function(one.zero){
plots.to.success<-NULL
for(i in 1:n){
if(sum(one.zero)==0){plots.to.success= NA
} else if(one.zero[i]==1){plots.to.success<-append(plots.to.success, i)}
}
plots.to.success<-min(plots.to.success)
plots.to.success
}
plots.to.success <- plots.success(one.zero=one.zero)
complete <-data.frame (min.plots, number.passing, sucessful.unsucessful,
one.zero)
collapsed <- data.frame(result, resultL, plots.to.success)
out <- c(as.list(complete),as.list(collapsed))
}
unpaired.out<-sequential.unpaired(number.strata=2,
verification.plots=verification.plots, project.n=15, project.mean=100,
project.sd=10, allowed.deviation=.05, project.acres=99,
min.plots="default")
str(unpaired.out)
#unpaired.out[[7]][1]
##
simulation.unpaired <- function(reps, project.acreage.range = c(99,
110,510,5100,11000), project.mean, project.n.min, project.n.max,
project.sd.min, project.sd.max, verification.mean.min,
verification.mean.max, number.verification.plots.min,
number.verification.plots.max, allowed.deviation=.1, alpha=.05, length.out
= 5, min.plots="default") {
number.strata.range<-c(1:3) # set by CAR
project.n.range <- seq(project.n.min, project.n.max, length.out=length.out)
project.sd.range <- seq(project.sd.min, project.sd.max,
length.out=length.out) # assume verification sd is the same as the project
sd to simplify - seems a reasonable simpification
number.verification.plots<- seq(number.verification.plots.min,
number.verification.plots.max, length.out=length.out)
verification.range <- seq(verification.mean.min, verification.mean.max,
length.out=length.out)
permut.grid<-expand.grid(number.strata.range, project.n.range,
project.acreage.range, project.mean, project.sd.range,
number.verification.plots, verification.range, allowed.deviation) # create
a matrix with all combinations of the supplied vectors
#assign names to the colums of the grid of combinations
names.permut<-c("number.strata", "project.n.plots",
"project.acreage",
"project.mean", "project.sd",
"number.verification.plots",
"verification.mean", "allowed.deviation")
names(permut.grid)<-names.permut # done
combinations<-length(permut.grid[,1])
size <-reps*combinations #need to know the size of the master matrix, which
is the the number of replications * each combination of the supplied factors
# we want a df from which to read all the data into the simulation, and
record the results
permut.col<-ncol(permut.grid)
col.base<-ncol(permut.grid)+2
base <- matrix(nrow=size, ncol=col.base)
base <-data.frame(base)
# supply the names
names.base<-c("number.strata", "project.n.plots",
"project.acreage",
"project.mean", "project.sd",
"number.verification.plots",
"verification.mean",
"allowed.deviation","success.fail",
"plots.to.success")
names(base)<-names.base
# need to create index vectors for the base, master df
ends <- seq(reps+1, size+1, by=reps)
begins <- ends-reps
index <- cbind(begins, ends-1)
#done
# next, need to assign the first 6 columns and number of rows = to the
number of reps in the simulation to be the given row in the permut.grid
matrix
pb <- winProgressBar(title="Create base, big loop 1 of 2",
label="0% done",
min=0, max=100, initial=0)
for (i in 1:combinations) {
base[index[i,1]:index[i,2],1:permut.col] <- permut.grid[i,]
#progress bar
info <- sprintf("%d%% done", round((i/combinations)*100))
setWinProgressBar(pb, (i/combinations)*100, label=info)
}
close(pb)
# now, simply feed the values replicated the number of times we want to run
the simulation into the sequential.unpaired function, and assign the values
to the appropriate columns
out.index1<-ncol(permut.grid)+1
out.index2<-ncol(permut.grid)+2
#progress bar
pb <- winProgressBar(title="fill values, big loop 2 of 2",
label="0% done",
min=0, max=100, initial=0)
for (i in 1:size){
scalar.base <- base[i,]
verification.plots <- rnorm(scalar.base$number.verification.plots,
scalar.base$verification.mean, scalar.base$project.sd)
result<- sequential.unpaired(scalar.base$number.strata,
scalar.base$project.n.plots, scalar.base$project.mean, scalar.base$
project.sd, verification.plots, scalar.base$allowed.deviation,
scalar.base$project.acreage, min.plots='default', alpha)
base[i,out.index1] <- result[[6]][1]
base[i,out.index2] <- result[[7]][1]
info <- sprintf("%d%% done", round((i/size)*100))
setWinProgressBar(pb, (i/size)*100, label=info)
}
close(pb)
#return the results
return(base)
}
# I would like reps to = 1000, but that takes a really long time right now
test5 <- simulation.unpaired(reps=5, project.acreage.range = c(99,
110,510,5100,11000), project.mean=100, project.n.min=10, project.n.max=100,
project.sd.min=.01, project.sd.max=.2, verification.mean.min=100,
verification.mean.max=130, number.verification.plots.min=10,
number.verification.plots.max=50, length.out = 5)
[[alternative HTML version deleted]]
Anindya Sankar Dey
2013-Feb-12 06:01 UTC
[R] improving/speeding up a very large, slow simulation
Hi, I've just read your intro, not the full code. But one way first divide your input data into parts and make a list where each element of the list is one part of your input data. Rest of code put in a function, which will work on a part of your data. Build the function such that it takes list as input. At the beginning of the function unlist your function variable, and then convert to required data frame. Then use mclapply of multicore/parallel package and you'll get results much faster. Well this is the way I'd have tried, but there are other experts who can give a better way to do it. Regards, Anindya On Tue, Feb 12, 2013 at 10:52 AM, Benjamin Caldwell <btcaldwell@berkeley.edu> wrote:> Dear R help; > > I'll preface this by saying that the example I've provided below is pretty > long, turgid, and otherwise a deep dive into a series of functions I wrote > for a simulation study. It is, however, reproducible and self-contained. > > I'm trying to do my first simulation study that's quite big, and so I'll > say that the output of this simulation as I'd like it to be is 9.375 > million rows (9375 combinations of the inputs * 1000). So, first thing, > maybe I should try to cut down on the number of dimensions and do it a > piece at a time. If that's a conclusion people come to here, I'll look into > paring down/doing it a chunk at a time. > > I'm hoping, though, that this is an opportunity for me to learn some more > efficient, elegant ways to a. vectorize and b. use the apply functions, > which still seem to elude me when it comes to their use with more complex, > custom functions that have multiple inputs. > > If having looked the below over you can give me suggestions to help make > this and future code I write run more quickly/efficiently, I will be > most grateful, as as this is currently written it would take what looks > like days to run a thousand simulations of each possible combination of > variables of interest. > > Best > > Ben Caldwell > > ----------------------------------------------- > #unpaired > verification.plots<-rnorm(30, 100, 10) > # writeClipboard(as.character(verification.plots)) > > unpaired.test<- function(verification.plots, project.n, project.mean, > project.sd, allowed.deviation, project.acres, alpha=.05){ > > verification.plots <-as.numeric(as.character(verification.plots)) > a <- qnorm(alpha/2) > d <- alpha*project.mean > > # verifier plot number > > n<-length(verification.plots) > verifier.plot.number <- c(1:n) > > > #all plots (verifier and project) > > all.plots.n <- rep(0,n) > for(i in 1:n){ > all.plots.n[i] <- project.n + verifier.plot.number[i] > } > > #running mean > X_bar <- rep(1,n) > > for(i in 1:n){ > X_bar[i]<- mean(verification.plots[1:i]) > } > #running sd > sd2 <- NULL > > for(i in 2:n){ > sd2[i]<-sd(verification.plots[1:i]) > } > #Tn > Tn<-NULL > > for(i in 2:n){ > Tn[i] <- project.mean-X_bar[i] > } > #n_Threshold > n_thresh<-NULL > > for(i in 2:n){ > n_thresh[i] <- (((a^2)/(d^2))*((project.sd^2)+(sd2[i]^2))) > } > #Upper > upper <- NULL > for (i in 2:n){ > upper[i] <- Tn[i]-d > } > #Lower > lower<- NULL > for(i in 2:n){ > lower[i] <- Tn[i]+d > } > > #Success.fail > success.fail <- NULL > > > > success.fail.num <- rep(0,n) > > if(abs(verification.plots[1]-project.mean)<(allowed.deviation*project.mean)){ > success.fail[1] <- "Pass" > success.fail.num[1] <- 1 > } else { > success.fail[1] <- "Fail" > success.fail.num[1] <-0 > } > for(i in 2:n){ > if(all.plots.n[i]>=n_thresh[i]){ > if(upper[i] <= 0){ > if(lower[i] >= 0){ > success.fail[i]<-"Pass" > success.fail.num[i]<-1 > } else { > success.fail[i] <- "Fail" > success.fail.num[i] <-0} > } else { > success.fail[i] <- "Fail" > success.fail.num[i] <- 0 > } > > } else { > success.fail[i] <- "Inconclusive" > success.fail.num[i] <- 0 > } > } > > out.unpaired.test <- data.frame(success.fail, success.fail.num) > } > > > out.unpaired.test<-unpaired.test(verification.plots=verification.plots, > project.n=30, project.mean=100, project.sd=10, allowed.deviation=.05, > project.acres=99) > out.unpaired.test > # > sequential.unpaired<- function(number.strata, project.n, project.mean, > project.sd, verification.plots,allowed.deviation, project.acres, > min.plots="default", alpha=.05){ > > if(min.plots=="default"){ > min.plots<-NULL # minimum passing > if(number.strata == 1 & project.acres <= 100){ > min.plots = 8 > } else if(number.strata == 1 & project.acres > 100 & project.acres < 500){ > min.plots = 12 > } else if(number.strata == 1 & project.acres > 500 & project.acres < > 5000){ > min.plots = 16 > } else if(number.strata == 1 & project.acres > 5000 & project.acres < > 10000){ > min.plots = 20 > } else if(number.strata == 1 & project.acres > 10000){ > min.plots = 24 > } else if(number.strata == 2 & project.acres < 100){ > min.plots = 4 > } else if(number.strata == 2 & project.acres > 100 & project.acres < 500){ > min.plots = 6 > } else if(number.strata == 2 & project.acres > 501 & project.acres < 5000){ > min.plots = 8 > } else if(number.strata == 2 & project.acres > 5001 & project.acres < > 10000){ > min.plots = 10 > } else if(number.strata == 2 & project.acres > 10000){ > min.plots = 12 > } else if(number.strata == 3 & project.acres < 100){ > min.plots = 2 > } else if(number.strata == 3 & project.acres > 100 & project.acres < > 500){ > min.plots = 3 > } else if(number.strata == 3 & project.acres > 501 & project.acres < 5000){ > min.plots = 4 > } else if(number.strata == 3 & project.acres > 5001 & project.acres < > 10000){ > min.plots = 5 > } else if(number.strata == 3 & project.acres > 10000){ > min.plots = 6 > } else {min.plots=NULL} > } else (min.plots = min.plots) > > > if(number.strata > 3){print("max three strata")} > if(number.strata > 3){break} > > > p <- length(verification.plots) > mp <- min.plots > run <- unpaired.test(verification.plots, project.n, project.mean, > project.sd, > allowed.deviation, project.acres, alpha=.05) > number.passing.plots <- function(verification.plots, success.fail.num){ > n<-length(verification.plots) > number.passing<-rep(0,n) > number.passing[1]<-ifelse(sum(success.fail.num[1]> mp),1,0) > for(i in 2:n){ > if(success.fail.num[i] == 1){ > v <- i-1 > number.passing[i]<-number.passing[v]+1 > } else{number.passing[i] <- 0} > } > number.passing > } > number.passing<-number.passing.plots(verification.plots=verification.plots, > success.fail.num=run$success.fail.num) > sucessful.unsucessful<-rep("blank",p) > one.zero <- rep(0,p) > result <- "blank" > resultL <- 0 > n <-length(verification.plots) > for(i in 1:n){ > if(number.passing[i] >= mp) { > sucessful.unsucessful[i] <- "successful" > one.zero[i] <- 1 > } else { > sucessful.unsucessful[i]<- "unsuccessful" > one.zero[i] <- 0 > } > } > > if(sum(one.zero) > 0 ) { > result <- "successful" > resultL <- 1 > } > > plots.success <- function(one.zero){ > > plots.to.success<-NULL > > for(i in 1:n){ > if(sum(one.zero)==0){plots.to.success= NA > } else if(one.zero[i]==1){plots.to.success<-append(plots.to.success, i)} > } > plots.to.success<-min(plots.to.success) > plots.to.success > } > > plots.to.success <- plots.success(one.zero=one.zero) > > complete <-data.frame (min.plots, number.passing, sucessful.unsucessful, > one.zero) > collapsed <- data.frame(result, resultL, plots.to.success) > out <- c(as.list(complete),as.list(collapsed)) > } > > > unpaired.out<-sequential.unpaired(number.strata=2, > verification.plots=verification.plots, project.n=15, project.mean=100, > project.sd=10, allowed.deviation=.05, project.acres=99, > min.plots="default") > > str(unpaired.out) > #unpaired.out[[7]][1] > > ## > > simulation.unpaired <- function(reps, project.acreage.range = c(99, > 110,510,5100,11000), project.mean, project.n.min, project.n.max, > project.sd.min, project.sd.max, verification.mean.min, > verification.mean.max, number.verification.plots.min, > number.verification.plots.max, allowed.deviation=.1, alpha=.05, length.out > = 5, min.plots="default") { > > number.strata.range<-c(1:3) # set by CAR > > project.n.range <- seq(project.n.min, project.n.max, length.out=length.out) > > project.sd.range <- seq(project.sd.min, project.sd.max, > length.out=length.out) # assume verification sd is the same as the project > sd to simplify - seems a reasonable simpification > > number.verification.plots<- seq(number.verification.plots.min, > number.verification.plots.max, length.out=length.out) > > verification.range <- seq(verification.mean.min, verification.mean.max, > length.out=length.out) > > permut.grid<-expand.grid(number.strata.range, project.n.range, > project.acreage.range, project.mean, project.sd.range, > number.verification.plots, verification.range, allowed.deviation) # create > a matrix with all combinations of the supplied vectors > > #assign names to the colums of the grid of combinations > names.permut<-c("number.strata", "project.n.plots", "project.acreage", > "project.mean", "project.sd", "number.verification.plots", > "verification.mean", "allowed.deviation") > > names(permut.grid)<-names.permut # done > > combinations<-length(permut.grid[,1]) > > size <-reps*combinations #need to know the size of the master matrix, which > is the the number of replications * each combination of the supplied > factors > > # we want a df from which to read all the data into the simulation, and > record the results > permut.col<-ncol(permut.grid) > col.base<-ncol(permut.grid)+2 > base <- matrix(nrow=size, ncol=col.base) > base <-data.frame(base) > > # supply the names > names.base<-c("number.strata", "project.n.plots", "project.acreage", > "project.mean", "project.sd", "number.verification.plots", > "verification.mean", "allowed.deviation","success.fail", > "plots.to.success") > > names(base)<-names.base > > # need to create index vectors for the base, master df > ends <- seq(reps+1, size+1, by=reps) > begins <- ends-reps > index <- cbind(begins, ends-1) > #done > > # next, need to assign the first 6 columns and number of rows = to the > number of reps in the simulation to be the given row in the permut.grid > matrix > > pb <- winProgressBar(title="Create base, big loop 1 of 2", label="0% done", > min=0, max=100, initial=0) > > for (i in 1:combinations) { > > base[index[i,1]:index[i,2],1:permut.col] <- permut.grid[i,] > #progress bar > info <- sprintf("%d%% done", round((i/combinations)*100)) > setWinProgressBar(pb, (i/combinations)*100, label=info) > } > > close(pb) > > # now, simply feed the values replicated the number of times we want to run > the simulation into the sequential.unpaired function, and assign the values > to the appropriate columns > > out.index1<-ncol(permut.grid)+1 > out.index2<-ncol(permut.grid)+2 > > #progress bar > pb <- winProgressBar(title="fill values, big loop 2 of 2", label="0% done", > min=0, max=100, initial=0) > > for (i in 1:size){ > > scalar.base <- base[i,] > verification.plots <- rnorm(scalar.base$number.verification.plots, > scalar.base$verification.mean, scalar.base$project.sd) > result<- sequential.unpaired(scalar.base$number.strata, > scalar.base$project.n.plots, scalar.base$project.mean, scalar.base$ > project.sd, verification.plots, scalar.base$allowed.deviation, > scalar.base$project.acreage, min.plots='default', alpha) > > base[i,out.index1] <- result[[6]][1] > base[i,out.index2] <- result[[7]][1] > info <- sprintf("%d%% done", round((i/size)*100)) > setWinProgressBar(pb, (i/size)*100, label=info) > } > > close(pb) > #return the results > return(base) > > } > > # I would like reps to = 1000, but that takes a really long time right now > test5 <- simulation.unpaired(reps=5, project.acreage.range = c(99, > 110,510,5100,11000), project.mean=100, project.n.min=10, project.n.max=100, > project.sd.min=.01, project.sd.max=.2, verification.mean.min=100, > verification.mean.max=130, number.verification.plots.min=10, > number.verification.plots.max=50, length.out = 5) > > [[alternative HTML version deleted]] > > ______________________________________________ > R-help@r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide > http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code. >-- Anindya Sankar Dey [[alternative HTML version deleted]]
Joshua Wiley
2013-Feb-12 06:10 UTC
[R] improving/speeding up a very large, slow simulation
Hi Ben, I appreciate that the example is reproducible, and I understand why you shared the entire project. At the same time, that is an awful lot of code for volunteers to go through and try to optimize. I would try to think of the structure of your task, see what the individual pieces are---figure out what can be reused and optimized. Look at loops and try to think whether some operation performed on every element of the vector at once could do the same thing. Sometimes the next iteration of a loop depends on the previous, so it is difficult to vectorize, but often that is not the case. Make use of the compiler package, especially cmpfun(). It can give fairly substantial speedups, particularly with for loops in R. Finally if you want 1000 reps, do you actually need to repeat all the steps 1000 times, or could you just simulate 1000x the data? If the latter, do the steps once but make more data. If the former, you ought to be able to parallelize it fairly easily, see the parallel package. Good luck, Josh On Mon, Feb 11, 2013 at 9:22 PM, Benjamin Caldwell <btcaldwell at berkeley.edu> wrote:> Dear R help; > > I'll preface this by saying that the example I've provided below is pretty > long, turgid, and otherwise a deep dive into a series of functions I wrote > for a simulation study. It is, however, reproducible and self-contained. > > I'm trying to do my first simulation study that's quite big, and so I'll > say that the output of this simulation as I'd like it to be is 9.375 > million rows (9375 combinations of the inputs * 1000). So, first thing, > maybe I should try to cut down on the number of dimensions and do it a > piece at a time. If that's a conclusion people come to here, I'll look into > paring down/doing it a chunk at a time. > > I'm hoping, though, that this is an opportunity for me to learn some more > efficient, elegant ways to a. vectorize and b. use the apply functions, > which still seem to elude me when it comes to their use with more complex, > custom functions that have multiple inputs. > > If having looked the below over you can give me suggestions to help make > this and future code I write run more quickly/efficiently, I will be > most grateful, as as this is currently written it would take what looks > like days to run a thousand simulations of each possible combination of > variables of interest. > > Best > > Ben Caldwell > > ----------------------------------------------- > #unpaired > verification.plots<-rnorm(30, 100, 10) > # writeClipboard(as.character(verification.plots)) > > unpaired.test<- function(verification.plots, project.n, project.mean, > project.sd, allowed.deviation, project.acres, alpha=.05){ > > verification.plots <-as.numeric(as.character(verification.plots)) > a <- qnorm(alpha/2) > d <- alpha*project.mean > > # verifier plot number > > n<-length(verification.plots) > verifier.plot.number <- c(1:n) > > > #all plots (verifier and project) > > all.plots.n <- rep(0,n) > for(i in 1:n){ > all.plots.n[i] <- project.n + verifier.plot.number[i] > } > > #running mean > X_bar <- rep(1,n) > > for(i in 1:n){ > X_bar[i]<- mean(verification.plots[1:i]) > } > #running sd > sd2 <- NULL > > for(i in 2:n){ > sd2[i]<-sd(verification.plots[1:i]) > } > #Tn > Tn<-NULL > > for(i in 2:n){ > Tn[i] <- project.mean-X_bar[i] > } > #n_Threshold > n_thresh<-NULL > > for(i in 2:n){ > n_thresh[i] <- (((a^2)/(d^2))*((project.sd^2)+(sd2[i]^2))) > } > #Upper > upper <- NULL > for (i in 2:n){ > upper[i] <- Tn[i]-d > } > #Lower > lower<- NULL > for(i in 2:n){ > lower[i] <- Tn[i]+d > } > > #Success.fail > success.fail <- NULL > > > > success.fail.num <- rep(0,n) > if(abs(verification.plots[1]-project.mean)<(allowed.deviation*project.mean)){ > success.fail[1] <- "Pass" > success.fail.num[1] <- 1 > } else { > success.fail[1] <- "Fail" > success.fail.num[1] <-0 > } > for(i in 2:n){ > if(all.plots.n[i]>=n_thresh[i]){ > if(upper[i] <= 0){ > if(lower[i] >= 0){ > success.fail[i]<-"Pass" > success.fail.num[i]<-1 > } else { > success.fail[i] <- "Fail" > success.fail.num[i] <-0} > } else { > success.fail[i] <- "Fail" > success.fail.num[i] <- 0 > } > > } else { > success.fail[i] <- "Inconclusive" > success.fail.num[i] <- 0 > } > } > > out.unpaired.test <- data.frame(success.fail, success.fail.num) > } > > > out.unpaired.test<-unpaired.test(verification.plots=verification.plots, > project.n=30, project.mean=100, project.sd=10, allowed.deviation=.05, > project.acres=99) > out.unpaired.test > # > sequential.unpaired<- function(number.strata, project.n, project.mean, > project.sd, verification.plots,allowed.deviation, project.acres, > min.plots="default", alpha=.05){ > > if(min.plots=="default"){ > min.plots<-NULL # minimum passing > if(number.strata == 1 & project.acres <= 100){ > min.plots = 8 > } else if(number.strata == 1 & project.acres > 100 & project.acres < 500){ > min.plots = 12 > } else if(number.strata == 1 & project.acres > 500 & project.acres < 5000){ > min.plots = 16 > } else if(number.strata == 1 & project.acres > 5000 & project.acres < > 10000){ > min.plots = 20 > } else if(number.strata == 1 & project.acres > 10000){ > min.plots = 24 > } else if(number.strata == 2 & project.acres < 100){ > min.plots = 4 > } else if(number.strata == 2 & project.acres > 100 & project.acres < 500){ > min.plots = 6 > } else if(number.strata == 2 & project.acres > 501 & project.acres < 5000){ > min.plots = 8 > } else if(number.strata == 2 & project.acres > 5001 & project.acres < > 10000){ > min.plots = 10 > } else if(number.strata == 2 & project.acres > 10000){ > min.plots = 12 > } else if(number.strata == 3 & project.acres < 100){ > min.plots = 2 > } else if(number.strata == 3 & project.acres > 100 & project.acres < > 500){ > min.plots = 3 > } else if(number.strata == 3 & project.acres > 501 & project.acres < 5000){ > min.plots = 4 > } else if(number.strata == 3 & project.acres > 5001 & project.acres < > 10000){ > min.plots = 5 > } else if(number.strata == 3 & project.acres > 10000){ > min.plots = 6 > } else {min.plots=NULL} > } else (min.plots = min.plots) > > > if(number.strata > 3){print("max three strata")} > if(number.strata > 3){break} > > > p <- length(verification.plots) > mp <- min.plots > run <- unpaired.test(verification.plots, project.n, project.mean, project.sd, > allowed.deviation, project.acres, alpha=.05) > number.passing.plots <- function(verification.plots, success.fail.num){ > n<-length(verification.plots) > number.passing<-rep(0,n) > number.passing[1]<-ifelse(sum(success.fail.num[1]> mp),1,0) > for(i in 2:n){ > if(success.fail.num[i] == 1){ > v <- i-1 > number.passing[i]<-number.passing[v]+1 > } else{number.passing[i] <- 0} > } > number.passing > } > number.passing<-number.passing.plots(verification.plots=verification.plots, > success.fail.num=run$success.fail.num) > sucessful.unsucessful<-rep("blank",p) > one.zero <- rep(0,p) > result <- "blank" > resultL <- 0 > n <-length(verification.plots) > for(i in 1:n){ > if(number.passing[i] >= mp) { > sucessful.unsucessful[i] <- "successful" > one.zero[i] <- 1 > } else { > sucessful.unsucessful[i]<- "unsuccessful" > one.zero[i] <- 0 > } > } > > if(sum(one.zero) > 0 ) { > result <- "successful" > resultL <- 1 > } > > plots.success <- function(one.zero){ > > plots.to.success<-NULL > > for(i in 1:n){ > if(sum(one.zero)==0){plots.to.success= NA > } else if(one.zero[i]==1){plots.to.success<-append(plots.to.success, i)} > } > plots.to.success<-min(plots.to.success) > plots.to.success > } > > plots.to.success <- plots.success(one.zero=one.zero) > > complete <-data.frame (min.plots, number.passing, sucessful.unsucessful, > one.zero) > collapsed <- data.frame(result, resultL, plots.to.success) > out <- c(as.list(complete),as.list(collapsed)) > } > > > unpaired.out<-sequential.unpaired(number.strata=2, > verification.plots=verification.plots, project.n=15, project.mean=100, > project.sd=10, allowed.deviation=.05, project.acres=99, min.plots="default") > > str(unpaired.out) > #unpaired.out[[7]][1] > > ## > > simulation.unpaired <- function(reps, project.acreage.range = c(99, > 110,510,5100,11000), project.mean, project.n.min, project.n.max, > project.sd.min, project.sd.max, verification.mean.min, > verification.mean.max, number.verification.plots.min, > number.verification.plots.max, allowed.deviation=.1, alpha=.05, length.out > = 5, min.plots="default") { > > number.strata.range<-c(1:3) # set by CAR > > project.n.range <- seq(project.n.min, project.n.max, length.out=length.out) > > project.sd.range <- seq(project.sd.min, project.sd.max, > length.out=length.out) # assume verification sd is the same as the project > sd to simplify - seems a reasonable simpification > > number.verification.plots<- seq(number.verification.plots.min, > number.verification.plots.max, length.out=length.out) > > verification.range <- seq(verification.mean.min, verification.mean.max, > length.out=length.out) > > permut.grid<-expand.grid(number.strata.range, project.n.range, > project.acreage.range, project.mean, project.sd.range, > number.verification.plots, verification.range, allowed.deviation) # create > a matrix with all combinations of the supplied vectors > > #assign names to the colums of the grid of combinations > names.permut<-c("number.strata", "project.n.plots", "project.acreage", > "project.mean", "project.sd", "number.verification.plots", > "verification.mean", "allowed.deviation") > > names(permut.grid)<-names.permut # done > > combinations<-length(permut.grid[,1]) > > size <-reps*combinations #need to know the size of the master matrix, which > is the the number of replications * each combination of the supplied factors > > # we want a df from which to read all the data into the simulation, and > record the results > permut.col<-ncol(permut.grid) > col.base<-ncol(permut.grid)+2 > base <- matrix(nrow=size, ncol=col.base) > base <-data.frame(base) > > # supply the names > names.base<-c("number.strata", "project.n.plots", "project.acreage", > "project.mean", "project.sd", "number.verification.plots", > "verification.mean", "allowed.deviation","success.fail", "plots.to.success") > > names(base)<-names.base > > # need to create index vectors for the base, master df > ends <- seq(reps+1, size+1, by=reps) > begins <- ends-reps > index <- cbind(begins, ends-1) > #done > > # next, need to assign the first 6 columns and number of rows = to the > number of reps in the simulation to be the given row in the permut.grid > matrix > > pb <- winProgressBar(title="Create base, big loop 1 of 2", label="0% done", > min=0, max=100, initial=0) > > for (i in 1:combinations) { > > base[index[i,1]:index[i,2],1:permut.col] <- permut.grid[i,] > #progress bar > info <- sprintf("%d%% done", round((i/combinations)*100)) > setWinProgressBar(pb, (i/combinations)*100, label=info) > } > > close(pb) > > # now, simply feed the values replicated the number of times we want to run > the simulation into the sequential.unpaired function, and assign the values > to the appropriate columns > > out.index1<-ncol(permut.grid)+1 > out.index2<-ncol(permut.grid)+2 > > #progress bar > pb <- winProgressBar(title="fill values, big loop 2 of 2", label="0% done", > min=0, max=100, initial=0) > > for (i in 1:size){ > > scalar.base <- base[i,] > verification.plots <- rnorm(scalar.base$number.verification.plots, > scalar.base$verification.mean, scalar.base$project.sd) > result<- sequential.unpaired(scalar.base$number.strata, > scalar.base$project.n.plots, scalar.base$project.mean, scalar.base$ > project.sd, verification.plots, scalar.base$allowed.deviation, > scalar.base$project.acreage, min.plots='default', alpha) > > base[i,out.index1] <- result[[6]][1] > base[i,out.index2] <- result[[7]][1] > info <- sprintf("%d%% done", round((i/size)*100)) > setWinProgressBar(pb, (i/size)*100, label=info) > } > > close(pb) > #return the results > return(base) > > } > > # I would like reps to = 1000, but that takes a really long time right now > test5 <- simulation.unpaired(reps=5, project.acreage.range = c(99, > 110,510,5100,11000), project.mean=100, project.n.min=10, project.n.max=100, > project.sd.min=.01, project.sd.max=.2, verification.mean.min=100, > verification.mean.max=130, number.verification.plots.min=10, > number.verification.plots.max=50, length.out = 5) > > [[alternative HTML version deleted]] > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.-- Joshua Wiley Ph.D. Student, Health Psychology Programmer Analyst II, Statistical Consulting Group University of California, Los Angeles https://joshuawiley.com/
Benjamin Caldwell
2013-Feb-13 23:33 UTC
[R] improving/speeding up a very large, slow simulation
Joshua,
Thanks for the example, and the explanation. Nice article that you wrote -
the figures alone deserve a deeper dive, I think.
As side note, I found out about profiling on another post and did it - it
appears that my ignorance about how slow dataframes can be was the main
problem. After turning the dfs into lists and matrixes, the improvement was
huge! Wish I had known about this issue ahead of time - it must be
somewhere in the R help books in big red letters, but if it isn't it should
be. Seems most of the discussions about speeding up code talks about
vectorization and avoiding fragmentation.
The below was for five iterations of the simulation, and I imagine was the
reason it was hanging up and stalling out for large simulations:
Original:
$by.total
total.time total.pct self.time self.pct
simulation.unpaired.c 2857.54 99.97 111.90 3.91
[<- 2555.74 89.41 2.78 0.10
[<-.data.frame 2552.96 89.32 2543.84 89.00
sequential.unpaired.c 149.48 5.23 2.66 0.09
unpaired.test.c 92.30 3.23 18.80 0.66
data.frame 70.72 2.47 9.68 0.34
as.data.frame 42.36 1.48 3.74 0.132
only using dataframe as the output:
$by.total
total.time total.pct self.time self.pct
simulation.unpaired.c 109.14 100.00 0.90 0.82
sequential.unpaired.c 92.16 84.44 3.28 3.01
unpaired.test.c 84.48 77.41 21.08 19.31
sd 43.68 40.02 5.42 4.97
Thanks again folks.
On Tue, Feb 12, 2013 at 9:45 PM, Joshua Wiley
<jwiley.psych@gmail.com>wrote:
> Hi Ben,
>
> That is correct about vectorizing----to some speed tests to see the
> effects. They can be quite dramatic (like I have easily seen 300fold
> speedups from doing that). The apply functions are essentially
> loops---they tend to be about the same speed as loops, though slightly
> less code.
>
> Compiler is nice---not it will not help already vectorized code
> (because vectorized code really just means code that is linked to C
> code that uses a compiled loop, but C is much faster than R.
>
> Sure, check out the bootstrapping (midway down) section on this page I
> wrote: http://www.ats.ucla.edu/stat/r/dae/melogit.htm
>
> Cheers,
>
> Josh
>
> On Tue, Feb 12, 2013 at 12:57 PM, Benjamin Caldwell
> <btcaldwell@berkeley.edu> wrote:
> > Joshua,
> >
> > So, to your first suggestion - try to figure out whether some
operation
> > performed on every element of the vector at once could do the same
thing
> -
> > the answer is yes! Where can I see examples of that implemented?
> >
> > e.g. would it just be
> >
> > a <- 1:10000
> > b <- 1:10000
> > c <- 1:10000
> > d <- data.frame(b,c)
> >
> > vectorize<-function(a, d) {
> >
> > f <- d[,1]+d[,2]+a
> > f
> > }
> >
> > out<-vectorize(a=a,d=d)
> > out
> >
> > vs
> >
> > loop<- function(a,d){
> >
> > for(i in 1:length(d[,1])){
> > a[i]<-d[i,1]+d[i,2]+a[i]
> > }
> > a
> > }
> >
> > out.l<-loop(a,d)
> >
> > out.l
> >
> > If so, it's vectorization that's the big advantage, not
something
> mysterious
> > that's happening under the hood with the apply functions?
> >
> > To your second : Compiler - thanks for the tip, that's great to
know!
> >
> > To your last, could you please give an example or point me to one that
> was
> > useful for you? I don't understand that well enough to use it.
> >
> > Thanks!
> >
> > Ben Caldwell
> >
> > Graduate Fellow
> > University of California, Berkeley
> > 130 Mulford Hall #3114
> > Berkeley, CA 94720
> > Office 223 Mulford Hall
> > (510)859-3358
> >
> >
> > On Mon, Feb 11, 2013 at 10:10 PM, Joshua Wiley
<jwiley.psych@gmail.com>
> > wrote:
> >>
> >> Hi Ben,
> >>
> >> I appreciate that the example is reproducible, and I understand
why
> >> you shared the entire project. At the same time, that is an awful
lot
> >> of code for volunteers to go through and try to optimize.
> >>
> >> I would try to think of the structure of your task, see what the
> >> individual pieces are---figure out what can be reused and
optimized.
> >> Look at loops and try to think whether some operation performed on
> >> every element of the vector at once could do the same thing.
> >> Sometimes the next iteration of a loop depends on the previous, so
it
> >> is difficult to vectorize, but often that is not the case.
> >>
> >> Make use of the compiler package, especially cmpfun(). It can
give
> >> fairly substantial speedups, particularly with for loops in R.
> >>
> >> Finally if you want 1000 reps, do you actually need to repeat all
the
> >> steps 1000 times, or could you just simulate 1000x the data? If
the
> >> latter, do the steps once but make more data. If the former, you
> >> ought to be able to parallelize it fairly easily, see the parallel
> >> package.
> >>
> >> Good luck,
> >>
> >> Josh
> >>
> >>
> >> On Mon, Feb 11, 2013 at 9:22 PM, Benjamin Caldwell
> >> <btcaldwell@berkeley.edu> wrote:
> >> > Dear R help;
> >> >
> >> > I'll preface this by saying that the example I've
provided below is
> >> > pretty
> >> > long, turgid, and otherwise a deep dive into a series of
functions I
> >> > wrote
> >> > for a simulation study. It is, however, reproducible and
> self-contained.
> >> >
> >> > I'm trying to do my first simulation study that's
quite big, and so
> I'll
> >> > say that the output of this simulation as I'd like it to
be is 9.375
> >> > million rows (9375 combinations of the inputs * 1000). So,
first
> thing,
> >> > maybe I should try to cut down on the number of dimensions
and do it a
> >> > piece at a time. If that's a conclusion people come to
here, I'll look
> >> > into
> >> > paring down/doing it a chunk at a time.
> >> >
> >> > I'm hoping, though, that this is an opportunity for me to
learn some
> >> > more
> >> > efficient, elegant ways to a. vectorize and b. use the apply
> functions,
> >> > which still seem to elude me when it comes to their use with
more
> >> > complex,
> >> > custom functions that have multiple inputs.
> >> >
> >> > If having looked the below over you can give me suggestions
to help
> make
> >> > this and future code I write run more quickly/efficiently, I
will be
> >> > most grateful, as as this is currently written it would take
what
> looks
> >> > like days to run a thousand simulations of each possible
combination
> of
> >> > variables of interest.
> >> >
> >> > Best
> >> >
> >> > Ben Caldwell
> >> >
> >> > -----------------------------------------------
> >> > #unpaired
> >> > verification.plots<-rnorm(30, 100, 10)
> >> > # writeClipboard(as.character(verification.plots))
> >> >
> >> > unpaired.test<- function(verification.plots, project.n,
project.mean,
> >> > project.sd, allowed.deviation, project.acres, alpha=.05){
> >> >
> >> > verification.plots
<-as.numeric(as.character(verification.plots))
> >> > a <- qnorm(alpha/2)
> >> > d <- alpha*project.mean
> >> >
> >> > # verifier plot number
> >> >
> >> > n<-length(verification.plots)
> >> > verifier.plot.number <- c(1:n)
> >> >
> >> >
> >> > #all plots (verifier and project)
> >> >
> >> > all.plots.n <- rep(0,n)
> >> > for(i in 1:n){
> >> > all.plots.n[i] <- project.n + verifier.plot.number[i]
> >> > }
> >> >
> >> > #running mean
> >> > X_bar <- rep(1,n)
> >> >
> >> > for(i in 1:n){
> >> > X_bar[i]<- mean(verification.plots[1:i])
> >> > }
> >> > #running sd
> >> > sd2 <- NULL
> >> >
> >> > for(i in 2:n){
> >> > sd2[i]<-sd(verification.plots[1:i])
> >> > }
> >> > #Tn
> >> > Tn<-NULL
> >> >
> >> > for(i in 2:n){
> >> > Tn[i] <- project.mean-X_bar[i]
> >> > }
> >> > #n_Threshold
> >> > n_thresh<-NULL
> >> >
> >> > for(i in 2:n){
> >> > n_thresh[i] <- (((a^2)/(d^2))*((project.sd^2)+(sd2[i]^2)))
> >> > }
> >> > #Upper
> >> > upper <- NULL
> >> > for (i in 2:n){
> >> > upper[i] <- Tn[i]-d
> >> > }
> >> > #Lower
> >> > lower<- NULL
> >> > for(i in 2:n){
> >> > lower[i] <- Tn[i]+d
> >> > }
> >> >
> >> > #Success.fail
> >> > success.fail <- NULL
> >> >
> >> >
> >> >
> >> > success.fail.num <- rep(0,n)
> >> >
> >> >
>
if(abs(verification.plots[1]-project.mean)<(allowed.deviation*project.mean)){
> >> > success.fail[1] <- "Pass"
> >> > success.fail.num[1] <- 1
> >> > } else {
> >> > success.fail[1] <- "Fail"
> >> > success.fail.num[1] <-0
> >> > }
> >> > for(i in 2:n){
> >> > if(all.plots.n[i]>=n_thresh[i]){
> >> > if(upper[i] <= 0){
> >> > if(lower[i] >= 0){
> >> > success.fail[i]<-"Pass"
> >> > success.fail.num[i]<-1
> >> > } else {
> >> > success.fail[i] <- "Fail"
> >> > success.fail.num[i] <-0}
> >> > } else {
> >> > success.fail[i] <- "Fail"
> >> > success.fail.num[i] <- 0
> >> > }
> >> >
> >> > } else {
> >> > success.fail[i] <- "Inconclusive"
> >> > success.fail.num[i] <- 0
> >> > }
> >> > }
> >> >
> >> > out.unpaired.test <- data.frame(success.fail,
success.fail.num)
> >> > }
> >> >
> >> >
> >> >
> out.unpaired.test<-unpaired.test(verification.plots=verification.plots,
> >> > project.n=30, project.mean=100, project.sd=10,
allowed.deviation=.05,
> >> > project.acres=99)
> >> > out.unpaired.test
> >> > #
> >> > sequential.unpaired<- function(number.strata, project.n,
project.mean,
> >> > project.sd, verification.plots,allowed.deviation,
project.acres,
> >> > min.plots="default", alpha=.05){
> >> >
> >> > if(min.plots=="default"){
> >> > min.plots<-NULL # minimum passing
> >> > if(number.strata == 1 & project.acres <= 100){
> >> > min.plots = 8
> >> > } else if(number.strata == 1 & project.acres > 100
& project.acres <
> >> > 500){
> >> > min.plots = 12
> >> > } else if(number.strata == 1 & project.acres > 500
& project.acres <
> >> > 5000){
> >> > min.plots = 16
> >> > } else if(number.strata == 1 & project.acres >
5000 &
> project.acres
> >> > <
> >> > 10000){
> >> > min.plots = 20
> >> > } else if(number.strata == 1 & project.acres > 10000){
> >> > min.plots = 24
> >> > } else if(number.strata == 2 & project.acres <
100){
> >> > min.plots = 4
> >> > } else if(number.strata == 2 & project.acres > 100
& project.acres <
> >> > 500){
> >> > min.plots = 6
> >> > } else if(number.strata == 2 & project.acres > 501
& project.acres <
> >> > 5000){
> >> > min.plots = 8
> >> > } else if(number.strata == 2 & project.acres > 5001
& project.acres <
> >> > 10000){
> >> > min.plots = 10
> >> > } else if(number.strata == 2 & project.acres >
10000){
> >> > min.plots = 12
> >> > } else if(number.strata == 3 & project.acres < 100){
> >> > min.plots = 2
> >> > } else if(number.strata == 3 & project.acres > 100
&
> project.acres <
> >> > 500){
> >> > min.plots = 3
> >> > } else if(number.strata == 3 & project.acres > 501
& project.acres <
> >> > 5000){
> >> > min.plots = 4
> >> > } else if(number.strata == 3 & project.acres > 5001
& project.acres <
> >> > 10000){
> >> > min.plots = 5
> >> > } else if(number.strata == 3 & project.acres >
10000){
> >> > min.plots = 6
> >> > } else {min.plots=NULL}
> >> > } else (min.plots = min.plots)
> >> >
> >> >
> >> > if(number.strata > 3){print("max three strata")}
> >> > if(number.strata > 3){break}
> >> >
> >> >
> >> > p <- length(verification.plots)
> >> > mp <- min.plots
> >> > run <- unpaired.test(verification.plots, project.n,
project.mean,
> >> > project.sd,
> >> > allowed.deviation, project.acres, alpha=.05)
> >> > number.passing.plots <- function(verification.plots,
> success.fail.num){
> >> > n<-length(verification.plots)
> >> > number.passing<-rep(0,n)
> >> > number.passing[1]<-ifelse(sum(success.fail.num[1]>
mp),1,0)
> >> > for(i in 2:n){
> >> > if(success.fail.num[i] == 1){
> >> > v <- i-1
> >> > number.passing[i]<-number.passing[v]+1
> >> > } else{number.passing[i] <- 0}
> >> > }
> >> > number.passing
> >> > }
> >> >
> >> >
>
number.passing<-number.passing.plots(verification.plots=verification.plots,
> >> > success.fail.num=run$success.fail.num)
> >> > sucessful.unsucessful<-rep("blank",p)
> >> > one.zero <- rep(0,p)
> >> > result <- "blank"
> >> > resultL <- 0
> >> > n <-length(verification.plots)
> >> > for(i in 1:n){
> >> > if(number.passing[i] >= mp) {
> >> > sucessful.unsucessful[i] <- "successful"
> >> > one.zero[i] <- 1
> >> > } else {
> >> > sucessful.unsucessful[i]<- "unsuccessful"
> >> > one.zero[i] <- 0
> >> > }
> >> > }
> >> >
> >> > if(sum(one.zero) > 0 ) {
> >> > result <- "successful"
> >> > resultL <- 1
> >> > }
> >> >
> >> > plots.success <- function(one.zero){
> >> >
> >> > plots.to.success<-NULL
> >> >
> >> > for(i in 1:n){
> >> > if(sum(one.zero)==0){plots.to.success= NA
> >> > } else
if(one.zero[i]==1){plots.to.success<-append(plots.to.success,
> >> > i)}
> >> > }
> >> > plots.to.success<-min(plots.to.success)
> >> > plots.to.success
> >> > }
> >> >
> >> > plots.to.success <- plots.success(one.zero=one.zero)
> >> >
> >> > complete <-data.frame (min.plots, number.passing,
> sucessful.unsucessful,
> >> > one.zero)
> >> > collapsed <- data.frame(result, resultL, plots.to.success)
> >> > out <- c(as.list(complete),as.list(collapsed))
> >> > }
> >> >
> >> >
> >> > unpaired.out<-sequential.unpaired(number.strata=2,
> >> > verification.plots=verification.plots, project.n=15,
project.mean=100,
> >> > project.sd=10, allowed.deviation=.05, project.acres=99,
> >> > min.plots="default")
> >> >
> >> > str(unpaired.out)
> >> > #unpaired.out[[7]][1]
> >> >
> >> > ##
> >> >
> >> > simulation.unpaired <- function(reps,
project.acreage.range = c(99,
> >> > 110,510,5100,11000), project.mean, project.n.min,
project.n.max,
> >> > project.sd.min, project.sd.max, verification.mean.min,
> >> > verification.mean.max, number.verification.plots.min,
> >> > number.verification.plots.max, allowed.deviation=.1,
alpha=.05,
> >> > length.out
> >> > = 5, min.plots="default") {
> >> >
> >> > number.strata.range<-c(1:3) # set by CAR
> >> >
> >> > project.n.range <- seq(project.n.min, project.n.max,
> >> > length.out=length.out)
> >> >
> >> > project.sd.range <- seq(project.sd.min, project.sd.max,
> >> > length.out=length.out) # assume verification sd is the same
as the
> >> > project
> >> > sd to simplify - seems a reasonable simpification
> >> >
> >> > number.verification.plots<-
seq(number.verification.plots.min,
> >> > number.verification.plots.max, length.out=length.out)
> >> >
> >> > verification.range <- seq(verification.mean.min,
> verification.mean.max,
> >> > length.out=length.out)
> >> >
> >> > permut.grid<-expand.grid(number.strata.range,
project.n.range,
> >> > project.acreage.range, project.mean, project.sd.range,
> >> > number.verification.plots, verification.range,
allowed.deviation) #
> >> > create
> >> > a matrix with all combinations of the supplied vectors
> >> >
> >> > #assign names to the colums of the grid of combinations
> >> > names.permut<-c("number.strata",
"project.n.plots", "project.acreage",
> >> > "project.mean", "project.sd",
"number.verification.plots",
> >> > "verification.mean", "allowed.deviation")
> >> >
> >> > names(permut.grid)<-names.permut # done
> >> >
> >> > combinations<-length(permut.grid[,1])
> >> >
> >> > size <-reps*combinations #need to know the size of the
master matrix,
> >> > which
> >> > is the the number of replications * each combination of the
supplied
> >> > factors
> >> >
> >> > # we want a df from which to read all the data into the
simulation,
> and
> >> > record the results
> >> > permut.col<-ncol(permut.grid)
> >> > col.base<-ncol(permut.grid)+2
> >> > base <- matrix(nrow=size, ncol=col.base)
> >> > base <-data.frame(base)
> >> >
> >> > # supply the names
> >> > names.base<-c("number.strata",
"project.n.plots", "project.acreage",
> >> > "project.mean", "project.sd",
"number.verification.plots",
> >> > "verification.mean",
"allowed.deviation","success.fail",
> >> > "plots.to.success")
> >> >
> >> > names(base)<-names.base
> >> >
> >> > # need to create index vectors for the base, master df
> >> > ends <- seq(reps+1, size+1, by=reps)
> >> > begins <- ends-reps
> >> > index <- cbind(begins, ends-1)
> >> > #done
> >> >
> >> > # next, need to assign the first 6 columns and number of rows
= to the
> >> > number of reps in the simulation to be the given row in the
> permut.grid
> >> > matrix
> >> >
> >> > pb <- winProgressBar(title="Create base, big loop 1
of 2", label="0%
> >> > done",
> >> > min=0, max=100, initial=0)
> >> >
> >> > for (i in 1:combinations) {
> >> >
> >> > base[index[i,1]:index[i,2],1:permut.col] <-
permut.grid[i,]
> >> > #progress bar
> >> > info <- sprintf("%d%% done",
round((i/combinations)*100))
> >> > setWinProgressBar(pb, (i/combinations)*100, label=info)
> >> > }
> >> >
> >> > close(pb)
> >> >
> >> > # now, simply feed the values replicated the number of times
we want
> to
> >> > run
> >> > the simulation into the sequential.unpaired function, and
assign the
> >> > values
> >> > to the appropriate columns
> >> >
> >> > out.index1<-ncol(permut.grid)+1
> >> > out.index2<-ncol(permut.grid)+2
> >> >
> >> > #progress bar
> >> > pb <- winProgressBar(title="fill values, big loop 2
of 2", label="0%
> >> > done",
> >> > min=0, max=100, initial=0)
> >> >
> >> > for (i in 1:size){
> >> >
> >> > scalar.base <- base[i,]
> >> > verification.plots <-
rnorm(scalar.base$number.verification.plots,
> >> > scalar.base$verification.mean, scalar.base$project.sd)
> >> > result<- sequential.unpaired(scalar.base$number.strata,
> >> > scalar.base$project.n.plots, scalar.base$project.mean,
scalar.base$
> >> > project.sd, verification.plots,
scalar.base$allowed.deviation,
> >> > scalar.base$project.acreage, min.plots='default',
alpha)
> >> >
> >> > base[i,out.index1] <- result[[6]][1]
> >> > base[i,out.index2] <- result[[7]][1]
> >> > info <- sprintf("%d%% done",
round((i/size)*100))
> >> > setWinProgressBar(pb, (i/size)*100, label=info)
> >> > }
> >> >
> >> > close(pb)
> >> > #return the results
> >> > return(base)
> >> >
> >> > }
> >> >
> >> > # I would like reps to = 1000, but that takes a really long
time right
> >> > now
> >> > test5 <- simulation.unpaired(reps=5, project.acreage.range
= c(99,
> >> > 110,510,5100,11000), project.mean=100, project.n.min=10,
> >> > project.n.max=100,
> >> > project.sd.min=.01, project.sd.max=.2,
verification.mean.min=100,
> >> > verification.mean.max=130, number.verification.plots.min=10,
> >> > number.verification.plots.max=50, length.out = 5)
> >> >
> >> > [[alternative HTML version deleted]]
> >> >
> >> > ______________________________________________
> >> > R-help@r-project.org mailing list
> >> > https://stat.ethz.ch/mailman/listinfo/r-help
> >> > PLEASE do read the posting guide
> >> > http://www.R-project.org/posting-guide.html
> >> > and provide commented, minimal, self-contained, reproducible
code.
> >>
> >>
> >>
> >> --
> >> Joshua Wiley
> >> Ph.D. Student, Health Psychology
> >> Programmer Analyst II, Statistical Consulting Group
> >> University of California, Los Angeles
> >> https://joshuawiley.com/
> >
> >
>
>
>
> --
> Joshua Wiley
> Ph.D. Student, Health Psychology
> Programmer Analyst II, Statistical Consulting Group
> University of California, Los Angeles
> https://joshuawiley.com/
>
[[alternative HTML version deleted]]