Hi R-Help! I'm hoping that some of you may give me some tips that could make my code more efficient. More precisely, I would like to make the answer to my stakoverflow <http://stackoverflow.com/questions/31137940/randomly-assign-teachers-to-classrooms-imposing-restrictions> question more efficient. This is the code: library(dplyr) library(randomNames) library(geosphere) set.seed(7142015)# Define Parameters n.Schools <- 20 first.grade<-3 last.grade<-5 n.Grades <-last.grade-first.grade+1 n.Classrooms <- 20 # THIS IS WHAT I WANTED TO BE ABLE TO CHANGE n.Teachers <- (n.Schools*n.Grades*n.Classrooms)/2 #Two classrooms per teacher # Define Random names function: gen.names <- function(n, which.names = "both", name.order = "last.first"){ names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order)) need <- n - length(names) while(need>0){ names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names)) need <- n - length(names) } return(names)} # Generate n.Schools names gen.schools <- function(n.schools) { School.ID <- paste0(gen.names(n = n.schools, which.names = "last"), ' School') School.long <- rnorm(n = n.schools, mean = 21.7672, sd = 0.025) School.lat <- rnorm(n = n.schools, mean = 58.8471, sd = 0.025) School.RE <- rnorm(n = n.schools, mean = 0, sd = 1) Schools <- data.frame(School.ID, School.lat, School.long, School.RE) %>% mutate(School.ID = as.character(School.ID)) %>% rowwise() %>% mutate (School.distance = distHaversine( p1 = c(School.long, School.lat), p2 = c(21.7672, 58.8471), r = 3961 )) return(Schools)} Schools <- gen.schools(n.schools = n.Schools) # Generate Grades Grades <- c(first.grade:last.grade) # Generate n.Classrooms Classrooms <- LETTERS[1:n.Classrooms] # Group schools and grades SchGr <- outer(paste0(Schools$School.ID, '-'), paste0(Grades, '-'), FUN="paste")#head(SchGr) # Group SchGr and Classrooms SchGrClss <- outer(SchGr, paste0(Classrooms, '-'), FUN="paste")#head(SchGrClss) # These are the combination of School-Grades-Classroom SchGrClssTmp <- as.matrix(SchGrClss, ncol=1, nrow=length(SchGrClss) ) SchGrClssEnd <- as.data.frame(SchGrClssTmp) # Assign n.Teachers (2 classroom in a given school-grade) Allpairs <- as.data.frame(t(combn(SchGrClssTmp, 2))) AllpairsTmp <- paste(Allpairs$V1, Allpairs$V2, sep=" ") library(stringr) separoPairs <- as.data.frame(str_split(string = AllpairsTmp, pattern = "-")) separoPairs <- as.data.frame(t(separoPairs)) row.names(separoPairs) <- NULL separoPairs <- separoPairs %>% select(-V7) %>% #Drops empty column mutate(V1=as.character(V1), V4=as.character(V4), V2=as.numeric(V2), V5=as.numeric(V5)) %>% mutate(V4 = trimws(V4, which = "both")) separoPairs[120,]$V4#Only the rows with V1=V4 and V2=V5 are valid validPairs <- separoPairs %>% filter(V1==V4 & V2==V5) %>% select(V1, V2, V3, V6) # Generate n.Teachers gen.teachers <- function(n.teachers){ Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first") Teacher.exp <- runif(n = n.teachers, min = 1, max = 30) Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers) Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1) Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE) return(Teachers)} Teachers <- gen.teachers(n.teachers = n.Teachers) %>% mutate(Teacher.ID = as.character(Teacher.ID)) # Randomly assign n.Teachers teachers to the "ValidPairs" TmpAssignments <- validPairs[sample(1:nrow(validPairs), n.Teachers), ] Assignments <- cbind.data.frame(Teachers$Teacher.ID, TmpAssignments) names(Assignments) <- c("Teacher.ID", "School.ID", "Grade", "Class_1", "Class_2") # Tidy Data library(tidyr) TeacherClassroom <- Assignments %>% gather(x, Classroom, Class_1,Class_2) %>% select(-x) %>% mutate(Teacher.ID = as.character(Teacher.ID)) # Merge DF_Classrooms <- TeacherClassroom %>% full_join(Teachers, by="Teacher.ID") %>% full_join(Schools, by="School.ID") rm(list=setdiff(ls(), "DF_Classrooms")) # Clean the work space! *I want to end up with the same* 'DF_Classrooms *data frame* but getting there in a more efficient way. In particular, when is use n.Classrooms <-4 the code run fast, but *if I increase it to something like 20 it is painfully slow.* Thanks!!! [[alternative HTML version deleted]]
Hi Ignacio, If I am reading your code correctly then the top while loop is essentially seeking to select a random set of names from the original set, then using unique to reduce it down, you then iterate until you have built your quota. Ultimately this results in a very inefficient attempt at sampling without replacement. Why not just sample without replacement rather than loop iteratively and use unique? Or if the set of possible names are short enough why not just randomize it and then pull the first n items off? Best, Collin. On Wed, Jul 15, 2015 at 11:15 PM, Ignacio Martinez <ignacio82 at gmail.com> wrote:> Hi R-Help! > > I'm hoping that some of you may give me some tips that could make my code > more efficient. More precisely, I would like to make the answer to my > stakoverflow > < > http://stackoverflow.com/questions/31137940/randomly-assign-teachers-to-classrooms-imposing-restrictions > > > question more efficient. > > This is the code: > > library(dplyr) > library(randomNames) > library(geosphere) > set.seed(7142015)# Define Parameters > n.Schools <- 20 > first.grade<-3 > last.grade<-5 > n.Grades <-last.grade-first.grade+1 > n.Classrooms <- 20 # THIS IS WHAT I WANTED TO BE ABLE TO CHANGE > n.Teachers <- (n.Schools*n.Grades*n.Classrooms)/2 #Two classrooms per > teacher > # Define Random names function: > gen.names <- function(n, which.names = "both", name.order = "last.first"){ > names <- unique(randomNames(n=n, which.names = which.names, > name.order = name.order)) > need <- n - length(names) > while(need>0){ > names <- unique(c(randomNames(n=need, which.names = which.names, > name.order = name.order), names)) > need <- n - length(names) > } > return(names)} > # Generate n.Schools names > gen.schools <- function(n.schools) { > School.ID <- > paste0(gen.names(n = n.schools, which.names = "last"), ' School') > School.long <- rnorm(n = n.schools, mean = 21.7672, sd = 0.025) > School.lat <- rnorm(n = n.schools, mean = 58.8471, sd = 0.025) > School.RE <- rnorm(n = n.schools, mean = 0, sd = 1) > Schools <- > data.frame(School.ID, School.lat, School.long, School.RE) %>% > mutate(School.ID = as.character(School.ID)) %>% > rowwise() %>% mutate (School.distance = distHaversine( > p1 = c(School.long, School.lat), > p2 = c(21.7672, 58.8471), r = 3961 > )) > return(Schools)} > > Schools <- gen.schools(n.schools = n.Schools) > # Generate Grades > Grades <- c(first.grade:last.grade) > # Generate n.Classrooms > > Classrooms <- LETTERS[1:n.Classrooms] > # Group schools and grades > > SchGr <- outer(paste0(Schools$School.ID, '-'), paste0(Grades, '-'), > FUN="paste")#head(SchGr) > # Group SchGr and Classrooms > > SchGrClss <- outer(SchGr, paste0(Classrooms, '-'), > FUN="paste")#head(SchGrClss) > # These are the combination of School-Grades-Classroom > SchGrClssTmp <- as.matrix(SchGrClss, ncol=1, nrow=length(SchGrClss) ) > SchGrClssEnd <- as.data.frame(SchGrClssTmp) > # Assign n.Teachers (2 classroom in a given school-grade) > Allpairs <- as.data.frame(t(combn(SchGrClssTmp, 2))) > AllpairsTmp <- paste(Allpairs$V1, Allpairs$V2, sep=" ") > > library(stringr) > separoPairs <- as.data.frame(str_split(string = AllpairsTmp, pattern > "-")) > separoPairs <- as.data.frame(t(separoPairs)) > row.names(separoPairs) <- NULL > separoPairs <- separoPairs %>% select(-V7) %>% #Drops empty column > mutate(V1=as.character(V1), V4=as.character(V4), V2=as.numeric(V2), > V5=as.numeric(V5)) %>% mutate(V4 = trimws(V4, which = "both")) > > separoPairs[120,]$V4#Only the rows with V1=V4 and V2=V5 are valid > validPairs <- separoPairs %>% filter(V1==V4 & V2==V5) %>% select(V1, V2, > V3, V6) > # Generate n.Teachers > > gen.teachers <- function(n.teachers){ > Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first") > Teacher.exp <- runif(n = n.teachers, min = 1, max = 30) > Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), > size = n.teachers) > Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1) > Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, > Teacher.RE) > return(Teachers)} > Teachers <- gen.teachers(n.teachers = n.Teachers) %>% > mutate(Teacher.ID = as.character(Teacher.ID)) > # Randomly assign n.Teachers teachers to the "ValidPairs" > TmpAssignments <- validPairs[sample(1:nrow(validPairs), n.Teachers), ] > Assignments <- cbind.data.frame(Teachers$Teacher.ID, TmpAssignments) > names(Assignments) <- c("Teacher.ID", "School.ID", "Grade", "Class_1", > "Class_2") > # Tidy Data > library(tidyr) > TeacherClassroom <- Assignments %>% > gather(x, Classroom, Class_1,Class_2) %>% > select(-x) %>% > mutate(Teacher.ID = as.character(Teacher.ID)) > # Merge > DF_Classrooms <- TeacherClassroom %>% full_join(Teachers, > by="Teacher.ID") %>% full_join(Schools, by="School.ID") > rm(list=setdiff(ls(), "DF_Classrooms")) # Clean the work space! > > *I want to end up with the same* 'DF_Classrooms *data frame* but getting > there in a more efficient way. In particular, when is use n.Classrooms <-4 > the > code run fast, but *if I increase it to something like 20 it is painfully > slow.* > > Thanks!!! > > [[alternative HTML version deleted]] > > ______________________________________________ > R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide > http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code. >[[alternative HTML version deleted]]
Hi Collin, The objective of the gen.names function is to generate N *unique *random names, where N is a *large *number. In my computer `gen.names(n = 50000)` takes under a second, so is probably not the root problem in my code. That said, I would love to improve it. I'm not exactly sure how you propose to change it using sample. What is the object that I would be sampling? I would love to run a little benchmark to compare my version with yours. What really takes a long time to run is: separoPairs <- as.data.frame(str_split(string = AllpairsTmp, pattern "-")) So that and the chunk of code before that is probably where I would get big gains in speed. Sadly, I have no clue how to do it differently Thanks a lot for the help!! Ignacio On Wed, Jul 15, 2015 at 11:34 PM Collin Lynch <cflynch at ncsu.edu> wrote:> Hi Ignacio, If I am reading your code correctly then the top while loop is > essentially seeking to select a random set of names from the original set, > then using unique to reduce it down, you then iterate until you have built > your quota. Ultimately this results in a very inefficient attempt at > sampling without replacement. Why not just sample without replacement > rather than loop iteratively and use unique? Or if the set of possible > names are short enough why not just randomize it and then pull the first n > items off? > > Best, > Collin. > > On Wed, Jul 15, 2015 at 11:15 PM, Ignacio Martinez <ignacio82 at gmail.com> > wrote: > >> Hi R-Help! >> >> I'm hoping that some of you may give me some tips that could make my code >> > more efficient. More precisely, I would like to make the answer to my >> stakoverflow >> < >> http://stackoverflow.com/questions/31137940/randomly-assign-teachers-to-classrooms-imposing-restrictions >> > > > >> question more efficient. >> >> This is the code: >> >> library(dplyr) >> library(randomNames) >> library(geosphere) >> > set.seed(7142015)# Define Parameters > > >> n.Schools <- 20 >> first.grade<-3 >> last.grade<-5 >> n.Grades <-last.grade-first.grade+1 >> n.Classrooms <- 20 # THIS IS WHAT I WANTED TO BE ABLE TO CHANGE >> n.Teachers <- (n.Schools*n.Grades*n.Classrooms)/2 #Two classrooms per >> teacher >> # Define Random names function: >> gen.names <- function(n, which.names = "both", name.order = "last.first"){ >> names <- unique(randomNames(n=n, which.names = which.names, >> name.order = name.order)) >> need <- n - length(names) >> while(need>0){ >> names <- unique(c(randomNames(n=need, which.names = which.names, >> name.order = name.order), names)) >> need <- n - length(names) >> } >> return(names)} >> # Generate n.Schools names >> gen.schools <- function(n.schools) { >> School.ID <- >> paste0(gen.names(n = n.schools, which.names = "last"), ' School') >> School.long <- rnorm(n = n.schools, mean = 21.7672, sd = 0.025) >> School.lat <- rnorm(n = n.schools, mean = 58.8471, sd = 0.025) >> School.RE <- rnorm(n = n.schools, mean = 0, sd = 1) >> Schools <- >> data.frame(School.ID, School.lat, School.long, School.RE) %>% >> mutate(School.ID = as.character(School.ID)) %>% >> rowwise() %>% mutate (School.distance = distHaversine( >> p1 = c(School.long, School.lat), >> p2 = c(21.7672, 58.8471), r = 3961 >> )) >> return(Schools)} >> >> Schools <- gen.schools(n.schools = n.Schools) >> # Generate Grades >> Grades <- c(first.grade:last.grade) >> # Generate n.Classrooms >> >> Classrooms <- LETTERS[1:n.Classrooms] >> # Group schools and grades >> >> SchGr <- outer(paste0(Schools$School.ID, '-'), paste0(Grades, '-'), >> FUN="paste")#head(SchGr) >> # Group SchGr and Classrooms >> >> SchGrClss <- outer(SchGr, paste0(Classrooms, '-'), >> FUN="paste")#head(SchGrClss) >> # These are the combination of School-Grades-Classroom >> SchGrClssTmp <- as.matrix(SchGrClss, ncol=1, nrow=length(SchGrClss) ) >> SchGrClssEnd <- as.data.frame(SchGrClssTmp) >> # Assign n.Teachers (2 classroom in a given school-grade) >> Allpairs <- as.data.frame(t(combn(SchGrClssTmp, 2))) >> AllpairsTmp <- paste(Allpairs$V1, Allpairs$V2, sep=" ") >> >> library(stringr) >> separoPairs <- as.data.frame(str_split(string = AllpairsTmp, pattern >> "-")) >> separoPairs <- as.data.frame(t(separoPairs)) >> row.names(separoPairs) <- NULL >> separoPairs <- separoPairs %>% select(-V7) %>% #Drops empty column >> mutate(V1=as.character(V1), V4=as.character(V4), V2=as.numeric(V2), >> V5=as.numeric(V5)) %>% mutate(V4 = trimws(V4, which = "both")) >> >> separoPairs[120,]$V4#Only the rows with V1=V4 and V2=V5 are valid > > >> validPairs <- separoPairs %>% filter(V1==V4 & V2==V5) %>% select(V1, V2, >> V3, V6) >> # Generate n.Teachers >> >> gen.teachers <- function(n.teachers){ >> Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first") >> Teacher.exp <- runif(n = n.teachers, min = 1, max = 30) >> Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), >> size = n.teachers) >> Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1) >> Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, >> Teacher.RE) >> return(Teachers)} >> Teachers <- gen.teachers(n.teachers = n.Teachers) %>% >> mutate(Teacher.ID = as.character(Teacher.ID)) >> # Randomly assign n.Teachers teachers to the "ValidPairs" >> TmpAssignments <- validPairs[sample(1:nrow(validPairs), n.Teachers), ] >> Assignments <- cbind.data.frame(Teachers$Teacher.ID, TmpAssignments) >> names(Assignments) <- c("Teacher.ID", "School.ID", "Grade", "Class_1", >> "Class_2") >> # Tidy Data >> library(tidyr) >> TeacherClassroom <- Assignments %>% >> gather(x, Classroom, Class_1,Class_2) %>% >> select(-x) %>% >> mutate(Teacher.ID = as.character(Teacher.ID)) >> # Merge >> DF_Classrooms <- TeacherClassroom %>% full_join(Teachers, >> by="Teacher.ID") %>% full_join(Schools, by="School.ID") >> rm(list=setdiff(ls(), "DF_Classrooms")) # Clean the work space! >> >> *I want to end up with the same* 'DF_Classrooms *data frame* but getting > > >> there in a more efficient way. In particular, when is use n.Classrooms >> <-4 the >> > code run fast, but *if I increase it to something like 20 it is painfully >> slow.* >> >> Thanks!!! >> >> [[alternative HTML version deleted]] >> >> ______________________________________________ >> R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see >> https://stat.ethz.ch/mailman/listinfo/r-help >> PLEASE do read the posting guide >> http://www.R-project.org/posting-guide.html >> and provide commented, minimal, self-contained, reproducible code. >> > >[[alternative HTML version deleted]]