Hi R-Help!
I'm hoping that some of you may give me some tips that could make my code
more efficient. More precisely, I would like to make the answer to my
stakoverflow
<http://stackoverflow.com/questions/31137940/randomly-assign-teachers-to-classrooms-imposing-restrictions>
question more efficient.
This is the code:
library(dplyr)
library(randomNames)
library(geosphere)
set.seed(7142015)# Define Parameters
n.Schools <- 20
first.grade<-3
last.grade<-5
n.Grades <-last.grade-first.grade+1
n.Classrooms <- 20 # THIS IS WHAT I WANTED TO BE ABLE TO CHANGE
n.Teachers <- (n.Schools*n.Grades*n.Classrooms)/2 #Two classrooms per teacher
# Define Random names function:
gen.names <- function(n, which.names = "both", name.order =
"last.first"){
names <- unique(randomNames(n=n, which.names = which.names,
name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names,
name.order = name.order), names))
need <- n - length(names)
}
return(names)}
# Generate n.Schools names
gen.schools <- function(n.schools) {
School.ID <-
paste0(gen.names(n = n.schools, which.names = "last"), '
School')
School.long <- rnorm(n = n.schools, mean = 21.7672, sd = 0.025)
School.lat <- rnorm(n = n.schools, mean = 58.8471, sd = 0.025)
School.RE <- rnorm(n = n.schools, mean = 0, sd = 1)
Schools <-
data.frame(School.ID, School.lat, School.long, School.RE) %>%
mutate(School.ID = as.character(School.ID)) %>%
rowwise() %>% mutate (School.distance = distHaversine(
p1 = c(School.long, School.lat),
p2 = c(21.7672, 58.8471), r = 3961
))
return(Schools)}
Schools <- gen.schools(n.schools = n.Schools)
# Generate Grades
Grades <- c(first.grade:last.grade)
# Generate n.Classrooms
Classrooms <- LETTERS[1:n.Classrooms]
# Group schools and grades
SchGr <- outer(paste0(Schools$School.ID, '-'), paste0(Grades,
'-'),
FUN="paste")#head(SchGr)
# Group SchGr and Classrooms
SchGrClss <- outer(SchGr, paste0(Classrooms, '-'),
FUN="paste")#head(SchGrClss)
# These are the combination of School-Grades-Classroom
SchGrClssTmp <- as.matrix(SchGrClss, ncol=1, nrow=length(SchGrClss) )
SchGrClssEnd <- as.data.frame(SchGrClssTmp)
# Assign n.Teachers (2 classroom in a given school-grade)
Allpairs <- as.data.frame(t(combn(SchGrClssTmp, 2)))
AllpairsTmp <- paste(Allpairs$V1, Allpairs$V2, sep=" ")
library(stringr)
separoPairs <- as.data.frame(str_split(string = AllpairsTmp, pattern =
"-"))
separoPairs <- as.data.frame(t(separoPairs))
row.names(separoPairs) <- NULL
separoPairs <- separoPairs %>% select(-V7) %>% #Drops empty column
mutate(V1=as.character(V1), V4=as.character(V4), V2=as.numeric(V2),
V5=as.numeric(V5)) %>% mutate(V4 = trimws(V4, which = "both"))
separoPairs[120,]$V4#Only the rows with V1=V4 and V2=V5 are valid
validPairs <- separoPairs %>% filter(V1==V4 & V2==V5) %>%
select(V1, V2, V3, V6)
# Generate n.Teachers
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order =
"last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5),
size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE)
return(Teachers)}
Teachers <- gen.teachers(n.teachers = n.Teachers) %>%
mutate(Teacher.ID = as.character(Teacher.ID))
# Randomly assign n.Teachers teachers to the "ValidPairs"
TmpAssignments <- validPairs[sample(1:nrow(validPairs), n.Teachers), ]
Assignments <- cbind.data.frame(Teachers$Teacher.ID, TmpAssignments)
names(Assignments) <- c("Teacher.ID", "School.ID",
"Grade", "Class_1",
"Class_2")
# Tidy Data
library(tidyr)
TeacherClassroom <- Assignments %>%
gather(x, Classroom, Class_1,Class_2) %>%
select(-x) %>%
mutate(Teacher.ID = as.character(Teacher.ID))
# Merge
DF_Classrooms <- TeacherClassroom %>% full_join(Teachers,
by="Teacher.ID") %>% full_join(Schools, by="School.ID")
rm(list=setdiff(ls(), "DF_Classrooms")) # Clean the work space!
*I want to end up with the same* 'DF_Classrooms *data frame* but getting
there in a more efficient way. In particular, when is use n.Classrooms <-4
the
code run fast, but *if I increase it to something like 20 it is painfully
slow.*
Thanks!!!
[[alternative HTML version deleted]]
Hi Ignacio, If I am reading your code correctly then the top while loop is
essentially seeking to select a random set of names from the original set,
then using unique to reduce it down, you then iterate until you have built
your quota. Ultimately this results in a very inefficient attempt at
sampling without replacement. Why not just sample without replacement
rather than loop iteratively and use unique? Or if the set of possible
names are short enough why not just randomize it and then pull the first n
items off?
Best,
Collin.
On Wed, Jul 15, 2015 at 11:15 PM, Ignacio Martinez <ignacio82 at
gmail.com>
wrote:
> Hi R-Help!
>
> I'm hoping that some of you may give me some tips that could make my
code
> more efficient. More precisely, I would like to make the answer to my
> stakoverflow
> <
>
http://stackoverflow.com/questions/31137940/randomly-assign-teachers-to-classrooms-imposing-restrictions
> >
> question more efficient.
>
> This is the code:
>
> library(dplyr)
> library(randomNames)
> library(geosphere)
> set.seed(7142015)# Define Parameters
> n.Schools <- 20
> first.grade<-3
> last.grade<-5
> n.Grades <-last.grade-first.grade+1
> n.Classrooms <- 20 # THIS IS WHAT I WANTED TO BE ABLE TO CHANGE
> n.Teachers <- (n.Schools*n.Grades*n.Classrooms)/2 #Two classrooms per
> teacher
> # Define Random names function:
> gen.names <- function(n, which.names = "both", name.order =
"last.first"){
> names <- unique(randomNames(n=n, which.names = which.names,
> name.order = name.order))
> need <- n - length(names)
> while(need>0){
> names <- unique(c(randomNames(n=need, which.names = which.names,
> name.order = name.order), names))
> need <- n - length(names)
> }
> return(names)}
> # Generate n.Schools names
> gen.schools <- function(n.schools) {
> School.ID <-
> paste0(gen.names(n = n.schools, which.names = "last"), '
School')
> School.long <- rnorm(n = n.schools, mean = 21.7672, sd = 0.025)
> School.lat <- rnorm(n = n.schools, mean = 58.8471, sd = 0.025)
> School.RE <- rnorm(n = n.schools, mean = 0, sd = 1)
> Schools <-
> data.frame(School.ID, School.lat, School.long, School.RE) %>%
> mutate(School.ID = as.character(School.ID)) %>%
> rowwise() %>% mutate (School.distance = distHaversine(
> p1 = c(School.long, School.lat),
> p2 = c(21.7672, 58.8471), r = 3961
> ))
> return(Schools)}
>
> Schools <- gen.schools(n.schools = n.Schools)
> # Generate Grades
> Grades <- c(first.grade:last.grade)
> # Generate n.Classrooms
>
> Classrooms <- LETTERS[1:n.Classrooms]
> # Group schools and grades
>
> SchGr <- outer(paste0(Schools$School.ID, '-'), paste0(Grades,
'-'),
> FUN="paste")#head(SchGr)
> # Group SchGr and Classrooms
>
> SchGrClss <- outer(SchGr, paste0(Classrooms, '-'),
> FUN="paste")#head(SchGrClss)
> # These are the combination of School-Grades-Classroom
> SchGrClssTmp <- as.matrix(SchGrClss, ncol=1, nrow=length(SchGrClss) )
> SchGrClssEnd <- as.data.frame(SchGrClssTmp)
> # Assign n.Teachers (2 classroom in a given school-grade)
> Allpairs <- as.data.frame(t(combn(SchGrClssTmp, 2)))
> AllpairsTmp <- paste(Allpairs$V1, Allpairs$V2, sep=" ")
>
> library(stringr)
> separoPairs <- as.data.frame(str_split(string = AllpairsTmp, pattern
> "-"))
> separoPairs <- as.data.frame(t(separoPairs))
> row.names(separoPairs) <- NULL
> separoPairs <- separoPairs %>% select(-V7) %>% #Drops empty
column
> mutate(V1=as.character(V1), V4=as.character(V4), V2=as.numeric(V2),
> V5=as.numeric(V5)) %>% mutate(V4 = trimws(V4, which = "both"))
>
> separoPairs[120,]$V4#Only the rows with V1=V4 and V2=V5 are valid
> validPairs <- separoPairs %>% filter(V1==V4 & V2==V5) %>%
select(V1, V2,
> V3, V6)
> # Generate n.Teachers
>
> gen.teachers <- function(n.teachers){
> Teacher.ID <- gen.names(n = n.teachers, name.order =
"last.first")
> Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
> Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5),
> size = n.teachers)
> Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
> Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other,
> Teacher.RE)
> return(Teachers)}
> Teachers <- gen.teachers(n.teachers = n.Teachers) %>%
> mutate(Teacher.ID = as.character(Teacher.ID))
> # Randomly assign n.Teachers teachers to the "ValidPairs"
> TmpAssignments <- validPairs[sample(1:nrow(validPairs), n.Teachers), ]
> Assignments <- cbind.data.frame(Teachers$Teacher.ID, TmpAssignments)
> names(Assignments) <- c("Teacher.ID", "School.ID",
"Grade", "Class_1",
> "Class_2")
> # Tidy Data
> library(tidyr)
> TeacherClassroom <- Assignments %>%
> gather(x, Classroom, Class_1,Class_2) %>%
> select(-x) %>%
> mutate(Teacher.ID = as.character(Teacher.ID))
> # Merge
> DF_Classrooms <- TeacherClassroom %>% full_join(Teachers,
> by="Teacher.ID") %>% full_join(Schools,
by="School.ID")
> rm(list=setdiff(ls(), "DF_Classrooms")) # Clean the work space!
>
> *I want to end up with the same* 'DF_Classrooms *data frame* but
getting
> there in a more efficient way. In particular, when is use n.Classrooms
<-4
> the
> code run fast, but *if I increase it to something like 20 it is painfully
> slow.*
>
> Thanks!!!
>
> [[alternative HTML version deleted]]
>
> ______________________________________________
> R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide
> http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>
[[alternative HTML version deleted]]
Hi Collin,
The objective of the gen.names function is to generate N *unique *random
names, where N is a *large *number. In my computer `gen.names(n = 50000)`
takes under a second, so is probably not the root problem in my code. That
said, I would love to improve it. I'm not exactly sure how you propose to
change it using sample. What is the object that I would be sampling? I
would love to run a little benchmark to compare my version with yours.
What really takes a long time to run is:
separoPairs <- as.data.frame(str_split(string = AllpairsTmp, pattern
"-"))
So that and the chunk of code before that is probably where I would get big
gains in speed. Sadly, I have no clue how to do it differently
Thanks a lot for the help!!
Ignacio
On Wed, Jul 15, 2015 at 11:34 PM Collin Lynch <cflynch at ncsu.edu> wrote:
> Hi Ignacio, If I am reading your code correctly then the top while loop is
> essentially seeking to select a random set of names from the original set,
> then using unique to reduce it down, you then iterate until you have built
> your quota. Ultimately this results in a very inefficient attempt at
> sampling without replacement. Why not just sample without replacement
> rather than loop iteratively and use unique? Or if the set of possible
> names are short enough why not just randomize it and then pull the first n
> items off?
>
> Best,
> Collin.
>
> On Wed, Jul 15, 2015 at 11:15 PM, Ignacio Martinez <ignacio82 at
gmail.com>
> wrote:
>
>> Hi R-Help!
>>
>> I'm hoping that some of you may give me some tips that could make
my code
>>
> more efficient. More precisely, I would like to make the answer to my
>> stakoverflow
>> <
>>
http://stackoverflow.com/questions/31137940/randomly-assign-teachers-to-classrooms-imposing-restrictions
>> >
>
>
>> question more efficient.
>>
>> This is the code:
>>
>> library(dplyr)
>> library(randomNames)
>> library(geosphere)
>>
> set.seed(7142015)# Define Parameters
>
>
>> n.Schools <- 20
>> first.grade<-3
>> last.grade<-5
>> n.Grades <-last.grade-first.grade+1
>> n.Classrooms <- 20 # THIS IS WHAT I WANTED TO BE ABLE TO CHANGE
>> n.Teachers <- (n.Schools*n.Grades*n.Classrooms)/2 #Two classrooms
per
>> teacher
>> # Define Random names function:
>> gen.names <- function(n, which.names = "both", name.order
= "last.first"){
>> names <- unique(randomNames(n=n, which.names = which.names,
>> name.order = name.order))
>> need <- n - length(names)
>> while(need>0){
>> names <- unique(c(randomNames(n=need, which.names = which.names,
>> name.order = name.order), names))
>> need <- n - length(names)
>> }
>> return(names)}
>> # Generate n.Schools names
>> gen.schools <- function(n.schools) {
>> School.ID <-
>> paste0(gen.names(n = n.schools, which.names = "last"),
' School')
>> School.long <- rnorm(n = n.schools, mean = 21.7672, sd = 0.025)
>> School.lat <- rnorm(n = n.schools, mean = 58.8471, sd = 0.025)
>> School.RE <- rnorm(n = n.schools, mean = 0, sd = 1)
>> Schools <-
>> data.frame(School.ID, School.lat, School.long, School.RE) %>%
>> mutate(School.ID = as.character(School.ID)) %>%
>> rowwise() %>% mutate (School.distance = distHaversine(
>> p1 = c(School.long, School.lat),
>> p2 = c(21.7672, 58.8471), r = 3961
>> ))
>> return(Schools)}
>>
>> Schools <- gen.schools(n.schools = n.Schools)
>> # Generate Grades
>> Grades <- c(first.grade:last.grade)
>> # Generate n.Classrooms
>>
>> Classrooms <- LETTERS[1:n.Classrooms]
>> # Group schools and grades
>>
>> SchGr <- outer(paste0(Schools$School.ID, '-'),
paste0(Grades, '-'),
>> FUN="paste")#head(SchGr)
>> # Group SchGr and Classrooms
>>
>> SchGrClss <- outer(SchGr, paste0(Classrooms, '-'),
>> FUN="paste")#head(SchGrClss)
>> # These are the combination of School-Grades-Classroom
>> SchGrClssTmp <- as.matrix(SchGrClss, ncol=1, nrow=length(SchGrClss)
)
>> SchGrClssEnd <- as.data.frame(SchGrClssTmp)
>> # Assign n.Teachers (2 classroom in a given school-grade)
>> Allpairs <- as.data.frame(t(combn(SchGrClssTmp, 2)))
>> AllpairsTmp <- paste(Allpairs$V1, Allpairs$V2, sep=" ")
>>
>> library(stringr)
>> separoPairs <- as.data.frame(str_split(string = AllpairsTmp, pattern
>> "-"))
>> separoPairs <- as.data.frame(t(separoPairs))
>> row.names(separoPairs) <- NULL
>> separoPairs <- separoPairs %>% select(-V7) %>% #Drops empty
column
>> mutate(V1=as.character(V1), V4=as.character(V4), V2=as.numeric(V2),
>> V5=as.numeric(V5)) %>% mutate(V4 = trimws(V4, which =
"both"))
>>
>> separoPairs[120,]$V4#Only the rows with V1=V4 and V2=V5 are valid
>
>
>> validPairs <- separoPairs %>% filter(V1==V4 & V2==V5) %>%
select(V1, V2,
>> V3, V6)
>> # Generate n.Teachers
>>
>> gen.teachers <- function(n.teachers){
>> Teacher.ID <- gen.names(n = n.teachers, name.order =
"last.first")
>> Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
>> Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5),
>> size = n.teachers)
>> Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
>> Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other,
>> Teacher.RE)
>> return(Teachers)}
>> Teachers <- gen.teachers(n.teachers = n.Teachers) %>%
>> mutate(Teacher.ID = as.character(Teacher.ID))
>> # Randomly assign n.Teachers teachers to the "ValidPairs"
>> TmpAssignments <- validPairs[sample(1:nrow(validPairs), n.Teachers),
]
>> Assignments <- cbind.data.frame(Teachers$Teacher.ID, TmpAssignments)
>> names(Assignments) <- c("Teacher.ID",
"School.ID", "Grade", "Class_1",
>> "Class_2")
>> # Tidy Data
>> library(tidyr)
>> TeacherClassroom <- Assignments %>%
>> gather(x, Classroom, Class_1,Class_2) %>%
>> select(-x) %>%
>> mutate(Teacher.ID = as.character(Teacher.ID))
>> # Merge
>> DF_Classrooms <- TeacherClassroom %>% full_join(Teachers,
>> by="Teacher.ID") %>% full_join(Schools,
by="School.ID")
>> rm(list=setdiff(ls(), "DF_Classrooms")) # Clean the work
space!
>>
>> *I want to end up with the same* 'DF_Classrooms *data frame* but
getting
>
>
>> there in a more efficient way. In particular, when is use n.Classrooms
>> <-4 the
>>
> code run fast, but *if I increase it to something like 20 it is painfully
>> slow.*
>>
>> Thanks!!!
>>
>> [[alternative HTML version deleted]]
>>
>> ______________________________________________
>> R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
>> https://stat.ethz.ch/mailman/listinfo/r-help
>> PLEASE do read the posting guide
>> http://www.R-project.org/posting-guide.html
>> and provide commented, minimal, self-contained, reproducible code.
>>
>
>
[[alternative HTML version deleted]]