Bill
2013-Dec-02 11:24 UTC
[R] why change days of the week from a factor to an ordered factor?
I am reading the code below. It acts on a csv file called dodgers.csv with the following variables.> print(str(dodgers)) # check the structure of the data frame'data.frame': 81 obs. of 12 variables: $ month : Factor w/ 7 levels "APR","AUG","JUL",..: 1 1 1 1 1 1 1 1 1 1 ... $ day : int 10 11 12 13 14 15 23 24 25 27 ... $ attend : int 56000 29729 28328 31601 46549 38359 26376 44014 26345 44807 ... $ day_of_week: Factor w/ 7 levels "Friday","Monday",..: 6 7 5 1 3 4 2 6 7 1 ... $ opponent : Factor w/ 17 levels "Angels","Astros",..: 13 13 13 11 11 11 3 3 3 10 ... $ temp : int 67 58 57 54 57 65 60 63 64 66 ... $ skies : Factor w/ 2 levels "Clear ","Cloudy": 1 2 2 2 2 1 2 2 2 1 ... $ day_night : Factor w/ 2 levels "Day","Night": 1 2 2 2 2 1 2 2 2 2 ... $ cap : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... $ shirt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... $ fireworks : Factor w/ 2 levels "NO","YES": 1 1 1 2 1 1 1 1 1 2 ... $ bobblehead : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... NULL>I don't understand why the author of the code decided to make the factor days_of_week into an ordered factor. Anyone know why this should be done? Thank you. Here is the code: # Predictive Model for Los Angeles Dodgers Promotion and Attendance library(car) # special functions for linear regression library(lattice) # graphics package # read in data and create a data frame called dodgers dodgers <- read.csv("dodgers.csv") print(str(dodgers)) # check the structure of the data frame # define an ordered day-of-week variable # for plots and data summaries dodgers$ordered_day_of_week <- with(data=dodgers, ifelse ((day_of_week == "Monday"),1, ifelse ((day_of_week == "Tuesday"),2, ifelse ((day_of_week == "Wednesday"),3, ifelse ((day_of_week == "Thursday"),4, ifelse ((day_of_week == "Friday"),5, ifelse ((day_of_week == "Saturday"),6,7))))))) dodgers$ordered_day_of_week <- factor(dodgers$ordered_day_of_week, levels=1:7, labels=c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun")) # exploratory data analysis with standard graphics: attendance by day of week with(data=dodgers,plot(ordered_day_of_week, attend/1000, xlab = "Day of Week", ylab = "Attendance (thousands)", col = "violet", las = 1)) # when do the Dodgers use bobblehead promotions with(dodgers, table(bobblehead,ordered_day_of_week)) # bobbleheads on Tuesday # define an ordered month variable # for plots and data summaries dodgers$ordered_month <- with(data=dodgers, ifelse ((month == "APR"),4, ifelse ((month == "MAY"),5, ifelse ((month == "JUN"),6, ifelse ((month == "JUL"),7, ifelse ((month == "AUG"),8, ifelse ((month == "SEP"),9,10))))))) dodgers$ordered_month <- factor(dodgers$ordered_month, levels=4:10, labels = c("April", "May", "June", "July", "Aug", "Sept", "Oct")) # exploratory data analysis with standard R graphics: attendance by month with(data=dodgers,plot(ordered_month,attend/1000, xlab = "Month", ylab = "Attendance (thousands)", col = "light blue", las = 1)) # exploratory data analysis displaying many variables # looking at attendance and conditioning on day/night # the skies and whether or not fireworks are displayed library(lattice) # used for plotting # let us prepare a graphical summary of the dodgers data group.labels <- c("No Fireworks","Fireworks") group.symbols <- c(21,24) group.colors <- c("black","black") group.fill <- c("black","red") xyplot(attend/1000 ~ temp | skies + day_night, data = dodgers, groups = fireworks, pch = group.symbols, aspect = 1, cex = 1.5, col = group.colors, fill = group.fill, layout = c(2, 2), type = c("p","g"), strip=strip.custom(strip.levels=TRUE,strip.names=FALSE, style=1), xlab = "Temperature (Degrees Fahrenheit)", ylab = "Attendance (thousands)", key = list(space = "top", text = list(rev(group.labels),col = rev(group.colors)), points = list(pch = rev(group.symbols), col = rev(group.colors), fill = rev(group.fill)))) # attendance by opponent and day/night game group.labels <- c("Day","Night") group.symbols <- c(1,20) group.symbols.size <- c(2,2.75) bwplot(opponent ~ attend/1000, data = dodgers, groups = day_night, xlab = "Attendance (thousands)", panel = function(x, y, groups, subscripts, ...) {panel.grid(h = (length(levels(dodgers$opponent)) - 1), v = -1) panel.stripplot(x, y, groups = groups, subscripts = subscripts, cex = group.symbols.size, pch = group.symbols, col = "darkblue") }, key = list(space = "top", text = list(group.labels,col = "black"), points = list(pch = group.symbols, cex = group.symbols.size, col = "darkblue"))) # specify a simple model with bobblehead entered last my.model <- {attend ~ ordered_month + ordered_day_of_week + bobblehead} # employ a training-and-test regimen set.seed(1234) # set seed for repeatability of training-and-test split training_test <- c(rep(1,length=trunc((2/3)*nrow(dodgers))), rep(2,length=(nrow(dodgers) - trunc((2/3)*nrow(dodgers))))) dodgers$training_test <- sample(training_test) # random permutation dodgers$training_test <- factor(dodgers$training_test, levels=c(1,2), labels=c("TRAIN","TEST")) dodgers.train <- subset(dodgers, training_test == "TRAIN") print(str(dodgers.train)) # check training data frame dodgers.test <- subset(dodgers, training_test == "TEST") print(str(dodgers.test)) # check test data frame # fit the model to the training set train.model.fit <- lm(my.model, data = dodgers.train) # obtain predictions from the training set dodgers.train$predict_attend <- predict(train.model.fit) # evaluate the fitted model on the test set dodgers.test$predict_attend <- predict(train.model.fit, newdata = dodgers.test) # compute the proportion of response variance # accounted for when predicting out-of-sample cat("\n","Proportion of Test Set Variance Accounted for: ", round((with(dodgers.test,cor(attend,predict_attend)^2)), digits=3),"\n",sep="") # merge the training and test sets for plotting dodgers.plotting.frame <- rbind(dodgers.train,dodgers.test) # generate predictive modeling visual for management group.labels <- c("No Bobbleheads","Bobbleheads") group.symbols <- c(21,24) group.colors <- c("black","black") group.fill <- c("black","red") xyplot(predict_attend/1000 ~ attend/1000 | training_test, data = dodgers.plotting.frame, groups = bobblehead, cex = 2, pch = group.symbols, col = group.colors, fill = group.fill, layout = c(2, 1), xlim = c(20,65), ylim = c(20,65), aspect=1, type = c("p","g"), panel=function(x,y, ...) {panel.xyplot(x,y,...) panel.segments(25,25,60,60,col="black",cex=2) }, strip=function(...) strip.default(..., style=1), xlab = "Actual Attendance (thousands)", ylab = "Predicted Attendance (thousands)", key = list(space = "top", text = list(rev(group.labels),col = rev(group.colors)), points = list(pch = rev(group.symbols), col = rev(group.colors), fill = rev(group.fill)))) # use the full data set to obtain an estimate of the increase in # attendance due to bobbleheads, controlling for other factors my.model.fit <- lm(my.model, data = dodgers) # use all available data print(summary(my.model.fit)) # tests statistical significance of the bobblehead promotion # type I anova computes sums of squares for sequential tests print(anova(my.model.fit)) cat("\n","Estimated Effect of Bobblehead Promotion on Attendance: ", round(my.model.fit$coefficients[length(my.model.fit$coefficients)], digits = 0),"\n",sep="") # standard graphics provide diagnostic plots plot(my.model.fit) # additional model diagnostics drawn from the car package library(car) residualPlots(my.model.fit) marginalModelPlots(my.model.fit) print(outlierTest(my.model.fit)) [[alternative HTML version deleted]]
Bert Gunter
2013-Dec-02 15:21 UTC
[R] why change days of the week from a factor to an ordered factor?
"BIll" : (Sorry -- Doubt that this will be helpful, but I couln't resist) "I don't understand why the author of the code decided to make the factor days_of_week into an ordered factor. Anyone know why this should be done?" A definitive answer would require either psychic abilities or asking the author/maintainer of the code. I suggest you try the latter. However, insight might be gained by **you** answering the following question: What is the difference between ordered and unordered factors? Note that one might expect some results that change with day of week to do so in an "orderly" way. For example, I would imagine that grocery purchases are at more or less one level on M-TH and at a higher level on Fri-Sun in the U.S . Ordered factors would be better at capturing this sort of thing I would think (with fewer df). Cheers, Bert On Mon, Dec 2, 2013 at 3:24 AM, Bill <william108 at gmail.com> wrote:> I am reading the code below. It acts on a csv file called dodgers.csv with > the following variables. > > >> print(str(dodgers)) # check the structure of the data frame > 'data.frame': 81 obs. of 12 variables: > $ month : Factor w/ 7 levels "APR","AUG","JUL",..: 1 1 1 1 1 1 1 1 1 > 1 ... > $ day : int 10 11 12 13 14 15 23 24 25 27 ... > $ attend : int 56000 29729 28328 31601 46549 38359 26376 44014 26345 > 44807 ... > $ day_of_week: Factor w/ 7 levels "Friday","Monday",..: 6 7 5 1 3 4 2 6 7 > 1 ... > $ opponent : Factor w/ 17 levels "Angels","Astros",..: 13 13 13 11 11 11 > 3 3 3 10 ... > $ temp : int 67 58 57 54 57 65 60 63 64 66 ... > $ skies : Factor w/ 2 levels "Clear ","Cloudy": 1 2 2 2 2 1 2 2 2 1 > ... > $ day_night : Factor w/ 2 levels "Day","Night": 1 2 2 2 2 1 2 2 2 2 ... > $ cap : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... > $ shirt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... > $ fireworks : Factor w/ 2 levels "NO","YES": 1 1 1 2 1 1 1 1 1 2 ... > $ bobblehead : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... > NULL >> > > I don't understand why the author of the code decided to make the factor > days_of_week into an ordered factor. Anyone know why this should be done? > Thank you. > > Here is the code: > > # Predictive Model for Los Angeles Dodgers Promotion and Attendance > > library(car) # special functions for linear regression > library(lattice) # graphics package > > # read in data and create a data frame called dodgers > dodgers <- read.csv("dodgers.csv") > print(str(dodgers)) # check the structure of the data frame > > # define an ordered day-of-week variable > # for plots and data summaries > dodgers$ordered_day_of_week <- with(data=dodgers, > ifelse ((day_of_week == "Monday"),1, > ifelse ((day_of_week == "Tuesday"),2, > ifelse ((day_of_week == "Wednesday"),3, > ifelse ((day_of_week == "Thursday"),4, > ifelse ((day_of_week == "Friday"),5, > ifelse ((day_of_week == "Saturday"),6,7))))))) > dodgers$ordered_day_of_week <- factor(dodgers$ordered_day_of_week, > levels=1:7, > labels=c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun")) > > # exploratory data analysis with standard graphics: attendance by day of > week > with(data=dodgers,plot(ordered_day_of_week, attend/1000, > xlab = "Day of Week", ylab = "Attendance (thousands)", > col = "violet", las = 1)) > > # when do the Dodgers use bobblehead promotions > with(dodgers, table(bobblehead,ordered_day_of_week)) # bobbleheads on > Tuesday > > # define an ordered month variable > # for plots and data summaries > dodgers$ordered_month <- with(data=dodgers, > ifelse ((month == "APR"),4, > ifelse ((month == "MAY"),5, > ifelse ((month == "JUN"),6, > ifelse ((month == "JUL"),7, > ifelse ((month == "AUG"),8, > ifelse ((month == "SEP"),9,10))))))) > dodgers$ordered_month <- factor(dodgers$ordered_month, levels=4:10, > labels = c("April", "May", "June", "July", "Aug", "Sept", "Oct")) > > # exploratory data analysis with standard R graphics: attendance by month > with(data=dodgers,plot(ordered_month,attend/1000, xlab = "Month", > ylab = "Attendance (thousands)", col = "light blue", las = 1)) > > # exploratory data analysis displaying many variables > # looking at attendance and conditioning on day/night > # the skies and whether or not fireworks are displayed > library(lattice) # used for plotting > # let us prepare a graphical summary of the dodgers data > group.labels <- c("No Fireworks","Fireworks") > group.symbols <- c(21,24) > group.colors <- c("black","black") > group.fill <- c("black","red") > xyplot(attend/1000 ~ temp | skies + day_night, > data = dodgers, groups = fireworks, pch = group.symbols, > aspect = 1, cex = 1.5, col = group.colors, fill = group.fill, > layout = c(2, 2), type = c("p","g"), > strip=strip.custom(strip.levels=TRUE,strip.names=FALSE, style=1), > xlab = "Temperature (Degrees Fahrenheit)", > ylab = "Attendance (thousands)", > key = list(space = "top", > text = list(rev(group.labels),col = rev(group.colors)), > points = list(pch = rev(group.symbols), col = rev(group.colors), > fill = rev(group.fill)))) > > # attendance by opponent and day/night game > group.labels <- c("Day","Night") > group.symbols <- c(1,20) > group.symbols.size <- c(2,2.75) > bwplot(opponent ~ attend/1000, data = dodgers, groups = day_night, > xlab = "Attendance (thousands)", > panel = function(x, y, groups, subscripts, ...) > {panel.grid(h = (length(levels(dodgers$opponent)) - 1), v = -1) > panel.stripplot(x, y, groups = groups, subscripts = subscripts, > cex = group.symbols.size, pch = group.symbols, col = "darkblue") > }, > key = list(space = "top", > text = list(group.labels,col = "black"), > points = list(pch = group.symbols, cex = group.symbols.size, > col = "darkblue"))) > > # specify a simple model with bobblehead entered last > my.model <- {attend ~ ordered_month + ordered_day_of_week + bobblehead} > > # employ a training-and-test regimen > set.seed(1234) # set seed for repeatability of training-and-test split > training_test <- c(rep(1,length=trunc((2/3)*nrow(dodgers))), > rep(2,length=(nrow(dodgers) - trunc((2/3)*nrow(dodgers))))) > dodgers$training_test <- sample(training_test) # random permutation > dodgers$training_test <- factor(dodgers$training_test, > levels=c(1,2), labels=c("TRAIN","TEST")) > dodgers.train <- subset(dodgers, training_test == "TRAIN") > print(str(dodgers.train)) # check training data frame > dodgers.test <- subset(dodgers, training_test == "TEST") > print(str(dodgers.test)) # check test data frame > > # fit the model to the training set > train.model.fit <- lm(my.model, data = dodgers.train) > # obtain predictions from the training set > dodgers.train$predict_attend <- predict(train.model.fit) > > # evaluate the fitted model on the test set > dodgers.test$predict_attend <- predict(train.model.fit, > newdata = dodgers.test) > > # compute the proportion of response variance > # accounted for when predicting out-of-sample > cat("\n","Proportion of Test Set Variance Accounted for: ", > round((with(dodgers.test,cor(attend,predict_attend)^2)), > digits=3),"\n",sep="") > > # merge the training and test sets for plotting > dodgers.plotting.frame <- rbind(dodgers.train,dodgers.test) > > # generate predictive modeling visual for management > group.labels <- c("No Bobbleheads","Bobbleheads") > group.symbols <- c(21,24) > group.colors <- c("black","black") > group.fill <- c("black","red") > xyplot(predict_attend/1000 ~ attend/1000 | training_test, > data = dodgers.plotting.frame, groups = bobblehead, cex = 2, > pch = group.symbols, col = group.colors, fill = group.fill, > layout = c(2, 1), xlim = c(20,65), ylim = c(20,65), > aspect=1, type = c("p","g"), > panel=function(x,y, ...) > {panel.xyplot(x,y,...) > panel.segments(25,25,60,60,col="black",cex=2) > }, > strip=function(...) strip.default(..., style=1), > xlab = "Actual Attendance (thousands)", > ylab = "Predicted Attendance (thousands)", > key = list(space = "top", > text = list(rev(group.labels),col = rev(group.colors)), > points = list(pch = rev(group.symbols), > col = rev(group.colors), > fill = rev(group.fill)))) > > # use the full data set to obtain an estimate of the increase in > # attendance due to bobbleheads, controlling for other factors > my.model.fit <- lm(my.model, data = dodgers) # use all available data > print(summary(my.model.fit)) > # tests statistical significance of the bobblehead promotion > # type I anova computes sums of squares for sequential tests > print(anova(my.model.fit)) > > cat("\n","Estimated Effect of Bobblehead Promotion on Attendance: ", > round(my.model.fit$coefficients[length(my.model.fit$coefficients)], > digits = 0),"\n",sep="") > > # standard graphics provide diagnostic plots > plot(my.model.fit) > > # additional model diagnostics drawn from the car package > library(car) > residualPlots(my.model.fit) > marginalModelPlots(my.model.fit) > print(outlierTest(my.model.fit)) > > [[alternative HTML version deleted]] > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.-- Bert Gunter Genentech Nonclinical Biostatistics (650) 467-7374
Richard M. Heiberger
2013-Dec-02 15:24 UTC
[R] why change days of the week from a factor to an ordered factor?
If days of the week is not an Ordered Factor, then it will be sorted alphabetically. Fr Mo Sa Su Th Tu We Rich On Mon, Dec 2, 2013 at 6:24 AM, Bill <william108 at gmail.com> wrote:> I am reading the code below. It acts on a csv file called dodgers.csv with > the following variables. > > >> print(str(dodgers)) # check the structure of the data frame > 'data.frame': 81 obs. of 12 variables: > $ month : Factor w/ 7 levels "APR","AUG","JUL",..: 1 1 1 1 1 1 1 1 1 > 1 ... > $ day : int 10 11 12 13 14 15 23 24 25 27 ... > $ attend : int 56000 29729 28328 31601 46549 38359 26376 44014 26345 > 44807 ... > $ day_of_week: Factor w/ 7 levels "Friday","Monday",..: 6 7 5 1 3 4 2 6 7 > 1 ... > $ opponent : Factor w/ 17 levels "Angels","Astros",..: 13 13 13 11 11 11 > 3 3 3 10 ... > $ temp : int 67 58 57 54 57 65 60 63 64 66 ... > $ skies : Factor w/ 2 levels "Clear ","Cloudy": 1 2 2 2 2 1 2 2 2 1 > ... > $ day_night : Factor w/ 2 levels "Day","Night": 1 2 2 2 2 1 2 2 2 2 ... > $ cap : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... > $ shirt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... > $ fireworks : Factor w/ 2 levels "NO","YES": 1 1 1 2 1 1 1 1 1 2 ... > $ bobblehead : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... > NULL >> > > I don't understand why the author of the code decided to make the factor > days_of_week into an ordered factor. Anyone know why this should be done? > Thank you. > > Here is the code: > > # Predictive Model for Los Angeles Dodgers Promotion and Attendance > > library(car) # special functions for linear regression > library(lattice) # graphics package > > # read in data and create a data frame called dodgers > dodgers <- read.csv("dodgers.csv") > print(str(dodgers)) # check the structure of the data frame > > # define an ordered day-of-week variable > # for plots and data summaries > dodgers$ordered_day_of_week <- with(data=dodgers, > ifelse ((day_of_week == "Monday"),1, > ifelse ((day_of_week == "Tuesday"),2, > ifelse ((day_of_week == "Wednesday"),3, > ifelse ((day_of_week == "Thursday"),4, > ifelse ((day_of_week == "Friday"),5, > ifelse ((day_of_week == "Saturday"),6,7))))))) > dodgers$ordered_day_of_week <- factor(dodgers$ordered_day_of_week, > levels=1:7, > labels=c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun")) > > # exploratory data analysis with standard graphics: attendance by day of > week > with(data=dodgers,plot(ordered_day_of_week, attend/1000, > xlab = "Day of Week", ylab = "Attendance (thousands)", > col = "violet", las = 1)) > > # when do the Dodgers use bobblehead promotions > with(dodgers, table(bobblehead,ordered_day_of_week)) # bobbleheads on > Tuesday > > # define an ordered month variable > # for plots and data summaries > dodgers$ordered_month <- with(data=dodgers, > ifelse ((month == "APR"),4, > ifelse ((month == "MAY"),5, > ifelse ((month == "JUN"),6, > ifelse ((month == "JUL"),7, > ifelse ((month == "AUG"),8, > ifelse ((month == "SEP"),9,10))))))) > dodgers$ordered_month <- factor(dodgers$ordered_month, levels=4:10, > labels = c("April", "May", "June", "July", "Aug", "Sept", "Oct")) > > # exploratory data analysis with standard R graphics: attendance by month > with(data=dodgers,plot(ordered_month,attend/1000, xlab = "Month", > ylab = "Attendance (thousands)", col = "light blue", las = 1)) > > # exploratory data analysis displaying many variables > # looking at attendance and conditioning on day/night > # the skies and whether or not fireworks are displayed > library(lattice) # used for plotting > # let us prepare a graphical summary of the dodgers data > group.labels <- c("No Fireworks","Fireworks") > group.symbols <- c(21,24) > group.colors <- c("black","black") > group.fill <- c("black","red") > xyplot(attend/1000 ~ temp | skies + day_night, > data = dodgers, groups = fireworks, pch = group.symbols, > aspect = 1, cex = 1.5, col = group.colors, fill = group.fill, > layout = c(2, 2), type = c("p","g"), > strip=strip.custom(strip.levels=TRUE,strip.names=FALSE, style=1), > xlab = "Temperature (Degrees Fahrenheit)", > ylab = "Attendance (thousands)", > key = list(space = "top", > text = list(rev(group.labels),col = rev(group.colors)), > points = list(pch = rev(group.symbols), col = rev(group.colors), > fill = rev(group.fill)))) > > # attendance by opponent and day/night game > group.labels <- c("Day","Night") > group.symbols <- c(1,20) > group.symbols.size <- c(2,2.75) > bwplot(opponent ~ attend/1000, data = dodgers, groups = day_night, > xlab = "Attendance (thousands)", > panel = function(x, y, groups, subscripts, ...) > {panel.grid(h = (length(levels(dodgers$opponent)) - 1), v = -1) > panel.stripplot(x, y, groups = groups, subscripts = subscripts, > cex = group.symbols.size, pch = group.symbols, col = "darkblue") > }, > key = list(space = "top", > text = list(group.labels,col = "black"), > points = list(pch = group.symbols, cex = group.symbols.size, > col = "darkblue"))) > > # specify a simple model with bobblehead entered last > my.model <- {attend ~ ordered_month + ordered_day_of_week + bobblehead} > > # employ a training-and-test regimen > set.seed(1234) # set seed for repeatability of training-and-test split > training_test <- c(rep(1,length=trunc((2/3)*nrow(dodgers))), > rep(2,length=(nrow(dodgers) - trunc((2/3)*nrow(dodgers))))) > dodgers$training_test <- sample(training_test) # random permutation > dodgers$training_test <- factor(dodgers$training_test, > levels=c(1,2), labels=c("TRAIN","TEST")) > dodgers.train <- subset(dodgers, training_test == "TRAIN") > print(str(dodgers.train)) # check training data frame > dodgers.test <- subset(dodgers, training_test == "TEST") > print(str(dodgers.test)) # check test data frame > > # fit the model to the training set > train.model.fit <- lm(my.model, data = dodgers.train) > # obtain predictions from the training set > dodgers.train$predict_attend <- predict(train.model.fit) > > # evaluate the fitted model on the test set > dodgers.test$predict_attend <- predict(train.model.fit, > newdata = dodgers.test) > > # compute the proportion of response variance > # accounted for when predicting out-of-sample > cat("\n","Proportion of Test Set Variance Accounted for: ", > round((with(dodgers.test,cor(attend,predict_attend)^2)), > digits=3),"\n",sep="") > > # merge the training and test sets for plotting > dodgers.plotting.frame <- rbind(dodgers.train,dodgers.test) > > # generate predictive modeling visual for management > group.labels <- c("No Bobbleheads","Bobbleheads") > group.symbols <- c(21,24) > group.colors <- c("black","black") > group.fill <- c("black","red") > xyplot(predict_attend/1000 ~ attend/1000 | training_test, > data = dodgers.plotting.frame, groups = bobblehead, cex = 2, > pch = group.symbols, col = group.colors, fill = group.fill, > layout = c(2, 1), xlim = c(20,65), ylim = c(20,65), > aspect=1, type = c("p","g"), > panel=function(x,y, ...) > {panel.xyplot(x,y,...) > panel.segments(25,25,60,60,col="black",cex=2) > }, > strip=function(...) strip.default(..., style=1), > xlab = "Actual Attendance (thousands)", > ylab = "Predicted Attendance (thousands)", > key = list(space = "top", > text = list(rev(group.labels),col = rev(group.colors)), > points = list(pch = rev(group.symbols), > col = rev(group.colors), > fill = rev(group.fill)))) > > # use the full data set to obtain an estimate of the increase in > # attendance due to bobbleheads, controlling for other factors > my.model.fit <- lm(my.model, data = dodgers) # use all available data > print(summary(my.model.fit)) > # tests statistical significance of the bobblehead promotion > # type I anova computes sums of squares for sequential tests > print(anova(my.model.fit)) > > cat("\n","Estimated Effect of Bobblehead Promotion on Attendance: ", > round(my.model.fit$coefficients[length(my.model.fit$coefficients)], > digits = 0),"\n",sep="") > > # standard graphics provide diagnostic plots > plot(my.model.fit) > > # additional model diagnostics drawn from the car package > library(car) > residualPlots(my.model.fit) > marginalModelPlots(my.model.fit) > print(outlierTest(my.model.fit)) > > [[alternative HTML version deleted]] > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.
Duncan Mackay
2013-Dec-03 02:29 UTC
[R] why change days of the week from a factor to an ordered factor?
Hi Bill eg> colours = 1:8 > coloursf = factor(1:8) > colourso = ordered(1:8) > str(coloursf)Factor w/ 8 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8> str(colourso)Ord.factor w/ 8 levels "1"<"2"<"3"<"4"<..: 1 2 3 4 5 6 7 8 coloursf2 <- factor(1:8, levels = 8:1) str(coloursf2) Duncan Duncan Duncan Mackay Department of Agronomy and Soil Science University of New England Armidale NSW 2351 Email: home: mackay at northnet.com.au ordered used in used in MASS::polr and GEE for polytomous logistic regression -----Original Message----- From: r-help-bounces at r-project.org [mailto:r-help-bounces at r-project.org] On Behalf Of Bill Sent: Monday, 2 December 2013 21:24 To: r-help at r-project.org Subject: [R] why change days of the week from a factor to an ordered factor? I am reading the code below. It acts on a csv file called dodgers.csv with the following variables.> print(str(dodgers)) # check the structure of the data frame'data.frame': 81 obs. of 12 variables: $ month : Factor w/ 7 levels "APR","AUG","JUL",..: 1 1 1 1 1 1 1 1 1 1 ... $ day : int 10 11 12 13 14 15 23 24 25 27 ... $ attend : int 56000 29729 28328 31601 46549 38359 26376 44014 26345 44807 ... $ day_of_week: Factor w/ 7 levels "Friday","Monday",..: 6 7 5 1 3 4 2 6 7 1 ... $ opponent : Factor w/ 17 levels "Angels","Astros",..: 13 13 13 11 11 11 3 3 3 10 ... $ temp : int 67 58 57 54 57 65 60 63 64 66 ... $ skies : Factor w/ 2 levels "Clear ","Cloudy": 1 2 2 2 2 1 2 2 2 1 ... $ day_night : Factor w/ 2 levels "Day","Night": 1 2 2 2 2 1 2 2 2 2 ... $ cap : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... $ shirt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... $ fireworks : Factor w/ 2 levels "NO","YES": 1 1 1 2 1 1 1 1 1 2 ... $ bobblehead : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ... NULL>I don't understand why the author of the code decided to make the factor days_of_week into an ordered factor. Anyone know why this should be done? Thank you. Here is the code: # Predictive Model for Los Angeles Dodgers Promotion and Attendance library(car) # special functions for linear regression library(lattice) # graphics package # read in data and create a data frame called dodgers dodgers <- read.csv("dodgers.csv") print(str(dodgers)) # check the structure of the data frame # define an ordered day-of-week variable # for plots and data summaries dodgers$ordered_day_of_week <- with(data=dodgers, ifelse ((day_of_week == "Monday"),1, ifelse ((day_of_week == "Tuesday"),2, ifelse ((day_of_week == "Wednesday"),3, ifelse ((day_of_week == "Thursday"),4, ifelse ((day_of_week == "Friday"),5, ifelse ((day_of_week == "Saturday"),6,7))))))) dodgers$ordered_day_of_week <- factor(dodgers$ordered_day_of_week, levels=1:7, labels=c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun")) # exploratory data analysis with standard graphics: attendance by day of week with(data=dodgers,plot(ordered_day_of_week, attend/1000, xlab = "Day of Week", ylab = "Attendance (thousands)", col = "violet", las = 1)) # when do the Dodgers use bobblehead promotions with(dodgers, table(bobblehead,ordered_day_of_week)) # bobbleheads on Tuesday # define an ordered month variable # for plots and data summaries dodgers$ordered_month <- with(data=dodgers, ifelse ((month == "APR"),4, ifelse ((month == "MAY"),5, ifelse ((month == "JUN"),6, ifelse ((month == "JUL"),7, ifelse ((month == "AUG"),8, ifelse ((month == "SEP"),9,10))))))) dodgers$ordered_month <- factor(dodgers$ordered_month, levels=4:10, labels c("April", "May", "June", "July", "Aug", "Sept", "Oct")) # exploratory data analysis with standard R graphics: attendance by month with(data=dodgers,plot(ordered_month,attend/1000, xlab = "Month", ylab "Attendance (thousands)", col = "light blue", las = 1)) # exploratory data analysis displaying many variables # looking at attendance and conditioning on day/night # the skies and whether or not fireworks are displayed library(lattice) # used for plotting # let us prepare a graphical summary of the dodgers data group.labels <- c("No Fireworks","Fireworks") group.symbols <- c(21,24) group.colors <- c("black","black") group.fill <- c("black","red") xyplot(attend/1000 ~ temp | skies + day_night, data = dodgers, groups = fireworks, pch = group.symbols, aspect = 1, cex = 1.5, col = group.colors, fill = group.fill, layout = c(2, 2), type = c("p","g"), strip=strip.custom(strip.levels=TRUE,strip.names=FALSE, style=1), xlab = "Temperature (Degrees Fahrenheit)", ylab = "Attendance (thousands)", key = list(space = "top", text = list(rev(group.labels),col = rev(group.colors)), points = list(pch = rev(group.symbols), col = rev(group.colors), fill = rev(group.fill)))) # attendance by opponent and day/night game group.labels <- c("Day","Night") group.symbols <- c(1,20) group.symbols.size <- c(2,2.75) bwplot(opponent ~ attend/1000, data = dodgers, groups = day_night, xlab = "Attendance (thousands)", panel = function(x, y, groups, subscripts, ...) {panel.grid(h = (length(levels(dodgers$opponent)) - 1), v = -1) panel.stripplot(x, y, groups = groups, subscripts = subscripts, cex = group.symbols.size, pch = group.symbols, col = "darkblue") }, key = list(space = "top", text = list(group.labels,col = "black"), points = list(pch = group.symbols, cex = group.symbols.size, col = "darkblue"))) # specify a simple model with bobblehead entered last my.model <- {attend ~ ordered_month + ordered_day_of_week + bobblehead} # employ a training-and-test regimen set.seed(1234) # set seed for repeatability of training-and-test split training_test <- c(rep(1,length=trunc((2/3)*nrow(dodgers))), rep(2,length=(nrow(dodgers) - trunc((2/3)*nrow(dodgers))))) dodgers$training_test <- sample(training_test) # random permutation dodgers$training_test <- factor(dodgers$training_test, levels=c(1,2), labels=c("TRAIN","TEST")) dodgers.train <- subset(dodgers, training_test == "TRAIN") print(str(dodgers.train)) # check training data frame dodgers.test <- subset(dodgers, training_test == "TEST") print(str(dodgers.test)) # check test data frame # fit the model to the training set train.model.fit <- lm(my.model, data = dodgers.train) # obtain predictions from the training set dodgers.train$predict_attend <- predict(train.model.fit) # evaluate the fitted model on the test set dodgers.test$predict_attend <- predict(train.model.fit, newdata = dodgers.test) # compute the proportion of response variance # accounted for when predicting out-of-sample cat("\n","Proportion of Test Set Variance Accounted for: ", round((with(dodgers.test,cor(attend,predict_attend)^2)), digits=3),"\n",sep="") # merge the training and test sets for plotting dodgers.plotting.frame <- rbind(dodgers.train,dodgers.test) # generate predictive modeling visual for management group.labels <- c("No Bobbleheads","Bobbleheads") group.symbols <- c(21,24) group.colors <- c("black","black") group.fill <- c("black","red") xyplot(predict_attend/1000 ~ attend/1000 | training_test, data = dodgers.plotting.frame, groups = bobblehead, cex = 2, pch = group.symbols, col = group.colors, fill = group.fill, layout = c(2, 1), xlim = c(20,65), ylim = c(20,65), aspect=1, type = c("p","g"), panel=function(x,y, ...) {panel.xyplot(x,y,...) panel.segments(25,25,60,60,col="black",cex=2) }, strip=function(...) strip.default(..., style=1), xlab = "Actual Attendance (thousands)", ylab = "Predicted Attendance (thousands)", key = list(space = "top", text = list(rev(group.labels),col = rev(group.colors)), points = list(pch = rev(group.symbols), col = rev(group.colors), fill = rev(group.fill)))) # use the full data set to obtain an estimate of the increase in # attendance due to bobbleheads, controlling for other factors my.model.fit <- lm(my.model, data = dodgers) # use all available data print(summary(my.model.fit)) # tests statistical significance of the bobblehead promotion # type I anova computes sums of squares for sequential tests print(anova(my.model.fit)) cat("\n","Estimated Effect of Bobblehead Promotion on Attendance: ", round(my.model.fit$coefficients[length(my.model.fit$coefficients)], digits = 0),"\n",sep="") # standard graphics provide diagnostic plots plot(my.model.fit) # additional model diagnostics drawn from the car package library(car) residualPlots(my.model.fit) marginalModelPlots(my.model.fit) print(outlierTest(my.model.fit)) [[alternative HTML version deleted]] ______________________________________________ R-help at r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.
Alok Shende
2013-Dec-03 14:00 UTC
[R] why change days of the week from a factor to an ordered factor?
Hi, There are two reasons. First is that in the "day_of_week", the starting day is Friday so if you plot a graph, the left most column will start with Friday. You may like to start the column with Monday. The second reason is that instead of having all these long factor names (Monday,...), the code writer has used shorter words (Mon, Tue...). Regards, Alok Shende [[alternative HTML version deleted]]