Paul Johnson
2012-Apr-20 16:37 UTC
[R] predictOMatic for regression. Please try and advise me
I'm pasting below a working R file featuring a function I'd like to polish up. I'm teaching regression this semester and every time I come to something that is very difficult to explain in class, I try to simplify it by writing an R function (eventually into my package "rockchalk"). Students have a difficult time with predict and newdata objects, so right now I'm concentrated on that problem. Fitting regressions is pretty easy, but getting predicted values for particular combinations of input values can be difficult when the model is complicated. If you agree, please try out the following and let me know how its use could be enhanced, or what other features you might want. I know folks are busy, so to save you the trouble of actually running the code, I also paste in a session demonstrating one run through. Here's the code: ##Paul Johnson ## 2012-04-20 ## Facilitate creation of "newdata" objects for use in predict ## statements (for interpretation of regression models) ## newdataMaker creates the newdata frame required in predict. ## It supplies a data frame with all input variables at a ## central value (mean, mode) except for some variables that ## the user wants to focus on. User should ## supply a fitted model "model" and a focus list "fl" of variable ## values. See examples below. The list must be a named list, ## using names of variables from the regression formula. ## It is not needed to call this ## directly if one is satisfied with the results from ## predictOMatic newdataMaker <- function (model = NULL, fl = NULL){ mf <- model.frame(model) mterms <- terms(model) mfnames <- colnames(mf) modelcv <- centralValues(mf) if (sum(!names(fl) %in% mfnames) > 0) stop(cat(c("Error. The focus list (fl) requests variables that are not included in the original model. The names of the variables in the focus list be drawn from this list: ", mfnames))) ## Consider "padding" range of fl for numeric variables so that we ## get newdata objects including the min and max values. mixAndMatch <- expand.grid(fl) mixAndMatch$combo <- 1:nrow(mixAndMatch) newdf <- cbind(mixAndMatch, modelcv[ ,!colnames(modelcv) %in% colnames(mixAndMatch)]) newdf } predictOMatic <- function(model = NULL, fl = NULL, ...){ nd <- newdataMaker(model, fl) fit <- predict(model, newdata=nd, ...) cbind(fit, nd) } set.seed(12345) x1 <- rnorm(100) x2 <- rnorm(100) x3 <- rnorm(100) x4 <- rnorm(100) x5 <- rpois(100, 5) x6 <- rgamma(100, 2,1) xcat1 <- gl(2,50, labels=c("M","F")) xcat2 <- cut(rnorm(100), breaks=c(-Inf, 0, 0.4, 0.9, 1, Inf), labels=c("R", "M", "D", "P", "G")) dat <- data.frame(x1, x2, x3, x4, x5, xcat1, xcat2) rm(x1, x2, x3, x4, xcat1, xcat2) dat$y <- with(dat, 0.03 + 0.1*x1 + 0.1*x2 + 0.4*x3 -0.1*x4 + 0.01*x5 + 0.1*x6 + 2*rnorm(100)) ##ordinary regression. m1 <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2, data=dat) summary(m1) ## Use the fitted model's data frame for analysis m1mf <- model.frame(m1) ## If you have rockchalk 1.5.4, run these: library(rockchalk) summarize(m1mf) ## otherwise, run summary(m1mf) ## First, overview for values of xcat1 newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat1))) ## mix and match all combinations of xcat1 and xcat2 newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat2), xcat2 levels(m1mf$xcat2))) ## Pick some particular values for focus newdataMaker(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D"))) ## Generate a newdata frame and predictions in one step predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D"))) predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")), interval="conf") newdf <- predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 c("M","D"), x1=plotSeq(dat$x1))) plot(y ~ x1, data= model.frame(m1)) by(newdf, list(newdf$x2, newdf$xcat2), function(x) {lines(x$x1, x$fit)}) newdataMaker(m1, fl = list(x2 = c(-1,0, 1), xcat2 = c("M","D"))) predictOMatic(m1, fl = list(x2 = range(dat$x2), xcat2 = c("M","D"))) newdf <- predictOMatic(m1, fl = list(x2 = quantile(dat$x2), xcat2 = c("M","D"))) plot(y ~ x2 , data=model.frame(m1)) lines(y ~ x2, newdf) predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), interval="conf") ## just gets the new data nd <- newdataMaker(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D"))) pr <- predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), interval="conf") ## m2 <- glm(xcat2 ~ x1 + x2 + x3 + xcat1, data=dat, family=binomial(logit)) summary(m2) m2mf <- model.frame(m2) summarize(m2mf) predict(m2, newdata=data.frame(xcat1=c("M","F"), x1=c(1), x2=mean(m2mf$x2), x3=mean(m2mf$x3)), se.fit=TRUE) predictOMatic(m2, fl = list(x2 = c(-1, 1), xcat1 = c("M","F")), interval="conf", type="response") ############################################ ################Now, the example session ###################> newdataMaker <- function (model = NULL, fl = NULL){+ mf <- model.frame(model) + mterms <- terms(model) + mfnames <- colnames(mf) + modelcv <- centralValues(mf) + if (sum(!names(fl) %in% mfnames) > 0) stop(cat(c("Error. The focus list (fl) requests variables that are not included in the original model. The names of the variables in the focus list be drawn from this list: ", mfnames))) + ## Consider "padding" range of fl for numeric variables so that we + ## get newdata objects including the min and max values. + mixAndMatch <- expand.grid(fl) + mixAndMatch$combo <- 1:nrow(mixAndMatch) + newdf <- cbind(mixAndMatch, modelcv[ ,!colnames(modelcv) %in% colnames(mixAndMatch)]) + newdf + }> predictOMatic <- function(model = NULL, fl = NULL, ...){+ nd <- newdataMaker(model, fl) + fit <- predict(model, newdata=nd, ...) + cbind(fit, nd) + }> set.seed(12345) > x1 <- rnorm(100) > x2 <- rnorm(100) > x3 <- rnorm(100) > x4 <- rnorm(100) > x5 <- rpois(100, 5) > x6 <- rgamma(100, 2,1) > xcat1 <- gl(2,50, labels=c("M","F")) > xcat2 <- cut(rnorm(100), breaks=c(-Inf, 0, 0.4, 0.9, 1, Inf), labels=c("R", "M", "D", "P", "G")) > dat <- data.frame(x1, x2, x3, x4, x5, xcat1, xcat2) > rm(x1, x2, x3, x4, xcat1, xcat2) > dat$y <- with(dat, 0.03 + 0.1*x1 + 0.1*x2 + 0.4*x3 -0.1*x4 + 0.01*x5 + 0.1*x6 + 2*rnorm(100)) > m1 <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2, data=dat) > summary(m1)Call: lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2, data = dat) Residuals: Min 1Q Median 3Q Max -3.7624 -1.1740 -0.0753 0.9174 4.9910 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -0.24744 0.64684 -0.383 0.702989 x1 -0.44055 0.16902 -2.607 0.010741 * x2 -0.21342 0.18811 -1.135 0.259644 x3 0.74456 0.21434 3.474 0.000798 *** x4 0.20394 0.20454 0.997 0.321453 x5 0.01176 0.09374 0.125 0.900426 x6 0.16072 0.14412 1.115 0.267809 xcat1F 0.81233 0.38271 2.123 0.036599 * xcat2M 1.03244 0.51701 1.997 0.048922 * xcat2D 0.13699 0.64662 0.212 0.832712 xcat2P -0.43391 0.83018 -0.523 0.602517 xcat2G 0.16521 0.49741 0.332 0.740567 --- Signif. codes: 0 ?***? 0.001 ?**? 0.01 ?*? 0.05 ?.? 0.1 ? ? 1 Residual standard error: 1.805 on 88 degrees of freedom Multiple R-squared: 0.23, Adjusted R-squared: 0.1338 F-statistic: 2.39 on 11 and 88 DF, p-value: 0.01216> m1mf <- model.frame(m1) > library(rockchalk)Loading required package: MASS Loading required package: car Loading required package: nnet> summarize(m1mf)$numerics x1 x2 x3 x4 x5 x6 y 0% -2.3800 -2.12400 -2.29000 -2.5820 1.000 0.1373 -4.8140 25% -0.5901 -0.51290 -0.64720 -0.3191 3.000 0.8342 -0.7684 50% 0.4837 0.02596 0.01349 0.3466 5.000 1.5240 0.6293 75% 0.9004 0.69840 0.63120 0.6968 6.000 2.5230 1.7550 100% 2.4770 2.65600 2.74700 2.2680 11.000 5.7050 5.7990 mean 0.2452 0.04523 -0.04621 0.2153 4.830 1.8200 0.6151 sd 1.1150 1.01100 0.93230 0.9707 2.050 1.3130 1.9390 var 1.2430 1.02300 0.86930 0.9422 4.203 1.7230 3.7620 NA's 0.0000 0.00000 0.00000 0.0000 0.000 0.0000 0.0000 $factors xcat1 xcat2 M :50 R :46.000 F :50 M :19.000 NA's : 0 G :19.000 entropy : 1 D :10.000 normedEntropy: 1 P : 6.000 NA's : 0.000 entropy : 2.002 normedEntropy: 0.862> ## First, overview for values of xcat1 > newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat1)))xcat1 combo y x1 x2 x3 x4 x5 1 M 1 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 2 F 2 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 x6 xcat2 1 1.819744 R 2 1.819744 R> newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat2), xcat2 = levels(m1mf$xcat2)))xcat1 xcat2 combo y x1 x2 x3 x4 x5 1 R R 1 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 2 M R 2 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 3 D R 3 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 4 P R 4 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 5 G R 5 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 6 R M 6 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 7 M M 7 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 8 D M 8 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 9 P M 9 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 10 G M 10 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 11 R D 11 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 12 M D 12 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 13 D D 13 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 14 P D 14 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 15 G D 15 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 16 R P 16 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 17 M P 17 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 18 D P 18 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 19 P P 19 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 20 G P 20 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 21 R G 21 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 22 M G 22 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 23 D G 23 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 24 P G 24 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 25 G G 25 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 x6 1 1.819744 2 1.819744 3 1.819744 4 1.819744 5 1.819744 6 1.819744 7 1.819744 8 1.819744 9 1.819744 10 1.819744 11 1.819744 12 1.819744 13 1.819744 14 1.819744 15 1.819744 16 1.819744 17 1.819744 18 1.819744 19 1.819744 20 1.819744 21 1.819744 22 1.819744 23 1.819744 24 1.819744 25 1.819744> newdataMaker(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")))x2 xcat2 combo y x1 x3 x4 x5 x6 1 0.25 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 2 1.00 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 3 0.25 D 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 4 1.00 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 xcat1 1 M 2 M 3 M 4 M> predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")))fit x2 xcat2 combo y x1 x3 x4 x5 1 0.98241259 0.25 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 2 0.82235069 1.00 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 3 0.08695955 0.25 D 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 4 -0.07310236 1.00 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 x6 xcat1 1 1.819744 M 2 1.819744 M 3 1.819744 M 4 1.819744 M> predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")), interval="conf")fit lwr upr x2 xcat2 combo y x1 1 0.98241259 0.1140244 1.850801 0.25 M 1 0.6150553 0.2451972 2 0.82235069 -0.1039242 1.748626 1.00 M 2 0.6150553 0.2451972 3 0.08695955 -1.1263000 1.300219 0.25 D 3 0.6150553 0.2451972 4 -0.07310236 -1.3139912 1.167787 1.00 D 4 0.6150553 0.2451972 x3 x4 x5 x6 xcat1 1 -0.04621158 0.2152759 4.83 1.819744 M 2 -0.04621158 0.2152759 4.83 1.819744 M 3 -0.04621158 0.2152759 4.83 1.819744 M 4 -0.04621158 0.2152759 4.83 1.819744 M> newdf <- predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D"), x1=plotSeq(dat$x1)))plot(y ~ x1, data= model.frame(m1))> by(newdf, list(newdf$x2, newdf$xcat2), function(x) {lines(x$x1, x$fit)}) > : 0.25: M NULL ------------------------------------------------------------ : 1 : M NULL ------------------------------------------------------------ : 0.25 : D NULL ------------------------------------------------------------ : 1 : D NULL> newdataMaker(m1, fl = list(x2 = c(-1,0, 1), xcat2 = c("M","D")))x2 xcat2 combo y x1 x3 x4 x5 x6 xcat1 1 -1 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M 2 0 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M 3 1 M 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M 4 -1 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M 5 0 D 5 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M 6 1 D 6 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M> predictOMatic(m1, fl = list(x2 = range(dat$x2), xcat2 = c("M","D")))fit x2 xcat2 combo y x1 x3 x4 1 1.4889658 -2.123550 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 2 0.4689792 2.655788 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 3 0.5935128 -2.123550 D 3 0.6150553 0.2451972 -0.04621158 0.2152759 4 -0.4264739 2.655788 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 x5 x6 xcat1 1 4.83 1.819744 M 2 4.83 1.819744 M 3 4.83 1.819744 M 4 4.83 1.819744 M> newdf <- predictOMatic(m1, fl = list(x2 = quantile(dat$x2), xcat2 = c("M","D"))) > plot(y ~ x2 , data=model.frame(m1)) > lines(y ~ x2, newdf) > predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), interval="conf")fit lwr upr x2 xcat2 combo y x1 x3 1 -9.635027 -28.29784 9.027789 50 M 1 0.6150553 0.2451972 -0.04621158 2 -11.769186 -34.16684 10.628471 60 M 2 0.6150553 0.2451972 -0.04621158 3 -10.530480 -29.14836 8.087395 50 D 3 0.6150553 0.2451972 -0.04621158 4 -12.664639 -35.01410 9.684824 60 D 4 0.6150553 0.2451972 -0.04621158 x4 x5 x6 xcat1 1 0.2152759 4.83 1.819744 M 2 0.2152759 4.83 1.819744 M 3 0.2152759 4.83 1.819744 M 4 0.2152759 4.83 1.819744 M> nd <- newdataMaker(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D"))) > pr <- predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), interval="conf") > m2 <- glm(xcat2 ~ x1 + x2 + x3 + xcat1, data=dat, family=binomial(logit)) > summary(m2)Call: glm(formula = xcat2 ~ x1 + x2 + x3 + xcat1, family = binomial(logit), data = dat) Deviance Residuals: Min 1Q Median 3Q Max -1.743 -1.181 0.912 1.112 1.353 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) 0.41371 0.29465 1.404 0.160 x1 0.09728 0.18532 0.525 0.600 x2 0.09622 0.20650 0.466 0.641 x3 -0.25314 0.22816 -1.109 0.267 xcat1F -0.57235 0.41433 -1.381 0.167 (Dispersion parameter for binomial family taken to be 1) Null deviance: 137.99 on 99 degrees of freedom Residual deviance: 134.56 on 95 degrees of freedom AIC: 144.56 Number of Fisher Scoring iterations: 4> m2mf <- model.frame(m2) > summarize(m2mf)$numerics x1 x2 x3 0% -2.3800 -2.12400 -2.29000 25% -0.5901 -0.51290 -0.64720 50% 0.4837 0.02596 0.01349 75% 0.9004 0.69840 0.63120 100% 2.4770 2.65600 2.74700 mean 0.2452 0.04523 -0.04621 sd 1.1150 1.01100 0.93230 var 1.2430 1.02300 0.86930 NA's 0.0000 0.00000 0.00000 $factors xcat1 xcat2 M :50 R :46.000 F :50 M :19.000 NA's : 0 G :19.000 entropy : 1 D :10.000 normedEntropy: 1 P : 6.000 NA's : 0.000 entropy : 2.002 normedEntropy: 0.862> predict(m2, newdata=data.frame(xcat1=c("M","F"), x1=c(1), x2=mean(m2mf$x2), x3=mean(m2mf$x3)), se.fit=TRUE)$fit 1 2 0.52704389 -0.04530271 $se.fit 1 2 0.3336836 0.3139795 $residual.scale [1] 1> predictOMatic(m2, fl = list(x2 = c(-1, 1), xcat1 = c("M","F")), interval="conf", type="response")fit x2 xcat1 combo xcat2 x1 x3 1 0.5873563 -1 M 1 R 0.2451972 -0.04621158 2 0.6330860 1 M 2 R 0.2451972 -0.04621158 3 0.4453938 -1 F 3 R 0.2451972 -0.04621158 4 0.4932835 1 F 4 R 0.2451972 -0.04621158 -- Paul E. Johnson Professor, Political Science ? ?Assoc. Director 1541 Lilac Lane, Room 504 ? ? Center for Research Methods University of Kansas ? ? ? ? ? ? ? University of Kansas http://pj.freefaculty.org ? ? ? ? ? ?http://quant.ku.edu
William Dunlap
2012-Apr-22 04:42 UTC
[R] predictOMatic for regression. Please try and advise me
> newdataMaker <- function (model = NULL, fl = NULL){ > mf <- model.frame(model) > mterms <- terms(model) > mfnames <- colnames(mf)I have not studied this carefully, but it looks like you are taking the predictor variables to be the names of the columns of the model.frame. For a formula like y ~ poly(x1, 2) + sin(x2) + cos(x2) that would give "y", "poly(x1, 2)", "sin(x2)", and "cos(x2)" where I'd expect you would want the predictors to be "x1" and "x2". You can get the latter with by running all.vars() over the entries in the "variables" attributes of the terms. E.g., > d <- data.frame(x1=1:10,x2=1:10,y=1:10) > fit <- lm(y ~ poly(x1,2) + sin(x2) + cos(x2), data=d) > unique(unlist(lapply(attr(delete.response(terms(fit)), "variables")[-1], all.vars))) [1] "x1" "x2" There may be variables in there that are really constants, as with the "d" in poly(x,d). It can be tricky to completely automate eliminating those. Bill Dunlap Spotfire, TIBCO Software wdunlap tibco.com> -----Original Message----- > From: r-help-bounces at r-project.org [mailto:r-help-bounces at r-project.org] On Behalf > Of Paul Johnson > Sent: Friday, April 20, 2012 9:38 AM > To: R-help; r-group-ku at googlegroups.com > Subject: [R] predictOMatic for regression. Please try and advise me > > I'm pasting below a working R file featuring a function I'd like to polish up. > > I'm teaching regression this semester and every time I come to > something that is very difficult to explain in class, I try to > simplify it by writing an R function (eventually into my package > "rockchalk"). Students have a difficult time with predict and newdata > objects, so right now I'm concentrated on that problem. > > Fitting regressions is pretty easy, but getting predicted values for > particular combinations of input values can be difficult when the > model is complicated. If you agree, please try out the following and > let me know how its use could be enhanced, or what other features you > might want. > > I know folks are busy, so to save you the trouble of actually running > the code, I also paste in a session demonstrating one run through. > > Here's the code: > > ##Paul Johnson > ## 2012-04-20 > ## Facilitate creation of "newdata" objects for use in predict > ## statements (for interpretation of regression models) > > ## newdataMaker creates the newdata frame required in predict. > ## It supplies a data frame with all input variables at a > ## central value (mean, mode) except for some variables that > ## the user wants to focus on. User should > ## supply a fitted model "model" and a focus list "fl" of variable > ## values. See examples below. The list must be a named list, > ## using names of variables from the regression formula. > ## It is not needed to call this > ## directly if one is satisfied with the results from > ## predictOMatic > > newdataMaker <- function (model = NULL, fl = NULL){ > mf <- model.frame(model) > mterms <- terms(model) > mfnames <- colnames(mf) > modelcv <- centralValues(mf) > if (sum(!names(fl) %in% mfnames) > 0) stop(cat(c("Error. The focus > list (fl) requests variables that are not included in the original > model. The names of the variables in the focus list be drawn from this > list: ", mfnames))) > ## Consider "padding" range of fl for numeric variables so that we > ## get newdata objects including the min and max values. > mixAndMatch <- expand.grid(fl) > mixAndMatch$combo <- 1:nrow(mixAndMatch) > newdf <- cbind(mixAndMatch, modelcv[ ,!colnames(modelcv) %in% > colnames(mixAndMatch)]) > newdf > } > > > > predictOMatic <- function(model = NULL, fl = NULL, ...){ > nd <- newdataMaker(model, fl) > fit <- predict(model, newdata=nd, ...) > cbind(fit, nd) > } > > > > set.seed(12345) > x1 <- rnorm(100) > x2 <- rnorm(100) > x3 <- rnorm(100) > x4 <- rnorm(100) > x5 <- rpois(100, 5) > x6 <- rgamma(100, 2,1) > xcat1 <- gl(2,50, labels=c("M","F")) > xcat2 <- cut(rnorm(100), breaks=c(-Inf, 0, 0.4, 0.9, 1, Inf), > labels=c("R", "M", "D", "P", "G")) > dat <- data.frame(x1, x2, x3, x4, x5, xcat1, xcat2) > rm(x1, x2, x3, x4, xcat1, xcat2) > dat$y <- with(dat, 0.03 + 0.1*x1 + 0.1*x2 + 0.4*x3 -0.1*x4 + 0.01*x5 + > 0.1*x6 + 2*rnorm(100)) > > ##ordinary regression. > m1 <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2, data=dat) > summary(m1) > ## Use the fitted model's data frame for analysis > m1mf <- model.frame(m1) > > ## If you have rockchalk 1.5.4, run these: > library(rockchalk) > summarize(m1mf) > ## otherwise, run > summary(m1mf) > > ## First, overview for values of xcat1 > newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat1))) > > ## mix and match all combinations of xcat1 and xcat2 > newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat2), xcat2 > levels(m1mf$xcat2))) > > ## Pick some particular values for focus > newdataMaker(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D"))) > > ## Generate a newdata frame and predictions in one step > predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D"))) > > > predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")), > interval="conf") > > newdf <- predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 > c("M","D"), x1=plotSeq(dat$x1))) > > plot(y ~ x1, data= model.frame(m1)) > by(newdf, list(newdf$x2, newdf$xcat2), function(x) {lines(x$x1, x$fit)}) > > newdataMaker(m1, fl = list(x2 = c(-1,0, 1), xcat2 = c("M","D"))) > > predictOMatic(m1, fl = list(x2 = range(dat$x2), xcat2 = c("M","D"))) > > newdf <- predictOMatic(m1, fl = list(x2 = quantile(dat$x2), xcat2 = c("M","D"))) > plot(y ~ x2 , data=model.frame(m1)) > > lines(y ~ x2, newdf) > > > predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), > interval="conf") > > ## just gets the new data > nd <- newdataMaker(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D"))) > > pr <- predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), > interval="conf") > > ## > m2 <- glm(xcat2 ~ x1 + x2 + x3 + xcat1, data=dat, family=binomial(logit)) > summary(m2) > m2mf <- model.frame(m2) > summarize(m2mf) > > predict(m2, newdata=data.frame(xcat1=c("M","F"), x1=c(1), > x2=mean(m2mf$x2), x3=mean(m2mf$x3)), se.fit=TRUE) > > predictOMatic(m2, fl = list(x2 = c(-1, 1), xcat1 = c("M","F")), > interval="conf", type="response") > > ############################################ > > ################Now, the example session ################### > > > newdataMaker <- function (model = NULL, fl = NULL){ > + mf <- model.frame(model) > + mterms <- terms(model) > + mfnames <- colnames(mf) > + modelcv <- centralValues(mf) > + if (sum(!names(fl) %in% mfnames) > 0) stop(cat(c("Error. The > focus list (fl) requests variables that are not included in the > original model. The names of the variables in the focus list be drawn > from this list: ", mfnames))) > + ## Consider "padding" range of fl for numeric variables so that we > + ## get newdata objects including the min and max values. > + mixAndMatch <- expand.grid(fl) > + mixAndMatch$combo <- 1:nrow(mixAndMatch) > + newdf <- cbind(mixAndMatch, modelcv[ ,!colnames(modelcv) %in% > colnames(mixAndMatch)]) > + newdf > + } > > predictOMatic <- function(model = NULL, fl = NULL, ...){ > + nd <- newdataMaker(model, fl) > + fit <- predict(model, newdata=nd, ...) > + cbind(fit, nd) > + } > > set.seed(12345) > > x1 <- rnorm(100) > > x2 <- rnorm(100) > > x3 <- rnorm(100) > > x4 <- rnorm(100) > > x5 <- rpois(100, 5) > > x6 <- rgamma(100, 2,1) > > xcat1 <- gl(2,50, labels=c("M","F")) > > xcat2 <- cut(rnorm(100), breaks=c(-Inf, 0, 0.4, 0.9, 1, Inf), labels=c("R", "M", "D", "P", > "G")) > > dat <- data.frame(x1, x2, x3, x4, x5, xcat1, xcat2) > > rm(x1, x2, x3, x4, xcat1, xcat2) > > dat$y <- with(dat, 0.03 + 0.1*x1 + 0.1*x2 + 0.4*x3 -0.1*x4 + 0.01*x5 + 0.1*x6 + > 2*rnorm(100)) > > m1 <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2, data=dat) > > summary(m1) > > Call: > lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + xcat1 + xcat2, > data = dat) > > Residuals: > Min 1Q Median 3Q Max > -3.7624 -1.1740 -0.0753 0.9174 4.9910 > > Coefficients: > Estimate Std. Error t value Pr(>|t|) > (Intercept) -0.24744 0.64684 -0.383 0.702989 > x1 -0.44055 0.16902 -2.607 0.010741 * > x2 -0.21342 0.18811 -1.135 0.259644 > x3 0.74456 0.21434 3.474 0.000798 *** > x4 0.20394 0.20454 0.997 0.321453 > x5 0.01176 0.09374 0.125 0.900426 > x6 0.16072 0.14412 1.115 0.267809 > xcat1F 0.81233 0.38271 2.123 0.036599 * > xcat2M 1.03244 0.51701 1.997 0.048922 * > xcat2D 0.13699 0.64662 0.212 0.832712 > xcat2P -0.43391 0.83018 -0.523 0.602517 > xcat2G 0.16521 0.49741 0.332 0.740567 > --- > Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 > > Residual standard error: 1.805 on 88 degrees of freedom > Multiple R-squared: 0.23, Adjusted R-squared: 0.1338 > F-statistic: 2.39 on 11 and 88 DF, p-value: 0.01216 > > > m1mf <- model.frame(m1) > > library(rockchalk) > Loading required package: MASS > Loading required package: car > Loading required package: nnet > > summarize(m1mf) > $numerics > x1 x2 x3 x4 x5 x6 y > 0% -2.3800 -2.12400 -2.29000 -2.5820 1.000 0.1373 -4.8140 > 25% -0.5901 -0.51290 -0.64720 -0.3191 3.000 0.8342 -0.7684 > 50% 0.4837 0.02596 0.01349 0.3466 5.000 1.5240 0.6293 > 75% 0.9004 0.69840 0.63120 0.6968 6.000 2.5230 1.7550 > 100% 2.4770 2.65600 2.74700 2.2680 11.000 5.7050 5.7990 > mean 0.2452 0.04523 -0.04621 0.2153 4.830 1.8200 0.6151 > sd 1.1150 1.01100 0.93230 0.9707 2.050 1.3130 1.9390 > var 1.2430 1.02300 0.86930 0.9422 4.203 1.7230 3.7620 > NA's 0.0000 0.00000 0.00000 0.0000 0.000 0.0000 0.0000 > > $factors > xcat1 xcat2 > M :50 R :46.000 > F :50 M :19.000 > NA's : 0 G :19.000 > entropy : 1 D :10.000 > normedEntropy: 1 P : 6.000 > NA's : 0.000 > entropy : 2.002 > normedEntropy: 0.862 > > > ## First, overview for values of xcat1 > > newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat1))) > xcat1 combo y x1 x2 x3 x4 x5 > 1 M 1 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 2 F 2 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > x6 xcat2 > 1 1.819744 R > 2 1.819744 R > > newdataMaker(m1, fl = list(xcat1 = levels(m1mf$xcat2), xcat2 = levels(m1mf$xcat2))) > xcat1 xcat2 combo y x1 x2 x3 x4 x5 > 1 R R 1 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 2 M R 2 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 3 D R 3 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 4 P R 4 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 5 G R 5 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 6 R M 6 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 7 M M 7 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 8 D M 8 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 9 P M 9 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 10 G M 10 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 11 R D 11 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 12 M D 12 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 13 D D 13 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 14 P D 14 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 15 G D 15 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 16 R P 16 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 17 M P 17 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 18 D P 18 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 19 P P 19 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 20 G P 20 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 21 R G 21 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 22 M G 22 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 23 D G 23 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 24 P G 24 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > 25 G G 25 0.6150553 0.2451972 0.04523311 -0.04621158 0.2152759 4.83 > x6 > 1 1.819744 > 2 1.819744 > 3 1.819744 > 4 1.819744 > 5 1.819744 > 6 1.819744 > 7 1.819744 > 8 1.819744 > 9 1.819744 > 10 1.819744 > 11 1.819744 > 12 1.819744 > 13 1.819744 > 14 1.819744 > 15 1.819744 > 16 1.819744 > 17 1.819744 > 18 1.819744 > 19 1.819744 > 20 1.819744 > 21 1.819744 > 22 1.819744 > 23 1.819744 > 24 1.819744 > 25 1.819744 > > newdataMaker(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D"))) > x2 xcat2 combo y x1 x3 x4 x5 x6 > 1 0.25 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 > 2 1.00 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 > 3 0.25 D 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 > 4 1.00 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 > xcat1 > 1 M > 2 M > 3 M > 4 M > > predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D"))) > fit x2 xcat2 combo y x1 x3 x4 x5 > 1 0.98241259 0.25 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 > 2 0.82235069 1.00 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 > 3 0.08695955 0.25 D 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 > 4 -0.07310236 1.00 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 > x6 xcat1 > 1 1.819744 M > 2 1.819744 M > 3 1.819744 M > 4 1.819744 M > > predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D")), interval="conf") > fit lwr upr x2 xcat2 combo y x1 > 1 0.98241259 0.1140244 1.850801 0.25 M 1 0.6150553 0.2451972 > 2 0.82235069 -0.1039242 1.748626 1.00 M 2 0.6150553 0.2451972 > 3 0.08695955 -1.1263000 1.300219 0.25 D 3 0.6150553 0.2451972 > 4 -0.07310236 -1.3139912 1.167787 1.00 D 4 0.6150553 0.2451972 > x3 x4 x5 x6 xcat1 > 1 -0.04621158 0.2152759 4.83 1.819744 M > 2 -0.04621158 0.2152759 4.83 1.819744 M > 3 -0.04621158 0.2152759 4.83 1.819744 M > 4 -0.04621158 0.2152759 4.83 1.819744 M > > newdf <- predictOMatic(m1, fl = list(x2 = c(0.25, 1.0), xcat2 = c("M","D"), > x1=plotSeq(dat$x1))) > plot(y ~ x1, data= model.frame(m1)) > > by(newdf, list(newdf$x2, newdf$xcat2), function(x) {lines(x$x1, x$fit)}) > > : 0.25 > : M > NULL > ------------------------------------------------------------ > : 1 > : M > NULL > ------------------------------------------------------------ > : 0.25 > : D > NULL > ------------------------------------------------------------ > : 1 > : D > NULL > > newdataMaker(m1, fl = list(x2 = c(-1,0, 1), xcat2 = c("M","D"))) > x2 xcat2 combo y x1 x3 x4 x5 x6 xcat1 > 1 -1 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M > 2 0 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M > 3 1 M 3 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M > 4 -1 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M > 5 0 D 5 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M > 6 1 D 6 0.6150553 0.2451972 -0.04621158 0.2152759 4.83 1.819744 M > > predictOMatic(m1, fl = list(x2 = range(dat$x2), xcat2 = c("M","D"))) > fit x2 xcat2 combo y x1 x3 x4 > 1 1.4889658 -2.123550 M 1 0.6150553 0.2451972 -0.04621158 0.2152759 > 2 0.4689792 2.655788 M 2 0.6150553 0.2451972 -0.04621158 0.2152759 > 3 0.5935128 -2.123550 D 3 0.6150553 0.2451972 -0.04621158 0.2152759 > 4 -0.4264739 2.655788 D 4 0.6150553 0.2451972 -0.04621158 0.2152759 > x5 x6 xcat1 > 1 4.83 1.819744 M > 2 4.83 1.819744 M > 3 4.83 1.819744 M > 4 4.83 1.819744 M > > newdf <- predictOMatic(m1, fl = list(x2 = quantile(dat$x2), xcat2 = c("M","D"))) > > plot(y ~ x2 , data=model.frame(m1)) > > lines(y ~ x2, newdf) > > predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), interval="conf") > fit lwr upr x2 xcat2 combo y x1 x3 > 1 -9.635027 -28.29784 9.027789 50 M 1 0.6150553 0.2451972 -0.04621158 > 2 -11.769186 -34.16684 10.628471 60 M 2 0.6150553 0.2451972 -0.04621158 > 3 -10.530480 -29.14836 8.087395 50 D 3 0.6150553 0.2451972 -0.04621158 > 4 -12.664639 -35.01410 9.684824 60 D 4 0.6150553 0.2451972 -0.04621158 > x4 x5 x6 xcat1 > 1 0.2152759 4.83 1.819744 M > 2 0.2152759 4.83 1.819744 M > 3 0.2152759 4.83 1.819744 M > 4 0.2152759 4.83 1.819744 M > > nd <- newdataMaker(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D"))) > > pr <- predictOMatic(m1, fl = list(x2 = c(50, 60), xcat2 = c("M","D")), interval="conf") > > m2 <- glm(xcat2 ~ x1 + x2 + x3 + xcat1, data=dat, family=binomial(logit)) > > summary(m2) > > Call: > glm(formula = xcat2 ~ x1 + x2 + x3 + xcat1, family = binomial(logit), > data = dat) > > Deviance Residuals: > Min 1Q Median 3Q Max > -1.743 -1.181 0.912 1.112 1.353 > > Coefficients: > Estimate Std. Error z value Pr(>|z|) > (Intercept) 0.41371 0.29465 1.404 0.160 > x1 0.09728 0.18532 0.525 0.600 > x2 0.09622 0.20650 0.466 0.641 > x3 -0.25314 0.22816 -1.109 0.267 > xcat1F -0.57235 0.41433 -1.381 0.167 > > (Dispersion parameter for binomial family taken to be 1) > > Null deviance: 137.99 on 99 degrees of freedom > Residual deviance: 134.56 on 95 degrees of freedom > AIC: 144.56 > > Number of Fisher Scoring iterations: 4 > > > m2mf <- model.frame(m2) > > summarize(m2mf) > $numerics > x1 x2 x3 > 0% -2.3800 -2.12400 -2.29000 > 25% -0.5901 -0.51290 -0.64720 > 50% 0.4837 0.02596 0.01349 > 75% 0.9004 0.69840 0.63120 > 100% 2.4770 2.65600 2.74700 > mean 0.2452 0.04523 -0.04621 > sd 1.1150 1.01100 0.93230 > var 1.2430 1.02300 0.86930 > NA's 0.0000 0.00000 0.00000 > > $factors > xcat1 xcat2 > M :50 R :46.000 > F :50 M :19.000 > NA's : 0 G :19.000 > entropy : 1 D :10.000 > normedEntropy: 1 P : 6.000 > NA's : 0.000 > entropy : 2.002 > normedEntropy: 0.862 > > > predict(m2, newdata=data.frame(xcat1=c("M","F"), x1=c(1), x2=mean(m2mf$x2), > x3=mean(m2mf$x3)), se.fit=TRUE) > $fit > 1 2 > 0.52704389 -0.04530271 > > $se.fit > 1 2 > 0.3336836 0.3139795 > > $residual.scale > [1] 1 > > > predictOMatic(m2, fl = list(x2 = c(-1, 1), xcat1 = c("M","F")), interval="conf", > type="response") > fit x2 xcat1 combo xcat2 x1 x3 > 1 0.5873563 -1 M 1 R 0.2451972 -0.04621158 > 2 0.6330860 1 M 2 R 0.2451972 -0.04621158 > 3 0.4453938 -1 F 3 R 0.2451972 -0.04621158 > 4 0.4932835 1 F 4 R 0.2451972 -0.04621158 > > > -- > Paul E. Johnson > Professor, Political Science ? ?Assoc. Director > 1541 Lilac Lane, Room 504 ? ? Center for Research Methods > University of Kansas ? ? ? ? ? ? ? University of Kansas > http://pj.freefaculty.org ? ? ? ? ? ?http://quant.ku.edu > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.