David Hugh-Jones
2006-Mar-23 18:14 UTC
[R] Writing a function to fit ALSOS models. problem with normalization?
Dear all, Below is my attempt at a function to fit Alternate Least Squares Optimal Scaling models, as described in Young (1981) "Quantitative Analysis of Qualitative Data" and Jacoby (1999) "Levels of Measurement and Political Research: An Optimistic View". I would welcome any comments on coding style, tips & tricks etc. I also have a specific problem: the output tends to give very small coefficients, and very large fitted values for specific factor levels. Am I doing the normalization right? Cheers David library(car) # for "recode" alsos.fit = function (y, x, tol = 1e-7) { # y is a vector or factor or ordered factor # x is a data frame of vectors/factors/ordereds # we treat the y's as the first column of the matrix x = cbind(y, x) x = x[complete.cases(x),] ox = x x.facs = sapply(x, is.factor) x.ords = sapply(x, is.ordered) # start with our numeric values whatever they are x = sapply(x, as.numeric) old.SSE = Inf while(T) { # least squares regression with an intercept ols = lm.fit(cbind(rep(1,nrow(x)), x[,-1]) , x[,1]) b = ols$coefficients SSE = drop(ols$residuals %*% ols$residuals) if (old.SSE-SSE<tol) { factor.scores=list() for (i in (1:ncol(x))[x.facs]) { nm = colnames(x)[i] factor.scores[[nm]] = tapply(x[,i], ox[,i], function (foo) {return(foo[1])}) names(factor.scores[[nm]]) = levels(ox[,i]) } return(list( factor.scores=factor.scores, scaled.y=x[,1], scaled.x=x[,-1], coefficients=b, SSE=SSE, )) } old.SSE=SSE mx = nx = x mx[] = 0 nx[] = 0 for (i in (1:ncol(x))[x.facs]) { # optimal scaling if (i==1) nx[,i] = ols$fitted.values else nx[,i] = (x[,1] - cbind(rep(1,nrow(x)), x[,c(-1,-i)]) %*% b[-i])/b[i] # create within-category means tmpfac = factor(ox[,i], labels=1:nlevels(ox[,i])) catmeans = tapply(nx[,i], tmpfac, mean) # ensure ordinal values are correctly ordered if (x.ords[i]) { tmp = kruskal.ordering(nx[,i], tmpfac) tmpfac = tmp$tmpfac catmeans = tmp$catmeans } # set values to within-category means mx[,i] = catmeans[tmpfac] # normalize according to Young (1981) mx[,i] = mx[,i] * (nx[,i] %*% nx[,i]) / (mx[,i] %*% mx[,i]) } x[,x.facs] = mx[,x.facs] } } # as described in Kruskal (1964) kruskal.ordering = function(numeric.data, tmpfac) { j = 1 upact = T while(T) { catmeans = tapply(numeric.data, tmpfac, mean) # vector w as many items as tmpfac cats # have we finished? if (j>nlevels(tmpfac)) return (list(catmeans=catmeans,tmpfac=tmpfac)) if ((j==nlevels(tmpfac) || catmeans[j] <= catmeans[j+1]) && (j==1 || catmeans[j] >= catmeans[j-1])) { j=j+1 upact=T next } if (upact) { if (j < nlevels(tmpfac) && catmeans[j] > catmeans[j+1]) { tmpfac = recode(tmpfac, paste(j, ":", j+1,"='", j+1, "'", sep="")) levels(tmpfac) = 1:nlevels(tmpfac) } upact=F } else { if (j > 1 && catmeans[j] < catmeans[j-1]) { tmpfac = recode(tmpfac, paste(j-1, ":", j, "='", j, "'", sep="")) levels(tmpfac) = 1:nlevels(tmpfac) j=j-1 } upact=T } } }