Ben Rhelp
2011-Jan-05 19:58 UTC
[R] Nnet and AIC: selection of a parsimonious parameterisation
Hi All,

I am trying to use a neural network in my work, but I am not sure about my approach to selecting a parsimonious model. In R's nnet, the AIC has not been defined for a feed-forward neural network with a single hidden layer. Is this because it does not make sense mathematically in this case? For example, is the pseudo code below sensible?

Thanks in advance for your help. I am sorry if this has been answered before, but I haven't found an answer for this in the archive.

Below, I have added an implementation of this idea based on the code of chapter 8 of MASS (Modern Applied Statistics with S).

Cheers,
Ben

--------------------------------------------------------------------------------
Pseudo code
--------------------------------------------------------------------------------

Define RSS as:

    RSS = (1 - alpha) * RSS(identification set) + alpha * RSS(validation set)

and AIC as:

    AIC = 2*np + N*log(RSS)

where np is the number of non-null parameters of the neural network and N is
the sample size (based on
http://en.wikipedia.org/wiki/Akaike_information_criterion).

Assuming a feed-forward neural network with a single hidden layer and a
maximum number of neurons (maxSize):

    For size = 1 to maxSize
        Optimise the decay
        Select the neural network with the smallest AIC for the given size
        and decay, using random starting parameterisations and a random
        identification set
        For the lowest to the largest diagonal element of the Hessian,
            Set the corresponding parameter to 0
            If AIC(i) > AIC(i-1), break
    The neural network selected is the one with the smallest AIC.
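To make the two definitions above concrete, here is a minimal self-contained
sketch of the weighted RSS and the AIC for a single fitted network (made-up
data and names, not the cpus example; `val` marks the validation rows):

    library(nnet)
    set.seed(1)
    d     <- data.frame(x = runif(60))
    d$y   <- sin(2 * pi * d$x) + rnorm(60, sd = 0.1)
    val   <- seq_len(60) %in% sample(60, 6)   # one fold held out for validation
    alpha <- 9/10
    fit   <- nnet(y ~ x, d[!val, ], size = 3, linout = TRUE,
                  maxit = 500, trace = FALSE)
    RSS <- (1 - alpha) * sum((d$y[!val] - predict(fit, d[!val, ]))^2) +
           alpha       * sum((d$y[ val] - predict(fit, d[ val, ]))^2)
    np  <- sum(fit$wts != 0)                  # non-null parameters
    AIC <- 2 * np + nrow(d) * log(RSS)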
--------------------------------------------------------------------------------
An example based on the cpus data in Chapter 8 of MASS
--------------------------------------------------------------------------------

library(nnet)
library(MASS)

## From Chapter 6, for comparisons
set.seed(123)
cpus.samp <- c(3, 5, 6, 7, 8, 10, 11, 16, 20, 21, 22, 23, 24, 25, 29, 33,
               39, 41, 44, 45, 46, 49, 57, 58, 62, 63, 65, 66, 68, 69, 73,
               74, 75, 76, 78, 83, 86, 88, 98, 99, 100, 103, 107, 110, 112,
               113, 115, 118, 119, 120, 122, 124, 125, 126, 127, 132, 136,
               141, 144, 146, 147, 148, 149, 150, 151, 152, 154, 156, 157,
               158, 159, 160, 161, 163, 166, 167, 169, 170, 173, 174, 175,
               176, 177, 183, 184, 187, 188, 189, 194, 195, 196, 197, 198,
               199, 202, 204, 205, 206, 208, 209)
cpus2 <- cpus[, 2:8]  # excludes names, authors' predictions
attach(cpus2)
cpus3 <- data.frame(syct = syct - 2, mmin = mmin - 3, mmax = mmax - 4,
                    cach = cach/256, chmin = chmin/100, chmax = chmax/100,
                    perf)
detach()

CVnn.cpus <- function(formula, data = cpus3[cpus.samp, ], maxSize = 10,
                      decayRange = c(0, 0.2), nreps = 5, nifold = 10,
                      alpha = 9/10, linout = TRUE, skip = TRUE,
                      maxit = 1000, ...)
{
    ## nreps = number of attempts to fit a nnet model with randomly chosen
    ## initial parameters; the one with the smallest RSS on the training
    ## data is then chosen.

    ## Prune weights one at a time, in order of increasing Hessian
    ## diagonal, for as long as the AIC keeps decreasing.
    nnWtsPrunning <- function(nn, data, alpha, i)
    {
        truth <- log10(data$perf)
        RSS <- (1 - alpha) * sum((truth[ri != i] - predict(nn, data[ri != i, ]))^2) +
               alpha * sum((truth[ri == i] - predict(nn, data[ri == i, ]))^2)
        AIC <- 2 * sum(nn$wts != 0) + length(data$perf) * log(RSS)
        nn.tmp <- nn
        for (j in 1:length(nn$wts)) {
            nn.tmp$wts[order(diag(nn.tmp$Hessian))[j]] <- 0
            RSS.tmp <- (1 - alpha) * sum((truth[ri != i] - predict(nn.tmp, data[ri != i, ]))^2) +
                       alpha * sum((truth[ri == i] - predict(nn.tmp, data[ri == i, ]))^2)
            AIC.tmp <- 2 * sum(nn.tmp$wts != 0) + length(data$perf) * log(RSS.tmp)
            if (is.nan(AIC.tmp) || AIC.tmp > AIC) {
                cat("\n j", j, "AIC", AIC.tmp, "AIC_1", AIC, "\n")
                break
            } else {
                nn <- nn.tmp; AIC <- AIC.tmp; RSS <- RSS.tmp
            }
        }
        list(choice = sqrt(RSS/100), nparam = sum(nn$wts != 0), AIC = AIC, nn = nn)
    }

    ## Modified function for optimisation: with optimFlag = TRUE it returns
    ## the weighted RSS only, so it can serve as the objective for optimize()
    CVnn1 <- function(decay, formula, data, nreps = 1, ri, size, linout,
                      skip, maxit, optimFlag = FALSE, alpha)
    {
        truth <- log10(data$perf)
        nn <- nnet(formula, data[ri != 1, ], trace = FALSE, size = size,
                   decay = decay, linout = linout, skip = skip,
                   maxit = maxit, Hess = TRUE)
        RSS <- (1 - alpha) * sum((truth[ri != 1] - predict(nn, data[ri != 1, ]))^2) +
               alpha * sum((truth[ri == 1] - predict(nn, data[ri == 1, ]))^2)
        ii <- 1
        for (i in sort(unique(ri))) {
            for (rep in 1:nreps) {
                nn.tmp <- nnet(formula, data[ri != i, ], trace = FALSE,
                               size = size, decay = decay, linout = linout,
                               skip = skip, maxit = maxit, Hess = TRUE)
                RSS.tmp <- (1 - alpha) * sum((truth[ri != i] - predict(nn.tmp, data[ri != i, ]))^2) +
                           alpha * sum((truth[ri == i] - predict(nn.tmp, data[ri == i, ]))^2)
                if (RSS.tmp < RSS) { RSS <- RSS.tmp; nn <- nn.tmp; ii <- i }
            }
        }
        if (optimFlag) {
            RSS
        } else {
            prn <- nnWtsPrunning(nn, data, alpha, ii)
            list(choice = prn$choice, nparam = prn$nparam,
                 nparaminit = length(nn$wts), AIC = prn$AIC, nn1 = prn$nn)
        }
    }

    maxSize <- maxSize + 1
    choice <- numeric(maxSize); nparam <- numeric(maxSize)
    lambdaj <- numeric(maxSize); AIC <- numeric(maxSize)
    nparamInit <- numeric(maxSize)
    ri <- sample(nifold, nrow(data), replace = TRUE)
    size <- seq(maxSize - 1, 0)
    minAIC <- Inf
    for (j in 1:maxSize) {
        ## Optimise the decay for this size; the identification and
        ## validation sets are weighted equally here (alpha = 0.5)
        tlambda <- optimize(CVnn1, decayRange, tol = 0.0001, formula, data,
                            nreps = nreps, ri = ri, size = size[j],
                            linout = linout, skip = skip, maxit = maxit,
                            optimFlag = TRUE, alpha = 0.5)
        lambdaj[j] <- tlambda[[1]]
        cat(" size =", size[j], "decay =", round(tlambda[[1]], 3), "\n")
        nntmp <- CVnn1(decay = lambdaj[j], formula, data, nreps = nreps,
                       ri = ri, size = size[j], alpha = alpha,
                       linout = linout, skip = skip, maxit = maxit, ...)
        choice[j] <- nntmp$choice; nparam[j] <- nntmp$nparam
        nparamInit[j] <- nntmp$nparaminit; AIC[j] <- nntmp$AIC
        ## keep the best (smallest-AIC) network seen so far
        if (nntmp$AIC <= minAIC) { minAIC <- nntmp$AIC; nn <- nntmp$nn1 }
    }
    ## Display the result (choice is already sqrt(RSS/100))
    tr <- cbind(size = size, decay = lambdaj, fit = choice,
                nparam = nparam, nparamInit = nparamInit, AIC = AIC)
    modelOptimAIC <- which.min(AIC)
    list(tr = tr, modelOptimAIC = modelOptimAIC, nn = nn)
}

(nn.res <- CVnn.cpus(log10(perf) ~ ., data = cpus3[cpus.samp, ]))
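For completeness, the Hessian-ranked pruning step can also be exercised on
its own; here is a toy sketch (made-up data, plain rather than weighted RSS
for brevity; like nnWtsPrunning above, it relies on predict.nnet picking up
the modified $wts vector):

    library(nnet)
    set.seed(2)
    d   <- data.frame(x1 = runif(50), x2 = runif(50))
    d$y <- d$x1 + 2 * d$x2 + rnorm(50, sd = 0.1)
    fit <- nnet(y ~ ., d, size = 2, linout = TRUE, decay = 0.01,
                maxit = 500, trace = FALSE, Hess = TRUE)

    rss <- function(f) sum((d$y - predict(f, d))^2)
    aic <- function(f) 2 * sum(f$wts != 0) + nrow(d) * log(rss(f))

    ord <- order(diag(fit$Hessian))   # weights by increasing curvature
    for (j in seq_along(ord)) {
        trial <- fit
        trial$wts[ord[j]] <- 0        # zero the flattest remaining weight
        if (is.nan(aic(trial)) || aic(trial) > aic(fit)) break
        fit <- trial                  # accept the pruned network
    }
    c(nparam = sum(fit$wts != 0), AIC = aic(fit))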