Dear Muhammad:
You have got to be kidding!
Learn to use R's debugging tools! See the section in the R Language
Definition Manual on Debugging and get to work.
-- Bert
On Thu, May 16, 2013 at 7:42 PM, Muhammad Azam <mazam72@yahoo.com> wrote:
> Dear friends!
> Hope all of you are fine. I am doing work on decision trees. I
> have made following program to construct classification trees. It works but
> some times it gives an error message e.g. "negative extents to
matrix". I
> tried to solve the issue but could not. Now, I need help, how to avoid this
> message? Thanks
>
> ####################################
> ds=iris; dl=nrow(ds) # ds: dataset, dl: data length
> c1=ds[,1]; c2=ds[,2]; c3=ds[,3]; c4=ds[,4]; c5=ds[,5]; id=c(1:dl);
> iris=cbind(c1,c2,c3,c4,c5)
> tec1NF = 0; tec2NF = 0; lmsum = 0; indTrainlist = list(); indTestlist =
list()
> options(digits=6)
>
>
CTPrundUnprund=function(dataset,samplesize,noofsamples,noofpredictors,response,method,nc,nlvls,nnodes)
> {
> vlf1=list(); vlf2=array(0,dim=c(noofsamples,nlvls,nnodes)); cl=list();
> uncl=array(0,dim=c(noofsamples,nlvls,1)); vvlf1=list(); nodeclas=c();
> tdi=c(); eTraining=c(); nodeside=c(); unclp=array(0,dim=c(nlvls,1));
> vvlf2=array(0,dim=c(noofsamples,nlvls,nnodes)); nodeclasp=c();
nodesidep=c()
>
> for (j in 1:noofsamples)
> {
> groupindex = 1:nrow(dataset)
> index = sample(groupindex,samplesize, replace=TRUE)
> indTrain=unique(sort(index)); indTest=groupindex[-indTrain];
> m=dataset[indTrain,]; lm <- nrow(m); lmsum <<- lmsum + lm;
> #w=dataset[indTest,];
> tdi[j]=list(indTrain); indTrainlist[j] = list(indTrain);
> indTestlist[j] = list(indTest)
> mcat=table(m[,response])
> mcat=data.frame(mcat) # or mcat=list(mcat)
> mcat=mcat[,1]
> mcat=matrix(mcat)
> mcat=as.real(mcat)
> print(j)
> d1=0
> sv=function(dataset)
> {
> gnew=c(); ggini=c(); gtwoing=c(); gentropy=c(); val=c(); mGnewLr=c();
> mGginiLr=c(); mGtwoingLr=c(); mGentropyLr=c(); sPjnew=c(); spLnew=c();
> spRnew=c(); matrGnew=c(); matrGgini=c(); matrGtwoing=c(); matrGentropy=c();
> sPjgini=c(); spLgini=c(); spRgini=c(); sAbsDiff=c(); sPj2=c(); spL2=c();
> spR2=c(); sPj=c(); spL=c(); spR=c(); logPj=c(); logpR=c(); logpL=c();
> variables=c()
> m=dataset
> mcat=table(m[,response])
> mcat=data.frame(mcat) # or
> mcat=list(mcat)
> mcat=mcat[,1]
> mcat=matrix(mcat)
> mcat=as.real(mcat)
>
> Pj=table(m[,response])/nrow(m)
> Pj
> # Left Node
> for (variable in 1:noofpredictors)
> {
> gnew=NULL; ggini=NULL; gtwoing=NULL; gentropy=NULL; val=NULL
> varSort=sort(unique(dataset[,variable]))
> #varSort1=sort(unique(m[,variable])); minval <- min(varSort1); maxval
<-
> max(varSort1); r <- maxval-minval; K <-
> round(1+(3.3*log(nrow(m),10)),digits=0)
> #; H <- r/K; varSort <- seq(minval, maxval,H)
> L=length(varSort)
> for (k in 1 : (L-1))
> {
> val[k]=varSort[k]
> x=m[m[,variable]<=val[k]] # Obtained set of data with all columns
> which satisfies the condition
> lenx=length(x) # Length of
> x
> lenx=lenx/ncol(m) # xl/no of columns
> x_left=matrix(x,lenx) # matrix(x,xr)
> p1=lenx/nrow(m)
> y=x_left[,response]
> y1=table(y)
> feqy1=matrix(y1)
> y1=data.frame(y1)
> y1=y1[,1]
> y1=matrix(y1)
> y1=as.real(y1)
> z=match(mcat,y1)
> y1=y1[z]
> y1[is.na(y1)]=0
> feqy1=feqy1[z]
> feqy1[is.na(feqy1)]=0
> cbind(y1, feqy1)
> pL=feqy1/length(x_left[,response])
> pL
>
> # Right Node
> X=m[m[,variable]>val[k]] # Obtained set of data with all columns
which
> satisfies the condition
> lenX=length(X) # Length of x
> lenX=lenX/ncol(m) # xl/no of
> columns
> x_right=matrix(X,lenX) # matrix(x,xr)
> p2=lenX/nrow(m)
> Y=x_right[,response]
> Y1=table(Y)
> feqy2=matrix(Y1)
> Y1=data.frame(Y1)
> Y1=Y1[,1]
> Y1=matrix(Y1)
> Y1=as.real(Y1)
> Z=match(mcat,Y1)
> Y1=Y1[Z]
> Y1[is.na(Y1)]=0
> feqy2=feqy2[Z]
> feqy2[is.na(feqy2)]=0
> cbind(Y1, feqy2)
> pR=feqy2/length(x_right[,response])
> pR
>
> for ( i in 1: length(Pj))
> {
> sPjnew[i]=Pj[i]*exp(Pj[i]); spRnew[i]=pR[i]*exp(pR[i]);
> spLnew[i]=pL[i]*exp(pL[i]) # New method
> sPjgini[i]=(Pj[i]^2); spRgini[i]=(pR[i]^2); spLgini[i]=(pL[i]^2) #
> Gini method
> sAbsDiff[i]=abs(pL[i]-pR[i]) # Twoing method
> if (Pj[i]!=0) # Entropy method
> {
> logPj[i]=log(Pj[i])
> }
> else
> { logPj[i]= 0
> }
> if (pR[i]!=0)
> {
> logpR[i]=log(pR[i])
> }
> else
> { logpR[i]= 0
> }
> if
> (pL[i]!=0)
> {
> logpL[i]=log(pL[i])
> }
> else
> {
> logpL[i]= 0
> }
> sPj2[i]=(Pj[i]^2); spR2[i]=(pR[i]^2); spL2[i]=(pL[i]^2);
> sPj[i]=-(Pj[i]*logPj[i]); spL[i]=-(pL[i]*logpL[i]);
spR[i]=-(pR[i]*logpR[i])
> }
> G1=exp(-1)*(p1*sum(spLnew)+p2*sum(spRnew)-sum(sPjnew));
> G2=p1*sum(spLgini)+p2*sum(spRgini)-sum(sPjgini)
> G3=0.25*p1*p2*(sum(sAbsDiff))^2; G4=sum(sPj)-(p1*sum(spL))-(p2*sum(spR))
>
> gnew[k]=G1; ggini[k]=G2; gtwoing[k]=G3; gentropy[k]=G4
>
> }
> gnew; ggini; gtwoing; gentropy; val
>
> ind1=sort.list(gnew); ind2=sort.list(ggini); ind3=sort.list(gtwoing);
> ind4=sort.list(gentropy)
> gnew=gnew[ind1]; ggini=ggini[ind2]; gtwoing=gtwoing[ind3];
> gentropy=gentropy[ind4]
> valGnew=val[ind1]; valGgini=val[ind2]; valGtwoing=val[ind3];
> valGentropy=val[ind4]
>
> cmGnew=cbind(valGnew,gnew); cmGgini=cbind(valGgini,ggini);
> cmGtwoing=cbind(valGtwoing,gtwoing); cmGentropy=cbind(valGentropy,gentropy)
> mGnew=matrix(cmGnew,length(gnew)); mGgini=matrix(cmGgini,length(ggini));
> mGtwoing=matrix(cmGtwoing,length(gtwoing));
> mGentropy=matrix(cmGentropy,length(gentropy))
>
> mGnewLr[variable]=list(mGnew[length(gnew),]);
> mGginiLr[variable]=list(mGgini[length(ggini),]);
> mGtwoingLr[variable]=list(mGtwoing[length(gtwoing),]);
> mGentropyLr[variable]=list(mGentropy[length(gentropy),])
> }
> cbnew=do.call('cbind',mGnewLr);
> cbgini=do.call('cbind',mGginiLr);
cbtwoing=do.call('cbind',mGtwoingLr);
> cbentropy=do.call('cbind',mGentropyLr)
>
> for(i in 1:noofpredictors){variables[i]=i}
>
> cb11=cbnew[1,]; cb12=cbnew[2,]; cb21=cbgini[1,]; cb22=cbgini[2,];
> cb31=cbtwoing[1,]; cb32=cbtwoing[2,]; cb41=cbentropy[1,];
cb42=cbentropy[2,]
>
> indexGnew=sort.list(cb12); indexGgini=sort.list(cb22);
> indexGtwoing=sort.list(cb32); indexGentropy=sort.list(cb42)
> cb11=cb11[indexGnew]; cb12=cb12[indexGnew]
> cb21=cb21[indexGgini]; cb22=cb22[indexGgini]
> cb31=cb31[indexGtwoing]; cb32=cb32[indexGtwoing]
> cb41=cb41[indexGentropy]; cb42=cb42[indexGentropy]
> varGnew=variables[indexGnew]; varGgini=variables[indexGgini];
> varGtwoing=variables[indexGtwoing]; varGentropy=variables[indexGentropy]
> cbGnew=cbind(varGnew,cb11,cb12)
> cbGgini=cbind(varGgini,cb21,cb22)
> cbGtwoing=cbind(varGtwoing,cb31,cb32)
> cbGentropy=cbind(varGentropy,cb41,cb42)
> mGnew=matrix(cbGnew,length(cb12))
> mGgini=matrix(cbGgini,length(cb22))
> mGtwoing=matrix(cbGtwoing,length(cb32))
> mGentropy=matrix(cbGentropy,length(cb42))
> matrGnew=mGnew[length(cb12),]
> matrGgini=mGgini[length(cb22),]
> matrGtwoing=mGtwoing[length(cb32),]
> matrGentropy=mGentropy[length(cb42),]
> variableimportance=list(matrGnew,matrGgini,matrGtwoing,matrGentropy)
> variableimportance
> }
> #sv(m) # Passing parameters
> LN=function(dataset,variable,value)
> {
>
> x_left=c()
> m=dataset
> x=m[m[,variable]<=value] # Obtained set of data with all columns
> which satisfies the condition
> lenx=length(x) # Length of x
> lenx=lenx/ncol(m) # xl/no of columns
> x_left=matrix(x,lenx) # matrix(x,xr)
> return(x_left)
> }
> RN=function(dataset,variable,value)
> {
> x_right=c()
> m=dataset
> X=m[m[,variable]>value] # Obtained set of data with all columns
which
> satisfies the condition
> lengX=length(X) # Length of x
> lengX=lengX/ncol(m) # xl/no of columns
> x_right=matrix(X,lengX) # Function "SpVal" ends
> return(x_right)
> }
> e=0; n <- 0; vr<-c(); vv<-c(); vl=NULL;
> vl=array(0,dim=c(nc,nlvls,nnodes)); vvl=NULL;
> vvl=array(0,dim=c(nc,nlvls,nnodes)); vlf=NULL; vlf=list(); vvlf=NULL;
> vvlf=list(); trec1N = 0
> func=function(data,level)
> {
> var1=c()
> f=table(data[,response]); wf=as.real(names(which.max(f))); trecN <-
> nrow(data)-max(table(data[,response]));
> for(q in 1:response){var1[q]=var(data[,q])}; var1[is.na(var1)]=0;
> if (any(var1==0) || nrow(data)<=5)
> {
> n <<- n + 1
> if (level > 0){ for (a in 1 : level)
> {
> vl[wf,a,n] <<- vr[a]; vvl[wf,a,n] <<- vv[a] # vl=variable
level,
> vvl=variable value level
> }}
> trec1N <<- trec1N + trecN
>
> return()
> }else
> {
> data1 = data; s=sv(data1)[[method]]; s1 <- s[1]; s2=s[2]; level <-
> level + 1
>
> vr[level] <<- s1; vv[level] <<- s2 # vv=variable value
> data=LN(data1,s1,s2)
> func(data,level)
> data=RN(data1,s1,s2)
> func(data,level)
> }
> }
> func(m,0) # Passing parameters
> eTraining[j] <- trec1N
> cl[[j]]=apply(vl,3,function(vl) which(rowSums(vl)!=0))[1:n]; for(i in
> 1:n){uncl[j,i,1]=unlist(cl[[j]][i])}
> for(i in 1:nnodes){for(ii in 1:nc){if(any(vl[,,i][ii,]!=0)){vlf[[i]] <-
> vl[,,i][ii,]; vvlf[[i]] <- vvl[,,i][ii,]}}}; vlf1[[j]] <-
> do.call('rbind',vlf); vvlf1[[j]] <-
> do.call('rbind',vvlf)
> }; junk <- sapply(vlf1,function(i) !is.null(i))
> l=vlf1[junk] # l for variable and ll for value selection
> x <- sapply(1:length(l), function(x) {
> sum(sapply(l, function(y) {
> if ( nrow(l[[x]]) != nrow(y) | ncol(l[[x]]) != ncol(y) ) FALSE
> else sum(y != l[[x]]) == 0
> }))
> } ); varseq.max <- max(x)
> wxmax <- which(x==max(x)); lwxmax_var <- length(wxmax);
> ll=vvlf1[junk]; llfresh <- ll[wxmax]; lfresh <- l[wxmax]
> # l for value and ll for variable selection
> x <- sapply(1:length(llfresh), function(x) {
> sum(sapply(llfresh, function(y) {
> if ( nrow(llfresh[[x]]) != nrow(y) | ncol(llfresh[[x]]) != ncol(y) )
> FALSE
> else sum(y != llfresh[[x]]) == 0
> }))
> } ); valseq.max <- max(x)
> wxmax1 <- which(x==max(x)); lwxmax_val <- length(wxmax1); bestsamples
<-
> wxmax[wxmax1]; eTrainbest <- eTraining[bestsamples];
> weTrainmin <- which(eTrainbest == min(eTrainbest)); stepwisemax <-
> cbind(varseq.max,valseq.max); lweTrain_min <- length(weTrainmin);
> lastweTrainmin <- weTrainmin[lweTrain_min]; bestsamplesf <-
> bestsamples[weTrainmin]; lngth <- length(bestsamplesf); finalsample
<-
> bestsamplesf[lngth];
> mm <- dataset[tdi[[finalsample]],]; l1 <- l[[finalsample]]; indxl1
<-
> colSums(l1)!=0; var <- matrix(l1[,indxl1],nrow=nrow(l1));
> allmaxfreq <- cbind(lwxmax_var,lwxmax_val,lweTrain_min); ll2 <-
> ll[[finalsample]]; indxll2 <- colSums(ll2)!=0;
> val <- round(matrix(ll2[,indxll2],nrow=nrow(ll2)),digits=4);
> mat <-
as.data.frame(matrix(paste(var,"(",val,")",sep=""),nrow=nrow(var)));
> print("Sample number having max freq"); print(wxmax);
print("Samples index
> having max value-wise freq from above samples"); print(wxmax1);
> print("length of max and same frequencies"); print(allmaxfreq);
> print("step-wise most repeated variable-value sequence");
> print(stepwisemax);
> print("Sample index with min error"); print(weTrainmin);
print("final
> sample"); print(finalsample);
> print("Each row of matrix is terminal node, L for left and R for right
> node")
> indTrainfinal = indTrainlist[[finalsample]]; indTestfinal =
indTestlist[[finalsample]]
> mm = dataset[indTrainfinal,]; ww = dataset[indTestfinal,]
> #print(indTrainfinal); print(indTestfinal)
> e=0; n <- 0; avelength <- lmsum/noofsamples; units <- c();
misc.units <-
> c(); deviance <- c()
> func=function(data,testdata,level)
> {
> var1=c(); #print(cbind(ncol(testdata),nrow(testdata)))
> y=table(testdata[,response]); pp=y/nrow(testdata)
> f=matrix(y);
> pp=matrix(pp)
> Y=as.real(names(y))
> z=match(mcat,Y); Y=Y[z]; Y[is.na(Y)]=0
> f=f[z]; f[is.na(f)]=0; sumf <- sum(f)
> pp=pp[z]; pp[is.na(pp)]=0
> devN=f*log(pp); devN[is.na(devN)]=0; dTest=(-2)*sum(matrix(devN));
> fr=table(data[,response]); wf=as.real(names(which.max(fr)));
> tecN <- nrow(testdata)-max(table(testdata[,response])); for(q in
> 1:response){var1[q]=var(data[,q])}; var1[is.na(var1)]=0
> if (any(var1==0) || nrow(data)<=5)
> {
> n <<- n + 1; nodeclas[n] <<- wf; nodeside[n] <<- side
> tec1NF <<- tec1NF + tecN; units[n] <<- sumf; misc.units[n]
<<- tecN;
> deviance[n] <<- dTest
>
> return()
> }else
> {
> data1 = data; data2 = testdata; s=sv(data1)[[method]]; s1 <- s[1];
> s2=s[2]; level <- level +
> 1
>
> data=LN(data1,s1,s2); testdata=LN(data2,s1,s2); side <<-
c("L")
> func(data,testdata,level)
> data=RN(data1,s1,s2); testdata=RN(data2,s1,s2); side <<-
c("R")
> func(data,testdata,level)
> }
> }
> func(mm,ww,0)
> deviance <- round(deviance,digits=2); LorRnode <- nodeside; class
<-
> nodeclas
> resMat <- cbind(mat,LorRnode,units,misc.units,deviance,class);
> print("un-pruned tree"); print(resMat)
> eTrain <- round(mean(eTraining)/avelength*100,digits=2); eTest <-
> round(tec1NF/nrow(ww)*100,digits=2); dev.sum <- sum(deviance);
> print(cbind(eTrain,eTest,dev.sum))
> n <- 0; vr1<-c(); vv1<-c(); vlp=array(0,dim=c(nc,nlvls,nnodes));
> vvlp=array(0,dim=c(nc,nlvls,nnodes)); avelength <- lmsum/noofsamples;
unit
> <- c();
> misc.unit <- c(); deviancep <- c(); wf1 <- 0.1; wf2 <- 0.2;
vlfp=list();
> vvlfp=list(); #esum <- c("esum")
> funcp=function(data,testdata,levelp)
> {
> var1=c(); var1l=c(); var1r=c()
> y=table(testdata[,response]); pp=y/nrow(testdata)
> f=matrix(y); pp=matrix(pp)
> Y=as.real(names(y))
> z=match(mcat,Y); Y=Y[z]; Y[is.na(Y)]=0
> f=f[z]; f[is.na(f)]=0; sumf <- sum(f)
> pp=pp[z]; pp[is.na(pp)]=0
> devN=f*log(pp); devN[is.na(devN)]=0; dTest=(-2)*sum(matrix(devN));
> fr=table(data[,response]); wf=as.real(names(which.max(fr)));
> tecN <- nrow(testdata)-max(table(testdata[,response])); e <-
> nrow(data)-max(table(data[,response]))
> for(q in 1:response){var1[q]=var(data[,q])}; var1[is.na(var1)]=0
> if(nrow(data) > 5 && all(var1 != 0)){ss=sv(data)[[method]];
ss1=ss[1]; ss2=ss[2]; dat=LN(data,ss1,ss2);
> n1=nrow(dat);
> for(q1 in 1:response){var1l[q1]=var(dat[,q1])}; var1l[is.na(var1l)]=0; e1
> <- nrow(dat)-max(table(dat[,response])); dat1=RN(data,ss1,ss2);
> n2=nrow(dat1);
> for(q2 in 1:response){var1r[q2]=var(dat1[,q2])}; var1r[is.na(var1r)]=0;
> e2 <- nrow(dat1)-max(table(dat1[,response])); f1=table(dat[,response]);
> f2=table(dat1[,response]); wf1=as.real(names(which.max(f1)));
> wf2=as.real(names(which.max(f2)))}; #esum <- sum(e1,e2)
> if (any(var1==0) || wf1==wf2 || nrow(data)<=5) # || esum==e
> {
> n <<- n + 1; nodeclasp[n] <<- wf; nodesidep[n] <<- side
> if (levelp > 0){ for (b in 1 : levelp)
> {
> vlp[wf,b,n] <<- vr1[b]; vvlp[wf,b,n] <<- vv1[b] #
vl=variable level,
> vvl=variable value
> level
> }}
> tec2NF <<- tec2NF + tecN; unit[n] <<- sumf; misc.unit[n]
<<- tecN;
> deviancep[n] <<- dTest
>
> return()
> }else
> {
> data1 = data; data2 = testdata; s=sv(data1)[[method]]; s1 <- s[1];
> s2=s[2]; levelp <- levelp + 1
>
> vr1[levelp] <<- s1; vv1[levelp] <<- s2 # vv=variable value
>
> data=LN(data1,s1,s2); testdata=LN(data2,s1,s2); side <<-
c("L")
> funcp(data,testdata,levelp)
> data=RN(data1,s1,s2); testdata=RN(data2,s1,s2); side <<-
c("R")
> funcp(data,testdata,levelp)
> }
> }
> funcp(mm,ww,0)
> clp=apply(vlp,3,function(vlp) which(rowSums(vlp)!=0))[1:n];
> for(i in 1:nnodes){for(ii in 1:nc){if(any(vlp[,,i][ii,]!=0)){vlfp[[i]]
<-
> vlp[,,i][ii,]; vvlfp[[i]] <- vvlp[,,i][ii,]}}}; vlf1p <-
> do.call('rbind',vlfp); vvlf1p <- do.call('rbind',vvlfp);
> lp=vlf1p; indlp <- colSums(lp) != 0;
lp=matrix(lp[,indlp],nrow=nrow(lp));
> llp=vvlf1p; indllp <- colSums(llp) != 0;
> llp=round(matrix(llp[,indllp],nrow=nrow(llp)),digits=4);
> print("pruned tree"); matp <-
>
as.data.frame(matrix(paste(lp,"(",llp,")",sep=""),nrow=nrow(lp)));
> deviance <- round(deviancep,digits=2); LorRnode <- nodesidep; class
<-
> nodeclasp
> resMatp <- cbind(matp,LorRnode,unit,misc.unit,deviance,class);
> print(resMatp)
> eTrain <- round(mean(eTraining)/avelength*100,digits=2); eTest <-
> round(tec2NF/nrow(ww)*100,digits=2); dev.sum <- sum(deviance);
> print(cbind(eTrain,eTest,dev.sum))
> }
> CTPrundUnprund(iris,150,1,4,5,1,3,10,10)
>
>
>
>
> best regards
>
> Muhammad Azam
> [[alternative HTML version deleted]]
>
>
> ______________________________________________
> R-help@r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide
> http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>
>
--
Bert Gunter
Genentech Nonclinical Biostatistics
Internal Contact Info:
Phone: 467-7374
Website:
http://pharmadevelopment.roche.com/index/pdb/pdb-functional-groups/pdb-biostatistics/pdb-ncb-home.htm
[[alternative HTML version deleted]]