# Hi,
# Try this:

final3New <- read.table(file = "real_data_cecilia.txt", sep = "\t")
final3New1 <- read.csv("real_data_cecilia_new.csv")

# Remove the rows numbered `idx` from `dat`.
#
# `dat[-idx, ]` with a zero-length `idx` selects *no* rows rather than all
# of them, so the empty case is handled explicitly and `dat` is returned
# unchanged.
dropRows <- function(dat, idx) {
  if (length(idx) == 0) {
    return(dat)
  }
  dat[-idx, ]
}

# fun2: drop rows of `dat` that are tied to exactly duplicated rows.
#
# Args:
#   dat: data frame with columns `dummy` and `dimension`. Row names are
#        converted with as.numeric() and used as row positions, so the
#        function assumes default (1..n) row names -- TODO confirm for
#        other inputs.
#
# Returns:
#   `dat` without the rows selected for removal; `dat` itself when it
#   contains no duplicated rows.
#
# Outline (as implemented below):
#   1. Flag every row that has an exact duplicate (in either direction).
#   2. dummy == 1 side: order flagged rows by `dimension`, group them two at
#      a time, add the row that follows each of them, pick one
#      dummy == 1 / neighbour pair per group, and mark the group's other
#      rows for removal.
#   3. dummy == 0 side: same idea, using the row that precedes each flagged
#      row.
#   4. Remove the union of both sets of marked rows.
fun2 <- function(dat) {
  indx <- duplicated(dat) | duplicated(dat, fromLast = TRUE)
  if (!any(indx)) {
    # Nothing is duplicated, so there is nothing to remove.
    return(dat)
  }

  dat1 <- subset(dat[indx, ], dummy == 1)
  if (nrow(dat1) > 0) {
    dat2 <- dat1[order(dat1$dimension), ]
    indx1 <- as.numeric(row.names(dat2))
    # Consecutive rows (in `dimension` order) share a group id: 1, 1, 2, 2, ...
    names(indx1) <- (seq_along(indx1) - 1) %/% 2 + 1
    # Each flagged row together with the row right after it.
    dat3 <- dat[c(indx1, indx1 + 1), ]
    dat3$id <- names(c(indx1, indx1 + 1))
    lst1 <- lapply(split(dat3, dat3$id), function(x) {
      x1 <- x[-1, ]
      # Row of x1 (other than its first) whose dimension is closest to the
      # dimension of x1's first row.
      x2 <- x1[which.min(abs(x1$dimension[1] - x1$dimension[-1])) + 1, ]
      x3 <- subset(x, dummy == 1)
      rowNx2 <- as.numeric(row.names(x2))
      rowNx3 <- as.numeric(row.names(x3))
      # dummy == 1 row whose row number is closest to x2's.
      x4 <- x3[which.min(abs(rowNx2 - rowNx3)), ]
      x5 <- rbind(x4, x2)
      # Rows of the group that are not part of the selected pair.
      x[is.na(match(row.names(x), row.names(x5))), ]
    })
    dat4 <- do.call(rbind, lst1)
    # rbind() prefixes row names with the group id ("id.row"); strip it.
    # The names are read without assigning them back, because
    # `row.names<-` refuses duplicated names.
    indxNew1 <- sort(as.numeric(unique(gsub(".*\\.", "", row.names(dat4)))))
  } else {
    # No duplicated dummy == 1 rows: nothing to remove on this side.
    indxNew1 <- numeric(0)
  }

  dat0 <- subset(dat[indx, ], dummy == 0)
  if (nrow(dat0) == 0) {
    return(dropRows(dat, indxNew1))
  }

  dat20 <- dat0[order(dat0$dimension), ]
  indx0 <- as.numeric(row.names(dat20))
  names(indx0) <- (seq_along(indx0) - 1) %/% 2 + 1
  # Each flagged row together with the row right before it.
  dat30 <- dat[c(indx0 - 1, indx0), ]
  dat30$id <- names(c(indx0 - 1, indx0))
  lst0 <- lapply(split(dat30, dat30$id), function(x) {
    x1 <- subset(x, dummy == 1)
    x2 <- subset(x, dummy == 0)
    # dummy == 1 row whose dimension is closest to the dummy == 0 dimension.
    x3 <- x1[which.min(abs(x1$dimension - unique(x2$dimension))), ]
    rowNx2 <- as.numeric(row.names(x2))
    rowNx3 <- as.numeric(row.names(x3))
    # dummy == 0 row whose row number is closest to x3's.
    x4 <- x2[which.min(abs(rowNx2 - rowNx3)), ]
    x5 <- rbind(x3, x4)
    # Rows of the group that are not part of the selected pair.
    x[is.na(match(row.names(x), row.names(x5))), ]
  })
  dat40 <- do.call(rbind, lst0)
  indxNew0 <- sort(as.numeric(unique(gsub(".*\\.", "", row.names(dat40)))))

  res1Del <- dat[indxNew1, ]
  res0Del <- dat[indxNew0, ]
  indx10 <- sort(as.numeric(union(row.names(res0Del), row.names(res1Del))))
  if (length(indx10) %% 2 == 1) {
    # Odd number of rows marked: de-duplicate the marked rows by content
    # before removing them.
    res10Del <- unique(rbind(res1Del, res0Del))
    indx10New <- sort(as.numeric(row.names(res10Del)))
    dropRows(dat, indx10New)
  } else {
    dropRows(dat, indx10)
  }
}

### Old Function
# fun3: for every row that repeats an earlier row, remove it together with
# its neighbour -- the following row when dummy == 1, the preceding row when
# dummy == 0.
#
# Args:
#   dat: data frame with a `dummy` column; row names are converted with
#        as.numeric() and used as row positions.
#
# Returns:
#   `dat` without those rows; `dat` unchanged when nothing is duplicated.
fun3 <- function(dat) {
  indx <- duplicated(dat)
  dat1 <- subset(dat[indx, ], dummy == 1)
  dat0 <- subset(dat[indx, ], dummy == 0)
  indx1 <- as.numeric(row.names(dat1))
  indx11 <- sort(c(indx1, indx1 + 1))
  indx0 <- as.numeric(row.names(dat0))
  indx00 <- sort(c(indx0, indx0 - 1))
  indx10 <- sort(c(indx11, indx00))
  dropRows(dat, indx10)
}

## Applying fun1() (from previous post)
res5Percent <- fun1(final3New, 0.05, 50)
res5Percent1 <- fun1(final3New1, 0.05, 50)
res10Percent <- fun1(final3New, 0.10, 200)
res10Percent1 <- fun1(final3New1, 0.10, 200)
res20Percent <- fun1(final3New, 0.20, 100)
res20Percent1 <- fun1(final3New1, 0.20, 100)

### Applying fun2()
res5F2 <- fun2(res5Percent)
res5F2_1 <- fun2(res5Percent1)
res10F2 <- fun2(res10Percent)
res10F2_1 <- fun2(res10Percent1)
res20F2 <- fun2(res20Percent)
res20F2_1 <- fun2(res20Percent1)
# Applying fun3()
res5F3 <- fun3(res5Percent)
res5F3_1 <- fun3(res5Percent1)
res10F3 <- fun3(res10Percent)
res10F3_1 <- fun3(res10Percent1)
res20F3 <- fun3(res20Percent)
res20F3_1 <- fun3(res20Percent1)

# Names of the result objects: first the three final3New results, then the
# three final3New1 results (suffix "_1").
vec1 <- paste0(rep(c("res5F2", "res10F2", "res20F2"), 2),
               rep(c("", "_1"), each = 3))
vec2 <- paste0(rep(c("res5F3", "res10F3", "res20F3"), 2),
               rep(c("", "_1"), each = 3))

# One summary row per result object: total rows, rows with dummy == 0,
# rows with dummy == 1, and number of distinct rows.
resTbl <- data.frame(
  Dataset = rep(rep(c("final3New", "final3New1"), each = 3), 2),
  Funct = rep(c("fun2", "fun3"), each = 6),
  do.call(rbind, lapply(as.list(c(vec1, vec2)), function(x) {
    x1 <- get(x)
    c(
      N_row = nrow(x1),
      Sub0_Nrow = nrow(subset(x1, dummy == 0)),
      Sub1_Nrow = nrow(subset(x1, dummy == 1)),
      Uniq_Nrow = nrow(unique(x1))
    )
  })),
  stringsAsFactors = FALSE
)
row.names(resTbl) <- c(vec1, vec2)
resTbl
#             Dataset Funct N_row Sub0_Nrow Sub1_Nrow Uniq_Nrow
#res5F2     final3New  fun2   276       138       138       276
#res10F2    final3New  fun2   454       227       227       454
#res20F2    final3New  fun2   284       142       142       284
#res5F2_1  final3New1  fun2   288       144       144       288
#res10F2_1 final3New1  fun2   488       244       244       488
#res20F2_1 final3New1  fun2   310       155       155       310
#res5F3     final3New  fun3   276       138       138       276
#res10F3    final3New  fun3   452       226       226       452
#res20F3    final3New  fun3   284       142       142       284
#res5F3_1  final3New1  fun3   288       144       144       288
#res10F3_1 final3New1  fun3   488       244       244       488
#res20F3_1 final3New1  fun3   310       155       155       310

head(res5F2_1, 4)
#       firm year industry dummy dimension
#1 500622043 2004        1     1      1172
#2 501611886 2004        1     0      1183
#3 500778787 2004        1     1      5680
#4 500047006 2004        1     0      5692

# A.K.
#
# ________________________________
# From: Cecilia Carmo <cecilia.carmo at ua.pt>
# To: arun <smartpink111 at yahoo.com>
# Sent: Tuesday, June 11, 2013 4:36 PM
# Subject: new data
#
# Here it is.
# Cecília