It seems to do all I want: 1. matchs the greatest number of firms within an year and industry, by dimension 2. diferences between dimensions should not be more than 10% or/and 200 (if I want just the 10% restrition I think I could substitute 200 for a big number that is impossible to exist) 3. if there are many cases that verif y 2. then it chooses the closest. Thank you very much. Cec?lia Carmo Universidade de Aveiro - Portugal ________________________________________ De: arun [smartpink111 at yahoo.com] Enviado: s?bado, 8 de Junho de 2013 9:06 Para: Cecilia Carmo Cc: R help Assunto: Re: data Hi, Try this: final3New<-read.table(file="real_data_cecilia.txt",sep="\t") dim(final3New) #[1] 5369 5 #Inside the split within split, dummy==1 for the first row. For lists that have many rows, I selected the row with dummy==0 (from the rest) using the #condition that the absolute difference between the dimensions of those rows and the first row dimension was minimum (after I applied the first #constraint). I guess you wanted to select only a pair of rows (dummy=0 and dummy=1) for each lists. fun1<- function(dat,percent,number){ lst1<- split(dat,list(dat$year,dat$industry)) lst1New<- lapply(lst1,function(x) x[!(all(x$dummy==0)|all(x$dummy==1)),]) lst2<- lst1New[lapply(lst1New,nrow)>0] lst3<- lapply(lst2,function(x){ lapply(x$dimension,function(y){ x1<- x[(y < (x$dimension+(x$dimension*percent))) & (y > (x$dimension-(x$dimension*percent))),] }) }) lst4<- lapply(lst3,function(x){ lst<- lapply(x,function(y){ y[!all(y$dummy==0),] }) lstNew<- lst[lapply(lst,nrow)>1] lstNew1<- unique(lstNew) }) lst5<- lst4[lapply(lst4,length)>0] lst6<- lapply(lst5,function(x) { lst<- lapply(x,function(y){ y[!all(y$dummy==1),] }) lst[lapply(lst,nrow)>0] }) lst7<- lapply(lst6,function(x){ lst<- lapply(x,function(y) { x1<- y[1,] x2<- y[-1,] x3<- subset(x2,dummy==0) x4<- x3[which.min(abs(x1$dimension-x3$dimension)),] rbind(x1,x4) }) lstNew<-unique(lst) lstNew1<- lapply(lstNew,function(x){ x[abs(diff(x$dimension)) < number,] }) lstNew1[lapply(lstNew1,nrow)>0] }) lst8<- lst7[lapply(lst7,length)>0] res<- do.call(rbind,lapply(lst8,function(x){ do.call(rbind,x) }) ) row.names(res)<- 1:nrow(res) res} res10Percent<- fun1(final3New,0.1,200) dim(res10Percent) #[1] 508 5 nrow(subset(res10Percent,dummy==0)) #[1] 254 nrow(subset(res10Percent,dummy==1)) #[1] 254 head(res10Percent) # firm year industry dummy dimension #1 500622043 2004 1 1 1198 #2 501611886 2004 1 0 1208 #3 501164600 2005 1 1 1332 #4 504243349 2005 1 0 1455 #5 500862893 2006 1 1 5324 #6 501744860 2006 1 0 5453 res5Percent<- fun1(final3New,0.05,200) dim(res5Percent) #[1] 548 5 nrow(subset(res5Percent,dummy==0)) #[1] 274 nrow(subset(res5Percent,dummy==1)) #[1] 274 res5percent1<-fun1(final3New,0.05,50) dim(res5percent1) #[1] 302 5 nrow(subset(res5percent1,dummy==0)) #[1] 151 nrow(subset(res5percent1,dummy==1)) #[1] 151 Hope it helps. A.K. ________________________________ From: Cecilia Carmo <cecilia.carmo at ua.pt> To: arun <smartpink111 at yahoo.com> Sent: Friday, June 7, 2013 7:30 PM Subject: data I'm sending the data. Thank you very much. Cec?lia The code final3<-read.table(file="real data cecilia.txt",sep="\t") lst1<-split(final3,list(final3$year,final3$industry)) lst2<-lst1[lapply(lst1,nrow)>0] lst3<-lapply(lst2,function(x) lapply(x$dimension,function(y) x[(y< (x$dimension+x$dimension*0.10)) & (y> (x$dimension-x$dimension*0.10)),])) lst4<-lapply(lst3,function(x) x[lapply(x,nrow)==2]) lst5<-lapply(lst4,function(x)x[!duplicated(x)]) lst6<-lst5[lapply(lst5,length)>0] lst7<-lapply(lst6,function(x) {lst<-lapply(x,function(y) y[sum(y$dummy)==1,]);lst[lapply(lst,nrow)>0]}) res<-do.call(rbind,lapply(lst7,function(x) do.call(rbind,x))) row.names(res)<-1:nrow(res) nrow(subset(res,res$dummy==1)) nrow(subset(res,res$dummy==0))