Hi,
Try this:
# Load the two input data sets: a tab-separated text file and a CSV file.
final3New <- read.table("real_data_cecilia.txt", sep = "\t")
final3New1 <- read.csv(file = "real_data_cecilia_new.csv")
# fun2: drop rows of `dat` so that observations which occur more than once
# (fully identical rows) no longer appear in several neighbouring pairs.
#
# `dat` must contain the columns `dummy` (0/1) and `dimension` (numeric).
# The code treats a `dummy == 1` row and the row directly below it as a pair,
# and a `dummy == 0` row and the row directly above it as a pair.
# NOTE(review): this presumably reflects treated/control matching produced by
# fun1(); confirm against the caller.
#
# Duplicated rows of each dummy class are sorted by `dimension` and processed
# two at a time (the logic assumes each duplicated observation occurs exactly
# twice). For every such group one pair is kept and the remaining rows of the
# group are removed.
#
# Returns `dat` without the removed rows (original row names are kept). If
# there is nothing to remove, `dat` is returned unchanged.
fun2 <- function(dat) {
  # Work on a copy whose row names are 1..n, so that row names recovered from
  # subsets are always valid row positions of `dat`.
  work <- dat
  row.names(work) <- NULL
  nTotal <- nrow(work)

  indx <- duplicated(work) | duplicated(work, fromLast = TRUE)
  if (!any(indx)) {
    return(dat)
  }
  dupRows <- work[indx, , drop = FALSE]
  dups1 <- dupRows[which(dupRows$dummy == 1), , drop = FALSE]
  dups0 <- dupRows[which(dupRows$dummy == 0), , drop = FALSE]

  # Sort the duplicated rows by dimension, label them two at a time with a
  # group id, add the neighbouring row of each (offset +1: row below,
  # offset -1: row above) and split the result into one data frame per group.
  pairGroups <- function(dupSub, offset) {
    if (nrow(dupSub) == 0) {
      return(list())
    }
    ordered <- dupSub[order(dupSub$dimension), , drop = FALSE]
    pos <- as.numeric(row.names(ordered))
    names(pos) <- (seq_along(pos) - 1) %/% 2 + 1
    allPos <- if (offset > 0) c(pos, pos + offset) else c(pos + offset, pos)
    # Neighbours that would fall outside the data are ignored.
    allPos <- allPos[allPos >= 1 & allPos <= nTotal]
    split(work[unname(allPos), , drop = FALSE], names(allPos))
  }

  # Group of duplicated dummy == 1 rows followed by their lower neighbours:
  # keep the neighbour closest in dimension and the dummy == 1 row nearest to
  # it (by row position); return the row names of everything else.
  surplusOfDummy1 <- function(x) {
    rest <- x[-1, , drop = FALSE]
    nearest <- which.min(abs(rest$dimension[1] - rest$dimension[-1])) + 1
    partner <- rest[nearest, , drop = FALSE]
    ones <- x[which(x$dummy == 1), , drop = FALSE]
    gap <- abs(as.numeric(row.names(partner)) - as.numeric(row.names(ones)))
    keptOne <- ones[which.min(gap), , drop = FALSE]
    setdiff(row.names(x), c(row.names(keptOne), row.names(partner)))
  }

  # Group of upper neighbours followed by duplicated dummy == 0 rows: keep the
  # dummy == 1 row closest in dimension and the dummy == 0 row nearest to it
  # (by row position); return the row names of everything else.
  surplusOfDummy0 <- function(x) {
    ones <- x[which(x$dummy == 1), , drop = FALSE]
    zeros <- x[which(x$dummy == 0), , drop = FALSE]
    keptOne <- ones[which.min(abs(ones$dimension - unique(zeros$dimension))), ,
                    drop = FALSE]
    gap <- abs(as.numeric(row.names(zeros)) - as.numeric(row.names(keptOne)))
    keptZero <- zeros[which.min(gap), , drop = FALSE]
    setdiff(row.names(x), c(row.names(keptOne), row.names(keptZero)))
  }

  # Row positions to delete, gathered directly from the groups (no detour via
  # rbind() row names, which are not reliable for one-row groups).
  collectPositions <- function(groups, surplus) {
    sort(unique(as.numeric(unlist(lapply(groups, surplus), use.names = FALSE))))
  }

  indxNew1 <- collectPositions(pairGroups(dups1, 1), surplusOfDummy1)

  if (nrow(dups0) > 0) {
    indxNew0 <- collectPositions(pairGroups(dups0, -1), surplusOfDummy0)
    indx10 <- sort(union(indxNew0, indxNew1))
    if (length(indx10) %% 2 == 1) {
      # Odd number of deletions: among the candidate rows, only delete the
      # first occurrence of each distinct observation.
      combined <- c(indxNew1, indxNew0)
      toDrop <- sort(combined[!duplicated(work[combined, , drop = FALSE])])
    } else {
      toDrop <- indx10
    }
  } else {
    toDrop <- indxNew1
  }

  # dat[-numeric(0), ] would select zero rows, so guard the empty case.
  if (length(toDrop) == 0) {
    return(dat)
  }
  dat[-toDrop, , drop = FALSE]
}
###Old Function
# fun3 (old function): drop every duplicated row together with its
# neighbouring row.
#
# `dat` must contain the column `dummy` (0/1). A row counts as duplicated when
# an identical row occurs earlier in `dat`. For a duplicated row with
# dummy == 1 the row directly below is removed as well; for a duplicated row
# with dummy == 0 the row directly above is removed as well.
#
# Returns `dat` without those rows; `dat` is returned unchanged when there is
# nothing to remove.
fun3 <- function(dat) {
  indx <- duplicated(dat)
  if (!any(indx)) {
    return(dat)
  }
  # Use row positions rather than row names, so the function also works when
  # the row names of `dat` are not 1..n.
  dupPos <- which(indx)
  dupDummy <- dat$dummy[dupPos]
  pos1 <- dupPos[which(dupDummy == 1)]
  pos0 <- dupPos[which(dupDummy == 0)]

  toDrop <- sort(unique(c(pos1, pos1 + 1, pos0, pos0 - 1)))
  toDrop <- toDrop[toDrop >= 1 & toDrop <= nrow(dat)]

  # dat[-numeric(0), ] would select zero rows, so guard the empty case.
  if (length(toDrop) == 0) {
    return(dat)
  }
  dat[-toDrop, , drop = FALSE]
}
## Applying fun1() (from previous post)
res5Percent <- fun1(final3New, 0.05, 50)
res5Percent1 <- fun1(final3New1, 0.05, 50)
res10Percent <- fun1(final3New, 0.10, 200)
res10Percent1 <- fun1(final3New1, 0.10, 200)
res20Percent <- fun1(final3New, 0.20, 100)
res20Percent1 <- fun1(final3New1, 0.20, 100)

### Applying fun2()
res5F2 <- fun2(res5Percent)
res5F2_1 <- fun2(res5Percent1)
res10F2 <- fun2(res10Percent)
res10F2_1 <- fun2(res10Percent1)
res20F2 <- fun2(res20Percent)
res20F2_1 <- fun2(res20Percent1)

# Applying fun3()
res5F3 <- fun3(res5Percent)
res5F3_1 <- fun3(res5Percent1)
res10F3 <- fun3(res10Percent)
res10F3_1 <- fun3(res10Percent1)
res20F3 <- fun3(res20Percent)
res20F3_1 <- fun3(res20Percent1)

# Names of the result objects: entries 1-3 belong to final3New, entries 4-6
# (suffix "_1") to final3New1.
vec1 <- rep(c("res5F2", "res10F2", "res20F2"), 2)
vec2 <- rep(c("res5F3", "res10F3", "res20F3"), 2)
vec1[4:6] <- paste0(vec1[4:6], "_1")
vec2[4:6] <- paste0(vec2[4:6], "_1")

# Summary table: one line per result object with its total row count, the row
# counts for dummy == 0 and dummy == 1, and the number of distinct rows.
resTbl <- data.frame(
  Dataset = rep(rep(c("final3New", "final3New1"), each = 3), 2),
  Funct = rep(c("fun2", "fun3"), each = 6),
  do.call(rbind, lapply(c(vec1, vec2), function(x) {
    x1 <- get(x)
    c(
      N_row = nrow(x1),
      Sub0_Nrow = nrow(subset(x1, dummy == 0)),
      Sub1_Nrow = nrow(subset(x1, dummy == 1)),
      Uniq_Nrow = nrow(unique(x1))
    )
  })),
  stringsAsFactors = FALSE
)
row.names(resTbl) <- c(vec1, vec2)
resTbl
#             Dataset Funct N_row Sub0_Nrow Sub1_Nrow Uniq_Nrow
#res5F2     final3New  fun2   276       138       138       276
#res10F2    final3New  fun2   454       227       227       454
#res20F2    final3New  fun2   284       142       142       284
#res5F2_1  final3New1  fun2   288       144       144       288
#res10F2_1 final3New1  fun2   488       244       244       488
#res20F2_1 final3New1  fun2   310       155       155       310
#res5F3     final3New  fun3   276       138       138       276
#res10F3    final3New  fun3   452       226       226       452
#res20F3    final3New  fun3   284       142       142       284
#res5F3_1  final3New1  fun3   288       144       144       288
#res10F3_1 final3New1  fun3   488       244       244       488
#res20F3_1 final3New1  fun3   310       155       155       310
# First four rows of one fun2() result.
head(res5F2_1, 4)
#       firm year industry dummy dimension
#1 500622043 2004        1     1      1172
#2 501611886 2004        1     0      1183
#3 500778787 2004        1     1      5680
#4 500047006 2004        1     0      5692
A.K.
________________________________
From: Cecilia Carmo <cecilia.carmo at ua.pt>
To: arun <smartpink111 at yahoo.com>
Sent: Tuesday, June 11, 2013 4:36 PM
Subject: new data
Here it is.
Cecília