# Hi R-helpers,
#
# I have a data panel of thousands of firms, by year and industry, with
# one dummy variable that separates the firms into two categories
# (1 if the firm has an auditor; 0 if not) and another variable that
# represents the firm dimension (total assets in thousands of euros).
# I need to create two separate samples with the same number of firms, where
# each firm in the first has a corresponding firm in the second with the same
# year, industry and dimension (the dimension doesn't need to be exactly the
# same; it could vary within an interval of +/- 10%, for example).

# My reproducible example ----

# Industry 20: firms 1-10, one row per firm and year (2000-2004).
firm1 <- rep(1:10, each = 5)
year1 <- rep(2000:2004, times = 10)
industry1 <- rep(20, length(firm1))
dummy1 <- c(
  0, 0, 1, 1, 0,
  0, 1, 1, 0, 1,
  1, 1, 0, 0, 0,
  0, 0, 0, 1, 1,
  1, 1, 0, 0, 0,
  0, 0, 0, 0, 0,
  1, 0, 1, 0, 1,
  1, 1, 1, 1, 0,
  0, 1, 0, 0, 0,
  0, 0, 1, 1, 1
)
dimension1 <- c(
  2120, 345, 2341, 5678, 10900,
  4890, 2789, 3412, 9500, 8765,
  4532, 6593, 12900, 123, 2345,
  3178, 2678, 6666, 647, 23789,
  2189, 4289, 8543, 637, 23456,
  781, 35489, 2345, 5754, 8976,
  3245, 1234, 25, 1200, 2345,
  2765, 389, 23456, 2367, 3892,
  5438, 37824, 23, 2897, 3456,
  7690, 6022, 3678, 9431, 2890
)
data1 <- data.frame(firm1, year1, industry1, dummy1, dimension1)
data1
names(data1) <- c("firm", "year", "industry", "dummy", "dimension")

# Industry 30: firms 11-15, one row per firm and year (2001-2003).
firm2 <- rep(11:15, each = 3)
year2 <- rep(2001:2003, times = 5)
industry2 <- rep(30, length(firm2))
dummy2 <- c(
  0, 0, 0,
  0, 0, 0,
  1, 1, 1,
  1, 1, 1,
  1, 0, 1
)
dimension2 <- c(
  12456, 781, 32489,
  2345, 5754, 8976,
  3245, 2120, 345,
  2341, 5678, 10900,
  12900, 123, 2345
)
data2 <- data.frame(firm2, year2, industry2, dummy2, dimension2)
data2
names(data2) <- c("firm", "year", "industry", "dummy", "dimension")

# Industry 40: firms 16-20, one row per firm and year (2001-2004).
firm3 <- rep(16:20, each = 4)
year3 <- rep(2001:2004, times = 5)
industry3 <- rep(40, length(firm3))
dummy3 <- c(
  0, 0, 1, 0,
  1, 0, 1, 0,
  1, 1, 1, 1,
  1, 0, 0, 0,
  0, 1, 0, 0
)
dimension3 <- c(
  23456, 1181, 32489, 2345,
  6754, 8976, 3245, 1234,
  1288, 1200, 2345, 2765,
  389, 23456, 2367, 3892,
  6438, 24824, 23, 2897
)
data3 <- data.frame(firm3, year3, industry3, dummy3, dimension3)
data3
names(data3) <- c("firm", "year", "industry", "dummy", "dimension")

# Stack the three industries, then sort by year, industry and dimension so
# that firms of similar size in the same year/industry end up next to each
# other.
final1 <- rbind(data1, data2)
final2 <- rbind(final1, data3)
final2

final3 <- final2[with(final2, order(year, industry, dimension)), ]
final3

# Thank you very much,
# Cecília Carmo
# Universidade de Aveiro - Portugal
#
# [[alternative HTML version deleted]]
Again my problem, better explained. #I have a data panel of thousands of firms, by year and industry and #one dummy variable that identifies one kind of firms (1 if the firm have an auditor; 0 if not) #and another variable the represents the firm dimension (total assets in thousand of euros) #I need to create two separated samples with the same number os firms where #one firm in the first have a corresponding firm in the second with the same #year, industry and dimension (the dimension doesn't need to be exatly the #same, it could vary in an interval of +/- 10%, for example) #My reproducible example firm1<-sort(rep(1:10,5),decreasing=F) year1<-rep(2000:2004,10) industry1<-rep(20,50) dummy1<-c(0,0,1,1,0,0,1,1,0,1,1,1,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,1,1,0,0,1,0,0,0,0,0,1,1,1) dimension1<-c(2120,345,2341,5678,10900,4890,2789,3412,9500,8765,4532,6593,12900,123,2345,3178,2678,6666,647,23789, 2189,4289,8543,637,23456,781,35489,2345,5754,8976,3245,1234,25,1200,2345,2765,389,23456,2367,3892,5438,37824, 23,2897,3456,7690,6022,3678,9431,2890) data1<-data.frame(firm1,year1,industry1,dummy1,dimension1) data1 colnames(data1)<-c("firm","year","industry","dummy","dimension") firm2<-sort(rep(11:15,3),decreasing=F) year2<-rep(2001:2003,5) industry2<-rep(30,15) dummy2<-c(0,0,0,0,0,0,1,1,1,1,1,1,1,0,1) dimension2<-c(12456,781,32489,2345,5754,8976,3245,2120,345,2341,5678,10900,12900,123,2345) data2<-data.frame(firm2,year2,industry2,dummy2,dimension2) data2 colnames(data2)<-c("firm","year","industry","dummy","dimension") firm3<-sort(rep(16:20,4),decreasing=F) year3<-rep(2001:2004,5) industry3<-rep(40,20) dummy3<-c(0,0,1,0,1,0,1,0,1,1,1,1,1,0,0,0,0,1,0,0) dimension3<-c(23456,1181,32489,2345,6754,8976,3245,1234,1288,1200,2345,2765,389,23456,2367,3892,6438,24824, 23,2897) data3<-data.frame(firm3,year3,industry3,dummy3,dimension3) data3 colnames(data3)<-c("firm","year","industry","dummy","dimension") final1<-rbind(data1,data2) final2<-rbind(final1,data3) final2 
final3<-final2[order(final2$year,final2$industry,final2$dimension),] final3 #So my data is final3 is like this: firm year industry dummy dimension 26 6 2000 20 0 781 1 1 2000 20 0 2120 21 5 2000 20 1 2189 36 8 2000 20 1 2765 16 4 2000 20 0 3178 31 7 2000 20 1 3245 11 3 2000 20 1 4532 6 2 2000 20 0 4890 41 9 2000 20 0 5438 46 10 2000 20 0 7690 2 1 2001 20 0 345 37 8 2001 20 1 389 32 7 2001 20 0 1234 17 4 2001 20 0 2678 7 2 2001 20 1 2789 22 5 2001 20 1 4289 47 10 2001 20 0 6022 12 3 2001 20 1 6593 27 6 2001 20 0 35489 42 9 2001 20 1 37824 60 14 2001 30 1 2341 54 12 2001 30 0 2345 57 13 2001 30 1 3245 51 11 2001 30 0 12456 63 15 2001 30 1 12900 78 19 2001 40 1 389 74 18 2001 40 1 1288 82 20 2001 40 0 6438 70 17 2001 40 1 6754 66 16 2001 40 0 23456 43 9 2002 20 0 23 33 7 2002 20 1 25 3 1 2002 20 1 2341 28 6 2002 20 0 2345 8 2 2002 20 1 3412 48 10 2002 20 1 3678 18 4 2002 20 0 6666 23 5 2002 20 0 8543 13 3 2002 20 0 12900 38 8 2002 20 1 23456 64 15 2002 30 0 123 52 11 2002 30 0 781 58 13 2002 30 1 2120 61 14 2002 30 1 5678 55 12 2002 30 0 5754 67 16 2002 40 0 1181 75 18 2002 40 1 1200 71 17 2002 40 0 8976 79 19 2002 40 0 23456 83 20 2002 40 1 24824 14 3 2003 20 0 123 24 5 2003 20 0 637 19 4 2003 20 1 647 34 7 2003 20 0 1200 39 8 2003 20 1 2367 44 9 2003 20 0 2897 4 1 2003 20 1 5678 29 6 2003 20 0 5754 49 10 2003 20 1 9431 9 2 2003 20 0 9500 59 13 2003 30 1 345 65 15 2003 30 1 2345 56 12 2003 30 0 8976 62 14 2003 30 1 10900 53 11 2003 30 0 32489 84 20 2003 40 0 23 76 18 2003 40 1 2345 80 19 2003 40 0 2367 72 17 2003 40 1 3245 68 16 2003 40 1 32489 15 3 2004 20 0 2345 35 7 2004 20 1 2345 50 10 2004 20 1 2890 45 9 2004 20 0 3456 40 8 2004 20 0 3892 10 2 2004 20 1 8765 30 6 2004 20 0 8976 5 1 2004 20 0 10900 25 5 2004 20 0 23456 20 4 2004 20 1 23789 73 17 2004 40 0 1234 69 16 2004 40 0 2345 77 18 2004 40 1 2765 85 20 2004 40 0 2897 81 19 2004 40 0 3892 I want to keep couples of firms one with dummy=1 and other with dummy=0 that matchs in industry, firm and dimension. 
But dimension doesn't need to be exactly the same, which is why I refer to an interval of + or - 10%. For example, firm 1 matches with firm 5, because they have the same year, industry and dimension (10% x 2120 = 212 and 2189-2120<212), and firm 1 is dummy=0 and firm 5 is dummy=1. So I want to delete firm 6 because it doesn't match with any firm, and keep firms 1 and 5. firm year industry dummy dimension 26 6 2000 20 0 781 1 1 2000 20 0 2120 21 5 2000 20 1 2189 Next, I can match firm 4 with firm 7 and delete firm 8. 36 8 2000 20 1 2765 16 4 2000 20 0 3178 31 7 2000 20 1 3245 And so on... At the end I want to keep only pairs of firms, matched by year, industry and dimension. If I separate firms with dummy=1 from firms with dummy=0 into two separate data frames, I have two matched samples with the same number of observations. That's what I want. Thank you, Cecília Carmo Universidade de Aveiro - Portugal [[alternative HTML version deleted]]
Hi, Not sure if this is what you wanted. ?res<-do.call(rbind,lapply(lst6,function(x) do.call(rbind,x))) ?row.names(res)<-1:nrow(res) # this combines the list of lists to a data.frame res[1:4,] #? firm year industry dummy dimension #1??? 1 2000?????? 20???? 0????? 2120 #2??? 5 2000?????? 20???? 1????? 2189 #3??? 4 2000?????? 20???? 0????? 3178 #4??? 7 2000?????? 20???? 1????? 3245 #or ?res<-do.call(rbind,lapply(lst6,function(x) do.call(rbind,x))) ?res$group<-gsub("(.*\\..*)\\..*$","\\1",rownames(res)) ?row.names(res)<-1:nrow(res) ?res[1:4,] #? firm year industry dummy dimension?? group #1??? 1 2000?????? 20???? 0????? 2120 2000.20?? #1 group #2??? 5 2000?????? 20???? 1????? 2189 2000.20?? #1 #3??? 4 2000?????? 20???? 0????? 3178 2000.20?? #2 #4??? 7 2000?????? 20???? 1????? 3245 2000.20?? #2 A.K. ----- Original Message ----- From: Cecilia Carmo <cecilia.carmo at ua.pt> To: arun <smartpink111 at yahoo.com> Cc: Sent: Friday, June 7, 2013 11:33 AM Subject: RE: [R] matched samples, dataframe, panel data Thank you very much. Just a little thing: how can I put it like a dataframe? Thanks, Cec?lia ________________________________________ De: arun [smartpink111 at yahoo.com] Enviado: sexta-feira, 7 de Junho de 2013 16:27 Para: Cecilia Carmo Assunto: Re: [R] matched samples, dataframe, panel data Hi, There could be easier ways...? I am a bit busy now to try other ways. 
----- Original Message ----- From: arun <smartpink111 at yahoo.com> To: Cecilia Carmo <cecilia.carmo at ua.pt> Cc: R help <r-help at r-project.org> Sent: Friday, June 7, 2013 11:25 AM Subject: Re: [R] matched samples, dataframe, panel data Hi, May be this helps: lst1<-split(final3,list(final3$year,final3$industry)) lst2<-lst1[lapply(lst1,nrow)>0] lst3<-lapply(lst2,function(x) lapply(x$dimension,function(y) x[(y< (x$dimension+x$dimension*0.1)) & (y> (x$dimension-x$dimension*0.1)),])) lst4<-lapply(lst3,function(x) x[lapply(x,nrow)==2]) lst5<-lapply(lst4,function(x)x[!duplicated(x)]) lst6<-lst5[lapply(lst5,length)>0] names(lst6) # [1] "2000.20" "2001.20" "2002.20" "2003.20" "2004.20" "2001.30" "2002.30" #[8] "2001.40" "2002.40" "2003.40" "2004.40" lst6["2000.20"] #$`2000.20` #$`2000.20`[[1]] #? firm year industry dummy dimension #1? ? 1 2000? ? ? 20? ? 0? ? ? 2120 #21? ? 5 2000? ? ? 20? ? 1? ? ? 2189 # #$`2000.20`[[2]] #? firm year industry dummy dimension #16? ? 4 2000? ? ? 20? ? 0? ? ? 3178 #31? ? 7 2000? ? ? 20? ? 1? ? ? 3245 # #$`2000.20`[[3]] #? firm year industry dummy dimension #11? ? 3 2000? ? ? 20? ? 1? ? ? 4532 #6? ? 2 2000? ? ? 20? ? 0? ? ? 4890 A.K. ________________________________ From: Cecilia Carmo <cecilia.carmo at ua.pt> To: "r-help at r-project.org" <r-help at r-project.org> Cc: "smartpink111 at yahoo.com" <smartpink111 at yahoo.com> Sent: Friday, June 7, 2013 9:56 AM Subject: Re: [R] matched samples, dataframe, panel data Again my problem, better explained. 
#I have a data panel of thousands of firms, by year and industry and #one dummy variable that identifies one kind of firms (1 if the firm have an auditor; 0 if not) #and another variable the represents the firm dimension (total assets in thousand of euros) #I need to create two separated samples with the same number os firms where #one firm in the first have a corresponding firm in the second with the same #year, industry and dimension (the dimension doesn't need to be exatly the #same, it could vary in an interval of +/- 10%, for example) #My reproducible example firm1<-sort(rep(1:10,5),decreasing=F) year1<-rep(2000:2004,10) industry1<-rep(20,50) dummy1<-c(0,0,1,1,0,0,1,1,0,1,1,1,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0,1,1,1,1,1,0,0,1,0,0,0,0,0,1,1,1) dimension1<-c(2120,345,2341,5678,10900,4890,2789,3412,9500,8765,4532,6593,12900,123,2345,3178,2678,6666,647,23789, 2189,4289,8543,637,23456,781,35489,2345,5754,8976,3245,1234,25,1200,2345,2765,389,23456,2367,3892,5438,37824, 23,2897,3456,7690,6022,3678,9431,2890) data1<-data.frame(firm1,year1,industry1,dummy1,dimension1) data1 colnames(data1)<-c("firm","year","industry","dummy","dimension") firm2<-sort(rep(11:15,3),decreasing=F) year2<-rep(2001:2003,5) industry2<-rep(30,15) dummy2<-c(0,0,0,0,0,0,1,1,1,1,1,1,1,0,1) dimension2<-c(12456,781,32489,2345,5754,8976,3245,2120,345,2341,5678,10900,12900,123,2345) data2<-data.frame(firm2,year2,industry2,dummy2,dimension2) data2 colnames(data2)<-c("firm","year","industry","dummy","dimension") firm3<-sort(rep(16:20,4),decreasing=F) year3<-rep(2001:2004,5) industry3<-rep(40,20) dummy3<-c(0,0,1,0,1,0,1,0,1,1,1,1,1,0,0,0,0,1,0,0) dimension3<-c(23456,1181,32489,2345,6754,8976,3245,1234,1288,1200,2345,2765,389,23456,2367,3892,6438,24824, 23,2897) data3<-data.frame(firm3,year3,industry3,dummy3,dimension3) data3 colnames(data3)<-c("firm","year","industry","dummy","dimension") final1<-rbind(data1,data2) final2<-rbind(final1,data3) final2 
final3<-final2[order(final2$year,final2$industry,final2$dimension),] final3 #So my data is final3 is like this: ? firm year industry dummy dimension 26? ? 6 2000? ? ? 20? ? 0? ? ? 781 1? ? 1 2000? ? ? 20? ? 0? ? ? 2120 21? ? 5 2000? ? ? 20? ? 1? ? ? 2189 36? ? 8 2000? ? ? 20? ? 1? ? ? 2765 16? ? 4 2000? ? ? 20? ? 0? ? ? 3178 31? ? 7 2000? ? ? 20? ? 1? ? ? 3245 11? ? 3 2000? ? ? 20? ? 1? ? ? 4532 6? ? 2 2000? ? ? 20? ? 0? ? ? 4890 41? ? 9 2000? ? ? 20? ? 0? ? ? 5438 46? 10 2000? ? ? 20? ? 0? ? ? 7690 2? ? 1 2001? ? ? 20? ? 0? ? ? 345 37? ? 8 2001? ? ? 20? ? 1? ? ? 389 32? ? 7 2001? ? ? 20? ? 0? ? ? 1234 17? ? 4 2001? ? ? 20? ? 0? ? ? 2678 7? ? 2 2001? ? ? 20? ? 1? ? ? 2789 22? ? 5 2001? ? ? 20? ? 1? ? ? 4289 47? 10 2001? ? ? 20? ? 0? ? ? 6022 12? ? 3 2001? ? ? 20? ? 1? ? ? 6593 27? ? 6 2001? ? ? 20? ? 0? ? 35489 42? ? 9 2001? ? ? 20? ? 1? ? 37824 60? 14 2001? ? ? 30? ? 1? ? ? 2341 54? 12 2001? ? ? 30? ? 0? ? ? 2345 57? 13 2001? ? ? 30? ? 1? ? ? 3245 51? 11 2001? ? ? 30? ? 0? ? 12456 63? 15 2001? ? ? 30? ? 1? ? 12900 78? 19 2001? ? ? 40? ? 1? ? ? 389 74? 18 2001? ? ? 40? ? 1? ? ? 1288 82? 20 2001? ? ? 40? ? 0? ? ? 6438 70? 17 2001? ? ? 40? ? 1? ? ? 6754 66? 16 2001? ? ? 40? ? 0? ? 23456 43? ? 9 2002? ? ? 20? ? 0? ? ? ? 23 33? ? 7 2002? ? ? 20? ? 1? ? ? ? 25 3? ? 1 2002? ? ? 20? ? 1? ? ? 2341 28? ? 6 2002? ? ? 20? ? 0? ? ? 2345 8? ? 2 2002? ? ? 20? ? 1? ? ? 3412 48? 10 2002? ? ? 20? ? 1? ? ? 3678 18? ? 4 2002? ? ? 20? ? 0? ? ? 6666 23? ? 5 2002? ? ? 20? ? 0? ? ? 8543 13? ? 3 2002? ? ? 20? ? 0? ? 12900 38? ? 8 2002? ? ? 20? ? 1? ? 23456 64? 15 2002? ? ? 30? ? 0? ? ? 123 52? 11 2002? ? ? 30? ? 0? ? ? 781 58? 13 2002? ? ? 30? ? 1? ? ? 2120 61? 14 2002? ? ? 30? ? 1? ? ? 5678 55? 12 2002? ? ? 30? ? 0? ? ? 5754 67? 16 2002? ? ? 40? ? 0? ? ? 1181 75? 18 2002? ? ? 40? ? 1? ? ? 1200 71? 17 2002? ? ? 40? ? 0? ? ? 8976 79? 19 2002? ? ? 40? ? 0? ? 23456 83? 20 2002? ? ? 40? ? 1? ? 24824 14? ? 3 2003? ? ? 20? ? 0? ? ? 123 24? ? 5 2003? ? ? 20? ? 0? ? ? 637 19? ? 4 2003? ? ? 20? ? 
1? ? ? 647 34? ? 7 2003? ? ? 20? ? 0? ? ? 1200 39? ? 8 2003? ? ? 20? ? 1? ? ? 2367 44? ? 9 2003? ? ? 20? ? 0? ? ? 2897 4? ? 1 2003? ? ? 20? ? 1? ? ? 5678 29? ? 6 2003? ? ? 20? ? 0? ? ? 5754 49? 10 2003? ? ? 20? ? 1? ? ? 9431 9? ? 2 2003? ? ? 20? ? 0? ? ? 9500 59? 13 2003? ? ? 30? ? 1? ? ? 345 65? 15 2003? ? ? 30? ? 1? ? ? 2345 56? 12 2003? ? ? 30? ? 0? ? ? 8976 62? 14 2003? ? ? 30? ? 1? ? 10900 53? 11 2003? ? ? 30? ? 0? ? 32489 84? 20 2003? ? ? 40? ? 0? ? ? ? 23 76? 18 2003? ? ? 40? ? 1? ? ? 2345 80? 19 2003? ? ? 40? ? 0? ? ? 2367 72? 17 2003? ? ? 40? ? 1? ? ? 3245 68? 16 2003? ? ? 40? ? 1? ? 32489 15? ? 3 2004? ? ? 20? ? 0? ? ? 2345 35? ? 7 2004? ? ? 20? ? 1? ? ? 2345 50? 10 2004? ? ? 20? ? 1? ? ? 2890 45? ? 9 2004? ? ? 20? ? 0? ? ? 3456 40? ? 8 2004? ? ? 20? ? 0? ? ? 3892 10? ? 2 2004? ? ? 20? ? 1? ? ? 8765 30? ? 6 2004? ? ? 20? ? 0? ? ? 8976 5? ? 1 2004? ? ? 20? ? 0? ? 10900 25? ? 5 2004? ? ? 20? ? 0? ? 23456 20? ? 4 2004? ? ? 20? ? 1? ? 23789 73? 17 2004? ? ? 40? ? 0? ? ? 1234 69? 16 2004? ? ? 40? ? 0? ? ? 2345 77? 18 2004? ? ? 40? ? 1? ? ? 2765 85? 20 2004? ? ? 40? ? 0? ? ? 2897 81? 19 2004? ? ? 40? ? 0? ? ? 3892 I want to keep couples of firms one with dummy=1 and other with dummy=0 that matchs in industry, firm and dimension. But dimension doesn't need to be exactly the same, it is why I refer an interval of + or - 10%. For example firm 1 matchs with firm 5, because they have the same year, industry, dimension (10% x 2120 = 212 and 2189-2120<212) and firm 1 is dummy=0 and firm 5 is dummy=1. So I want to delete firm 6 because it doesn't macth with any firm, and keep firm 1 and 5. ? ? firm year industry dummy dimension 26? ? 6 2000? ? ? 20? ? 0? ? ? 781 1? ? 1 2000? ? ? 20? ? 0? ? ? 2120 21? ? 5 2000? ? ? 20? ? 1? ? ? 2189 Next, Now I can match firm 4 with firm 7 and delete firm 8. 36? ? 8 2000? ? ? 20? ? 1? ? ? 2765 16? ? 4 2000? ? ? 20? ? 0? ? ? 3178 31? ? 7 2000? ? ? 20? ? 1? ? ? 3245 And so on... 
At the end I want to keep only pairs of firms, matched by year, industry and dimension. If I separate firms with dummy=1 from firms with dummy=0 into two separate data frames, I have two matched samples with the same number of observations. That's what I want. Thank you, Cecília Carmo Universidade de Aveiro - Portugal
# Hi,
# I changed fun1(). Now it should be possible to get all the possible
# combinations within each group.

#' List every treated/control pair of similar size within each group
#'
#' Rows of `dat` are grouped by `year` and `industry`. Inside each group every
#' pair of rows is considered, and a pair is kept when
#'   * exactly one of its two rows has `dummy == 1`,
#'   * the absolute difference of the two `dimension` values is below
#'     `number`, and
#'   * the first row's `dimension` lies strictly inside +/- `percent` of the
#'     second row's `dimension`.
#' A row can take part in several pairs, so it can appear several times in the
#' result; `fun3()` is meant to thin those repeats out afterwards.
#'
#' @param dat Data frame with columns `year`, `industry`, `dummy` (0/1) and
#'   `dimension` (numeric).
#' @param percent Relative tolerance on `dimension`, e.g. `0.10` for 10%.
#' @param number Absolute tolerance on `dimension`.
#' @return A data frame with the columns of `dat`, two consecutive rows per
#'   kept pair (in the row order of `dat`) and row names `1..n`. When no pair
#'   qualifies, a zero-row data frame with the same columns.
fun1New <- function(dat, percent, number) {
  # All qualifying pairs of one year/industry group, two rows per pair.
  pairs_in_group <- function(g) {
    idx <- combn(seq_len(nrow(g)), 2)
    first <- idx[1, ]
    second <- idx[2, ]
    dim_first <- g$dimension[first]
    dim_second <- g$dimension[second]

    one_of_each <- (g$dummy[first] + g$dummy[second]) == 1
    within_number <- abs(dim_second - dim_first) < number
    within_percent <- (dim_first < dim_second + dim_second * percent) &
      (dim_first > dim_second - dim_second * percent)

    # which() also discards pairs whose test is NA (missing dummy/dimension),
    # so they never show up as all-NA rows in the result.
    keep <- which(one_of_each & within_number & within_percent)

    # Interleave so each pair occupies two consecutive rows.
    g[as.vector(rbind(first[keep], second[keep])), , drop = FALSE]
  }

  groups <- split(dat, list(dat$year, dat$industry))
  # A pair needs at least two rows; this also drops the empty year/industry
  # combinations that split() creates.
  groups <- groups[vapply(groups, nrow, integer(1)) > 1L]

  res <- do.call(rbind, lapply(groups, pairs_in_group))
  # do.call(rbind, list()) is NULL; return an empty frame instead of failing
  # on 1:nrow(NULL).
  if (is.null(res) || nrow(res) == 0L) {
    return(dat[0L, , drop = FALSE])
  }
  row.names(res) <- NULL
  res
}

#' Drop pairs that reuse a row already used by an earlier pair
#'
#' Expects the layout produced by `fun1New()`: pairs stored as consecutive
#' rows. For every row that duplicates an earlier row, the duplicate is
#' removed together with its neighbour: the next row when the duplicate has
#' `dummy == 1`, the previous row when it has `dummy == 0`.
#'
#' @param dat Data frame with a `dummy` column, pairs in consecutive rows.
#' @return `dat` without the removed rows. `dat` is returned unchanged when it
#'   contains no duplicated rows.
fun3 <- function(dat) {
  # Work with positions rather than numeric row names so the function does not
  # depend on the row names being 1..n.
  dup_pos <- which(duplicated(dat))
  if (length(dup_pos) == 0L) {
    # Nothing to remove. (Indexing with an empty negative vector, as in
    # dat[-numeric(0), ], would select zero rows.)
    return(dat)
  }

  dup_dummy <- dat$dummy[dup_pos]
  treated_dup <- dup_pos[which(dup_dummy == 1)]
  control_dup <- dup_pos[which(dup_dummy == 0)]
  drop_pos <- c(treated_dup, treated_dup + 1L, control_dup, control_dup - 1L)

  # Positive indexing: out-of-range neighbours are ignored and an empty
  # drop set keeps every row.
  dat[setdiff(seq_len(nrow(dat)), drop_pos), , drop = FALSE]
}

## Applying fun1New and fun3 ----
# The example needs the two data files Cecília sent privately; they are not
# part of this thread. Skip it when they are absent so that the definitions
# above can still be sourced on their own.
if (file.exists("real_data_cecilia.txt") &&
      file.exists("real_data_cecilia_new.csv")) {
  final3New <- read.table(
    file = "real_data_cecilia.txt", sep = "\t", header = TRUE
  )
  final3New1 <- read.csv("real_data_cecilia_new.csv")

  ## Applying fun1New
  res5Percent <- fun1New(final3New, 0.05, 50)
  print(dim(res5Percent))
  # [1] 718   5
  res5PercentHigh <- fun1New(final3New, 0.05, 500000)
  print(dim(res5PercentHigh))
  # [1] 2788    5
  res5Percent1 <- fun1New(final3New1, 0.05, 50)
  print(dim(res5Percent1))
  # [1] 870   5
  res5Percent1High <- fun1New(final3New1, 0.05, 500000)
  print(dim(res5Percent1High))
  # [1] 2902    5
  res10Percent <- fun1New(final3New, 0.10, 200)
  print(dim(res10Percent))
  # [1] 2928    5
  res10Percent1 <- fun1New(final3New1, 0.10, 200)
  print(dim(res10Percent1))
  # [1] 3092    5

  # Applying fun3()
  res5F3 <- fun3(res5Percent)
  print(dim(res5F3))
  # [1] 278   5
  res5F3High <- fun3(res5PercentHigh)
  print(dim(res5F3High))
  # [1] 546   5
  res5F3_1 <- fun3(res5Percent1)
  # [1] 302   5
  res5F3High_1 <- fun3(res5Percent1High)
  print(dim(res5F3High_1))
  # [1] 570   5
  res10F3 <- fun3(res10Percent)
  print(dim(res10F3))
  # [1] 462   5
  res10F3_1 <- fun3(res10Percent1)
  # [1] 474   5

  print(nrow(subset(res5F3, dummy == 0)))
  # [1] 139
  print(nrow(subset(res5F3, dummy == 1)))
  # [1] 139
  print(nrow(subset(res5F3High, dummy == 1)))
  # [1] 273
  print(nrow(subset(res5F3High, dummy == 0)))
  # [1] 273
  print(nrow(subset(res10F3, dummy == 0)))
  # [1] 231
  print(nrow(subset(res10F3, dummy == 1)))
  # [1] 231
  print(nrow(subset(res10F3_1, dummy == 1)))
  # [1] 237
  print(nrow(subset(res10F3_1, dummy == 0)))
  # [1] 237

  print(dim(unique(res5F3)))
  # [1] 278   5
  print(dim(unique(res5F3High)))
  # [1] 546   5
  print(dim(unique(res10F3_1)))
  # [1] 474   5
  print(dim(unique(res10F3)))
  # [1] 462   5
}

# A.K.
#
# ________________________________
# From: Cecilia Carmo <cecilia.carmo at ua.pt>
# To: arun <smartpink111 at yahoo.com>
# Sent: Friday, June 14, 2013 10:44 AM
# Subject: me again
#
# There are some matches that are missing. That is, it is possible to have
# more matches. I'm sending you a script and the data.
# Thank you.
# Cecília
# Hi,
# This gives me more combinations than you got with SAS. Also, this selects
# the one with minimum dimension between duplicates.

#' Drop pairs that reuse a row already used by an earlier pair
#'
#' Expects pairs stored as consecutive rows. For every row that duplicates an
#' earlier row, the duplicate is removed together with its neighbour: the next
#' row when the duplicate has `dummy == 1`, the previous row when it has
#' `dummy == 0`.
#'
#' @param dat Data frame with a `dummy` column, pairs in consecutive rows.
#' @return `dat` unchanged when it has no duplicated rows; otherwise the
#'   remaining rows, whose row names are their positions in `dat`.
fun3 <- function(dat) {
  dup_pos <- which(duplicated(dat))
  if (length(dup_pos) == 0L) {
    return(dat)
  }

  # Renumber so the surviving rows are labelled by their original position.
  row.names(dat) <- NULL

  dup_dummy <- dat$dummy[dup_pos]
  treated_dup <- dup_pos[which(dup_dummy == 1)]
  control_dup <- dup_pos[which(dup_dummy == 0)]
  drop_pos <- c(treated_dup, treated_dup + 1L, control_dup, control_dup - 1L)

  # Positive indexing: out-of-range neighbours are ignored, and an empty drop
  # set keeps every row instead of selecting none (as dat[-numeric(0), ] does).
  dat[setdiff(seq_len(nrow(dat)), drop_pos), , drop = FALSE]
}

#' Match each treated row to its closest control within a year/industry group
#'
#' Rows of `dat` are grouped by `year` and `industry`. Inside each group every
#' pair of rows is a candidate when exactly one row has `dummy == 1`, the
#' absolute `dimension` difference is below `number`, and either row's
#' `dimension` lies strictly inside +/- `percent` of the other's. Candidates
#' are then grouped by the treated row's `dimension`; for each such value the
#' control with the smallest `dimension` difference is kept. Finally `fun3()`
#' removes pairs whose row was already used by an earlier pair in the group.
#'
#' @param dat Data frame with columns `year`, `industry`, `dummy` (0/1) and
#'   `dimension` (numeric).
#' @param percent Relative tolerance on `dimension`, e.g. `0.10` for 10%.
#' @param number Absolute tolerance on `dimension`.
#' @return A data frame with the columns of `dat`; each treated row is followed
#'   by its matched control, row names are `1..n`. When nothing matches, a
#'   zero-row data frame with the same columns.
fun1New <- function(dat, percent, number) {
  # Best (treated, control) pairs of one year/industry group.
  match_group <- function(g) {
    idx <- combn(seq_len(nrow(g)), 2)
    first <- idx[1, ]
    second <- idx[2, ]
    dim_first <- g$dimension[first]
    dim_second <- g$dimension[second]

    one_of_each <- (g$dummy[first] + g$dummy[second]) == 1
    within_number <- abs(dim_second - dim_first) < number
    first_in_band <- (dim_first < dim_second + dim_second * percent) &
      (dim_first > dim_second - dim_second * percent)
    second_in_band <- (dim_second < dim_first + dim_first * percent) &
      (dim_second > dim_first - dim_first * percent)

    # which() also discards candidates whose test is NA.
    keep <- which(
      one_of_each & within_number & (first_in_band | second_in_band)
    )
    if (length(keep) == 0L) {
      return(g[0L, , drop = FALSE])
    }

    # Record explicitly which member of each candidate pair is the treated
    # row. The treated row is not always the first of its pair, so looking
    # for its partner at "row + 1" can pick up a row from the next pair (or
    # run past the end) and return a control outside the tolerance.
    treated_is_first <- g$dummy[first[keep]] == 1
    treated <- ifelse(treated_is_first, first[keep], second[keep])
    control <- ifelse(treated_is_first, second[keep], first[keep])

    # NOTE(review): candidates are grouped by the treated row's dimension, as
    # in the posted code, so two different treated firms with identical
    # dimension share a single match - confirm whether grouping by firm is
    # wanted instead.
    treated_dim <- g$dimension[treated]
    picks <- lapply(split(seq_along(treated), treated_dim), function(k) {
      gap <- abs(g$dimension[control[k]] - treated_dim[k[1L]])
      c(treated[k[1L]], control[k[which.min(gap)]])
    })

    matched <- g[unlist(picks, use.names = FALSE), , drop = FALSE]
    row.names(matched) <- NULL
    fun3(matched)
  }

  groups <- split(dat, list(dat$year, dat$industry))
  # A pair needs at least two rows; this also drops the empty year/industry
  # combinations that split() creates.
  groups <- groups[vapply(groups, nrow, integer(1)) > 1L]

  res <- do.call(rbind, lapply(groups, match_group))
  # do.call(rbind, list()) is NULL; return an empty frame instead of failing
  # on 1:nrow(NULL).
  if (is.null(res) || nrow(res) == 0L) {
    return(dat[0L, , drop = FALSE])
  }
  row.names(res) <- NULL
  res
}

## Applying fun1New ----
# The example needs the two data files Cecília sent privately; they are not
# part of this thread. Skip it when they are absent so that the definitions
# above can still be sourced on their own.
# The counts in the comments are the ones reported in the original message.
# They were produced by the code as posted there and may differ now that the
# treated/control partner lookup has been corrected.
if (file.exists("real_data_cecilia.txt") &&
      file.exists("real_data_cecilia_new.csv")) {
  final3New <- read.table(
    file = "real_data_cecilia.txt", sep = "\t", header = TRUE
  )
  final3New1 <- read.csv("real_data_cecilia_new.csv")

  #### 1st dataset
  res10PercentHigh <- fun1New(final3New, 0.10, 500000000)
  print(dim(res10PercentHigh))
  # [1] 764   5
  print(dim(unique(res10PercentHigh)))
  # [1] 764   5
  print(nrow(subset(res10PercentHigh, dummy == 0)))
  # [1] 382
  print(nrow(subset(res10PercentHigh, dummy == 1)))
  # [1] 382

  res10PercentLow <- fun1New(final3New, 0.10, 50)
  print(dim(res10PercentLow))
  # [1] 294   5
  print(dim(unique(res10PercentLow)))
  # [1] 294   5
  print(nrow(subset(res10PercentLow, dummy == 0)))
  # [1] 147
  print(nrow(subset(res10PercentLow, dummy == 1)))
  # [1] 147

  res5PercentHigh <- fun1New(final3New, 0.05, 500000000)
  print(dim(res5PercentHigh))
  # [1] 630   5
  print(dim(unique(res5PercentHigh)))
  # [1] 630   5
  print(nrow(subset(res5PercentHigh, dummy == 0)))
  # [1] 315
  print(nrow(subset(res5PercentHigh, dummy == 1)))
  # [1] 315

  res5PercentLow <- fun1New(final3New, 0.05, 50)
  print(dim(res5PercentLow))
  # [1] 294   5
  print(dim(unique(res5PercentLow)))
  # [1] 294   5
  print(nrow(subset(res5PercentLow, dummy == 0)))
  # [1] 147
  print(nrow(subset(res5PercentLow, dummy == 1)))
  # [1] 147

  ####### 2nd dataset
  res10PercentHigh <- fun1New(final3New1, 0.10, 500000000)
  print(dim(res10PercentHigh))
  # [1] 760   5
  print(dim(unique(res10PercentHigh)))
  # [1] 760   5
  print(nrow(subset(res10PercentHigh, dummy == 0)))
  # [1] 380
  print(nrow(subset(res10PercentHigh, dummy == 1)))
  # [1] 380

  res10PercentLow <- fun1New(final3New1, 0.10, 100)
  print(dim(res10PercentLow))
  # [1] 418   5
  print(dim(unique(res10PercentLow)))
  # [1] 418   5
  print(nrow(subset(res10PercentLow, dummy == 0)))
  # [1] 209
  print(nrow(subset(res10PercentLow, dummy == 1)))
  # [1] 209

  res5PercentHigh <- fun1New(final3New1, 0.05, 500000000)
  print(dim(res5PercentHigh))
  # [1] 640   5
  print(dim(unique(res5PercentHigh)))
  # [1] 640   5
  print(nrow(subset(res5PercentHigh, dummy == 0)))
  # [1] 320
  print(nrow(subset(res5PercentHigh, dummy == 1)))
  # [1] 320

  res5PercentLow <- fun1New(final3New1, 0.05, 50)
  print(dim(res5PercentLow))
  # [1] 310   5
  print(dim(unique(res5PercentLow)))
  # [1] 310   5
  print(nrow(subset(res5PercentLow, dummy == 0)))
  # [1] 155
  print(nrow(subset(res5PercentLow, dummy == 1)))
  # [1] 155

  res20PercentHigh <- fun1New(final3New1, 0.20, 500000000)
  print(dim(res20PercentHigh))
  # [1] 846   5
  print(dim(unique(res20PercentHigh)))
  # [1] 846   5
  print(nrow(subset(res20PercentHigh, dummy == 0)))
  # [1] 423
  print(nrow(subset(res20PercentHigh, dummy == 1)))
  # [1] 423
}

# A.K.
#
# ----- Original Message -----
# From: Cecilia Carmo <cecilia.carmo at ua.pt>
# To: arun <smartpink111 at yahoo.com>
# Cc:
# Sent: Sunday, June 16, 2013 5:57 AM
# Subject: RE: matched samples, dataframe, panel data
#
# In the script I sent you, with the file that I sent with it and with the
# old functions 1 and 2, it got 350 combinations and it was possible to have
# more. Now with the new fun 1 and 3 I have fewer, so it is not OK, is it?
#
# > res10Percent<- fun1New(final3New2,0.10,500000000)
# > res10F3<- fun3(res10Percent)
# > dim(res10F3)
# [1] 600   5
# > nrow(subset(res10F3,dummy==0))
# [1] 300
# > nrow(subset(res10F3,dummy==1))
# [1] 300
#
# Sorry for making you spend so much time. I thought it could be easier.
# Cecília
#
# ________________________________________
# De: arun [smartpink111 at yahoo.com]
# Enviado: sexta-feira, 14 de Junho de 2013 23:09
# Para: Cecilia Carmo
# Assunto: Re: matched samples, dataframe, panel data
#
# One thing I forgot to mention.
I used fun3() because i found fun2() still have some problems with getting the correct dimensions.? You can check the results of fun1() and fun3() and see if all the combinations are got.? Then, if I get chance, I will correct fun2(). """"" And you conclude that they are the same! """"""" Here, also I am not concluding anything. A.K. ----- Original Message ----- From: arun <smartpink111 at yahoo.com> To: Cecilia Carmo <cecilia.carmo at ua.pt> Cc: R help <r-help at r-project.org> Sent: Friday, June 14, 2013 6:05 PM Subject: Re: matched samples, dataframe, panel data Hi, I changed the fun1().? Now, it should be possible to get all the possible combinations within each group. final3New<-read.table(file="real_data_cecilia.txt",sep="\t",header=T) final3New1<-read.csv("real_data_cecilia_new.csv") fun1New<- function(dat,percent,number){ ? ? lst1<- split(dat,list(dat$year,dat$industry)) ? ? lst2<- lst1[lapply(lst1,nrow)>1] ? ? lst3<- lapply(lst2,function(x) { ? ? ? ? ? ? ? ? ? ? CombN1<-combn(seq_len(nrow(x)),2) ? ? ? ? ? ? ? ? ? ? lapply(split(CombN1,col(CombN1)),function(y){ ? ? ? ? ? ? ? ? ? ? ? ? ? ? x1<-x[y,] ? ? ? ? ? ? ? ? ? ? ? ? ? ? x1[sum(x1$dummy)==1,] ? ? ? ? ? ? ? ? ? ? ? ? ? ? }) ? ? ? ? ? ? ? ? ? ? }) ? ? ? ? lst4<- lapply(lst3,function(x) x[lapply(x,nrow)>0]) ? ? lst5<- lst4[lapply(lst4,length)>0] ? ? lst6<- lapply(lst5,function(x){ ? ? ? ? ? ? ? ? ? lapply(x,function(y){ ? ? ? ? ? ? ? ? ? ? x1<- abs(diff(y$dimension))< number ? ? ? ? ? ? ? ? ? ? x2<- y$dimension[2]+ (y$dimension[2]*percent) ? ? ? ? ? ? ? ? ? ? x3<- y$dimension[2]- (y$dimension[2]*percent) ? ? ? ? ? ? ? ? ? ? x4<- (y$dimension[1] < x2) & (y$dimension[1] > x3) ? ? ? ? ? ? ? ? ? ? y[x4 & x1,] ? ? ? ? ? ? ? ? ? ? }) ? ? ? ? ? ? ? ? ? ? } ? ? ? ? ? ? ? ? ? ? ) ? ? lst7<- lapply(lst6,function(x) x[lapply(x,nrow)>0]) ? ? lst8<- lst7[lapply(lst7,length)>0] ? ? res<- do.call(rbind,lapply(lst8,function(x){ ? ? ? ? ? ? ? ? ? ? ? ? ? do.call(rbind,x) ? ? ? ? ? ? ? ? ? ? ? ? })) ? ? 
row.names(res)<- 1:nrow(res) ? ? res ? ? }? ##Applying fun1New res5Percent<- fun1New(final3New,0.05,50) dim(res5Percent) #[1] 718? 5 res5PercentHigh<- fun1New(final3New,0.05,500000) dim(res5PercentHigh) #[1] 2788? ? 5 res5Percent1<- fun1New(final3New1,0.05,50) dim(res5Percent1) #[1] 870? 5 res5Percent1High<- fun1New(final3New1,0.05,500000) dim(res5Percent1High) #[1] 2902? ? 5 res10Percent<- fun1New(final3New,0.10,200) dim(res10Percent) #[1] 2928? ? 5 res10Percent1<- fun1New(final3New1,0.10,200) dim(res10Percent1) #[1] 3092? ? 5 fun3<- function(dat){ ? ? ? ? ? indx<- duplicated(dat) ? ? ? dat1<- subset(dat[indx,],dummy==1) ? ? ? dat0<- subset(dat[indx,],dummy==0) ? ? ? indx1<- as.numeric(row.names(dat1)) ? ? indx11<- sort(c(indx1,indx1+1)) ? ? indx0<- as.numeric(row.names(dat0)) ? ? indx00<- sort(c(indx0,indx0-1)) ? ? ? indx10<- sort(c(indx11,indx00)) ? ? res <- dat[-indx10,] ? ? res ? ? } #Applying fun3() res5F3<- fun3(res5Percent) dim(res5F3) #[1] 278? 5 res5F3High<- fun3(res5PercentHigh) dim(res5F3High) #[1] 546? 5 res5F3_1<- fun3(res5Percent1) #[1] 302? 5 res5F3High_1<- fun3(res5Percent1High) dim(res5F3High_1) #[1] 570? 5 res10F3<- fun3(res10Percent) dim(res10F3) #[1] 462? 5 res10F3_1<- fun3(res10Percent1) #[1] 474? 5 nrow(subset(res5F3,dummy==0)) #[1] 139 nrow(subset(res5F3,dummy==1)) #[1] 139 nrow(subset(res5F3High,dummy==1)) #[1] 273 nrow(subset(res5F3High,dummy==0)) #[1] 273 nrow(subset(res10F3,dummy==0)) #[1] 231 nrow(subset(res10F3,dummy==1)) #[1] 231 nrow(subset(res10F3_1,dummy==1)) #[1] 237 nrow(subset(res10F3_1,dummy==0)) #[1] 237 dim(unique(res5F3)) #[1] 278? 5 dim(unique(res5F3High)) #[1] 546? 5 dim(unique(res10F3_1)) #[1] 474? 5 dim(unique(res10F3)) #[1] 462? 5 A.K. ________________________________ From: Cecilia Carmo <cecilia.carmo at ua.pt> To: arun <smartpink111 at yahoo.com> Sent: Friday, June 14, 2013 10:44 AM Subject: me again There some matchs that are missing. That is, it is possible to have more matchs. I'm sending you a sript and the data. 
Thank you. Cecília