thr3ads.net - R help - [R] matched samples, dataframe, panel data [Jun 2013]

If this information is useful, please help other people find it:
Share via:

Cecilia Carmo

2013-Jun-07 09:37 UTC

[R] matched samples, dataframe, panel data

Hi R-helpers,



#I have a panel data set of thousands of firms, by year and industry, and
#one dummy variable that separates the firms into two categories: 1 if the
#firm has an auditor; 0 if not,
#and another variable that represents the firm dimension (total assets in
#thousands of euros).

#I need to create two separate samples with the same number of firms, where
#each firm in the first has a corresponding firm in the second with the same
#year, industry and dimension (the dimension doesn't need to be exactly the
#same; it could vary in an interval of +/- 10%, for example)



#My reproducible example

# Industry 20: firms 1-10, one row per firm and year (2000-2004).
firm1 <- rep(1:10, each = 5)
year1 <- rep(2000:2004, times = 10)
industry1 <- rep(20, times = 50)
dummy1 <- c(
  0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
  1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
  1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
  0, 1, 0, 0, 0, 0, 0, 1, 1, 1
)
dimension1 <- c(
  2120, 345, 2341, 5678, 10900, 4890, 2789, 3412, 9500, 8765,
  4532, 6593, 12900, 123, 2345, 3178, 2678, 6666, 647, 23789,
  2189, 4289, 8543, 637, 23456, 781, 35489, 2345, 5754, 8976,
  3245, 1234, 25, 1200, 2345, 2765, 389, 23456, 2367, 3892,
  5438, 37824, 23, 2897, 3456, 7690, 6022, 3678, 9431, 2890
)
data1 <- data.frame(firm1, year1, industry1, dummy1, dimension1)
data1
# Give all three pieces the same column names so they can be stacked.
names(data1) <- c("firm", "year", "industry", "dummy", "dimension")

# Industry 30: firms 11-15, years 2001-2003.
firm2 <- rep(11:15, each = 3)
year2 <- rep(2001:2003, times = 5)
industry2 <- rep(30, times = 15)
dummy2 <- c(0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1)
dimension2 <- c(
  12456, 781, 32489, 2345, 5754, 8976, 3245, 2120,
  345, 2341, 5678, 10900, 12900, 123, 2345
)
data2 <- data.frame(firm2, year2, industry2, dummy2, dimension2)
data2
names(data2) <- c("firm", "year", "industry", "dummy", "dimension")

# Industry 40: firms 16-20, years 2001-2004.
firm3 <- rep(16:20, each = 4)
year3 <- rep(2001:2004, times = 5)
industry3 <- rep(40, times = 20)
dummy3 <- c(0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0)
dimension3 <- c(
  23456, 1181, 32489, 2345, 6754, 8976, 3245, 1234, 1288, 1200,
  2345, 2765, 389, 23456, 2367, 3892, 6438, 24824, 23, 2897
)
data3 <- data.frame(firm3, year3, industry3, dummy3, dimension3)
data3
names(data3) <- c("firm", "year", "industry", "dummy", "dimension")

# Stack the three industries, then sort by year, industry and dimension.
final1 <- rbind(data1, data2)
final2 <- rbind(final1, data3)
final2
final3 <- final2[with(final2, order(year, industry, dimension)), ]
final3



Thank you very much,
Cecília Carmo

Universidade de Aveiro - Portugal

	[[alternative HTML version deleted]]

Cecilia Carmo

2013-Jun-07 13:56 UTC

head link

[R] matched samples, dataframe, panel data

Again my problem, better explained.



#I have a panel data set of thousands of firms, by year and industry, and
#one dummy variable that identifies one kind of firm (1 if the firm has an
#auditor; 0 if not),
#and another variable that represents the firm dimension (total assets in
#thousands of euros).
#I need to create two separate samples with the same number of firms, where
#each firm in the first has a corresponding firm in the second with the same
#year, industry and dimension (the dimension doesn't need to be exactly the
#same; it could vary in an interval of +/- 10%, for example)



#My reproducible example

# --- industry 20: ten firms, five yearly observations each ---
firm1 <- rep(seq_len(10), each = 5)
year1 <- rep(2000:2004, 10)
industry1 <- rep(20, 50)
dummy1 <- c(0, 0, 1, 1, 0,   0, 1, 1, 0, 1,   1, 1, 0, 0, 0,   0, 0, 0, 1, 1,
            1, 1, 0, 0, 0,   0, 0, 0, 0, 0,   1, 0, 1, 0, 1,   1, 1, 1, 1, 0,
            0, 1, 0, 0, 0,   0, 0, 1, 1, 1)
dimension1 <- c(2120, 345, 2341, 5678, 10900,
                4890, 2789, 3412, 9500, 8765,
                4532, 6593, 12900, 123, 2345,
                3178, 2678, 6666, 647, 23789,
                2189, 4289, 8543, 637, 23456,
                781, 35489, 2345, 5754, 8976,
                3245, 1234, 25, 1200, 2345,
                2765, 389, 23456, 2367, 3892,
                5438, 37824, 23, 2897, 3456,
                7690, 6022, 3678, 9431, 2890)
data1 <- data.frame(firm1 = firm1, year1 = year1, industry1 = industry1,
                    dummy1 = dummy1, dimension1 = dimension1)
data1
data1 <- setNames(data1, c("firm", "year", "industry", "dummy", "dimension"))

# --- industry 30: five firms, three yearly observations each ---
firm2 <- rep(11:15, each = 3)
year2 <- rep(2001:2003, 5)
industry2 <- rep(30, 15)
dummy2 <- c(0, 0, 0,   0, 0, 0,   1, 1, 1,   1, 1, 1,   1, 0, 1)
dimension2 <- c(12456, 781, 32489,
                2345, 5754, 8976,
                3245, 2120, 345,
                2341, 5678, 10900,
                12900, 123, 2345)
data2 <- data.frame(firm2 = firm2, year2 = year2, industry2 = industry2,
                    dummy2 = dummy2, dimension2 = dimension2)
data2
data2 <- setNames(data2, c("firm", "year", "industry", "dummy", "dimension"))

# --- industry 40: five firms, four yearly observations each ---
firm3 <- rep(16:20, each = 4)
year3 <- rep(2001:2004, 5)
industry3 <- rep(40, 20)
dummy3 <- c(0, 0, 1, 0,   1, 0, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   0, 1, 0, 0)
dimension3 <- c(23456, 1181, 32489, 2345,
                6754, 8976, 3245, 1234,
                1288, 1200, 2345, 2765,
                389, 23456, 2367, 3892,
                6438, 24824, 23, 2897)
data3 <- data.frame(firm3 = firm3, year3 = year3, industry3 = industry3,
                    dummy3 = dummy3, dimension3 = dimension3)
data3
data3 <- setNames(data3, c("firm", "year", "industry", "dummy", "dimension"))

# Combine the pieces and order the rows by year, then industry, then dimension.
final1 <- rbind(data1, data2)
final2 <- rbind(final1, data3)
final2
final3 <- final2[
  do.call(order, unname(as.list(final2[c("year", "industry", "dimension")]))),
]
final3





#So my data is final3 is like this:

   firm year industry dummy dimension
26    6 2000       20     0       781
1     1 2000       20     0      2120
21    5 2000       20     1      2189
36    8 2000       20     1      2765
16    4 2000       20     0      3178
31    7 2000       20     1      3245
11    3 2000       20     1      4532
6     2 2000       20     0      4890
41    9 2000       20     0      5438
46   10 2000       20     0      7690
2     1 2001       20     0       345
37    8 2001       20     1       389
32    7 2001       20     0      1234
17    4 2001       20     0      2678
7     2 2001       20     1      2789
22    5 2001       20     1      4289
47   10 2001       20     0      6022
12    3 2001       20     1      6593
27    6 2001       20     0     35489
42    9 2001       20     1     37824
60   14 2001       30     1      2341
54   12 2001       30     0      2345
57   13 2001       30     1      3245
51   11 2001       30     0     12456
63   15 2001       30     1     12900
78   19 2001       40     1       389
74   18 2001       40     1      1288
82   20 2001       40     0      6438
70   17 2001       40     1      6754
66   16 2001       40     0     23456
43    9 2002       20     0        23
33    7 2002       20     1        25
3     1 2002       20     1      2341
28    6 2002       20     0      2345
8     2 2002       20     1      3412
48   10 2002       20     1      3678
18    4 2002       20     0      6666
23    5 2002       20     0      8543
13    3 2002       20     0     12900
38    8 2002       20     1     23456
64   15 2002       30     0       123
52   11 2002       30     0       781
58   13 2002       30     1      2120
61   14 2002       30     1      5678
55   12 2002       30     0      5754
67   16 2002       40     0      1181
75   18 2002       40     1      1200
71   17 2002       40     0      8976
79   19 2002       40     0     23456
83   20 2002       40     1     24824
14    3 2003       20     0       123
24    5 2003       20     0       637
19    4 2003       20     1       647
34    7 2003       20     0      1200
39    8 2003       20     1      2367
44    9 2003       20     0      2897
4     1 2003       20     1      5678
29    6 2003       20     0      5754
49   10 2003       20     1      9431
9     2 2003       20     0      9500
59   13 2003       30     1       345
65   15 2003       30     1      2345
56   12 2003       30     0      8976
62   14 2003       30     1     10900
53   11 2003       30     0     32489
84   20 2003       40     0        23
76   18 2003       40     1      2345
80   19 2003       40     0      2367
72   17 2003       40     1      3245
68   16 2003       40     1     32489
15    3 2004       20     0      2345
35    7 2004       20     1      2345
50   10 2004       20     1      2890
45    9 2004       20     0      3456
40    8 2004       20     0      3892
10    2 2004       20     1      8765
30    6 2004       20     0      8976
5     1 2004       20     0     10900
25    5 2004       20     0     23456
20    4 2004       20     1     23789
73   17 2004       40     0      1234
69   16 2004       40     0      2345
77   18 2004       40     1      2765
85   20 2004       40     0      2897
81   19 2004       40     0      3892



I want to keep pairs of firms, one with dummy=1 and the other with dummy=0, that
match in year, industry and dimension.

But dimension doesn't need to be exactly the same, it is why I refer an
interval of + or - 10%.

For example, firm 1 matches firm 5, because they have the same year,
industry and dimension (10% x 2120 = 212 and 2189-2120 < 212),
and firm 1 is dummy=0 and firm 5 is dummy=1.

So I want to delete firm 6 because it doesn't match any firm, and keep
firms 1 and 5.

     firm year industry dummy dimension
26    6 2000       20     0       781
1     1 2000       20     0      2120
21    5 2000       20     1      2189



Next,



Now I can match firm 4 with firm 7 and delete firm 8.
36    8 2000       20     1      2765
16    4 2000       20     0      3178
31    7 2000       20     1      3245



And so on...

At the end I want to keep only pairs of firms, matched by year, industry and
dimension.

If I separate firms with dummy=1 from firms with dummy=0 in two separated
dataframes, I have two matched samples
with the same number of observations. That's what I want.



Thank you,

Cecília Carmo
Universidade de Aveiro - Portugal





	[[alternative HTML version deleted]]

arun

2013-Jun-07 15:43 UTC

head link

[R] matched samples, dataframe, panel data

Hi,
Not sure if this is what you wanted.

# Flatten lst6 (a list of year.industry cells, each a list of two-row
# data frames) into one data frame with rows renumbered from 1.
res <- do.call(rbind, lapply(lst6, function(x) do.call(rbind, x)))
row.names(res) <- seq_len(nrow(res))

# this combines the list of lists to a data.frame
res[1:4, ]
#  firm year industry dummy dimension
#1    1 2000       20     0      2120
#2    5 2000       20     1      2189
#3    4 2000       20     0      3178
#4    7 2000       20     1      3245


#or
# Same flattening, but before renumbering keep the leading "year.industry"
# part of the generated row names as a `group` column.
res <- do.call(rbind, lapply(lst6, function(x) do.call(rbind, x)))
res$group <- gsub("(.*\\..*)\\..*$", "\\1", rownames(res))
row.names(res) <- seq_len(nrow(res))
res[1:4, ]
#  firm year industry dummy dimension   group
#1    1 2000       20     0      2120 2000.20   #1 group
#2    5 2000       20     1      2189 2000.20   #1
#3    4 2000       20     0      3178 2000.20   #2
#4    7 2000       20     1      3245 2000.20   #2
A.K.


----- Original Message -----
From: Cecilia Carmo <cecilia.carmo at ua.pt>
To: arun <smartpink111 at yahoo.com>
Cc: 
Sent: Friday, June 7, 2013 11:33 AM
Subject: RE: [R] matched samples, dataframe, panel data

Thank you very much. 
Just a little thing: how can I put it like a dataframe?

Thanks,

Cecília

________________________________________
De: arun [smartpink111 at yahoo.com]
Enviado: sexta-feira, 7 de Junho de 2013 16:27
Para: Cecilia Carmo
Assunto: Re: [R] matched samples, dataframe, panel data

Hi,
There could be easier ways...? I am a bit busy now to try other ways.



----- Original Message -----
From: arun <smartpink111 at yahoo.com>
To: Cecilia Carmo <cecilia.carmo at ua.pt>
Cc: R help <r-help at r-project.org>
Sent: Friday, June 7, 2013 11:25 AM
Subject: Re: [R] matched samples, dataframe, panel data

Hi,
May be this helps:
# Split the panel into one data frame per year.industry combination and drop
# the combinations that have no rows.
lst1 <- split(final3, list(final3$year, final3$industry))
lst2 <- lst1[vapply(lst1, nrow, integer(1)) > 0]
# For every firm in a cell, collect the rows whose +/- 10% band around their
# own dimension contains that firm's dimension (the firm itself always does).
lst3 <- lapply(lst2, function(x) {
  lapply(x$dimension, function(y) {
    x[(y < (x$dimension + x$dimension * 0.1)) &
        (y > (x$dimension - x$dimension * 0.1)), ]
  })
})
# Keep only candidate sets that are a true matched pair: exactly two rows,
# one with dummy == 1 and one with dummy == 0. Without the dummy check two
# firms of the same kind (e.g. 3412 and 3678 in 2002.20) would be paired.
lst4 <- lapply(lst3, function(x) {
  x[vapply(x, function(d) nrow(d) == 2 && sum(d$dummy) == 1, logical(1))]
})
# Each pair is found once from either member; keep one copy.
lst5 <- lapply(lst4, function(x) x[!duplicated(x)])
# Drop cells that ended up without any pair.
lst6 <- lst5[vapply(lst5, length, integer(1)) > 0]

# Cells (year.industry) that contain at least one matched pair.
names(lst6)
# [1] "2000.20" "2001.20" "2002.20" "2003.20" "2004.20" "2001.30" "2002.30"
# [8] "2001.40" "2002.40" "2003.40" "2004.40"


# The pairs found for year 2000, industry 20.
lst6["2000.20"]
#$`2000.20`
#$`2000.20`[[1]]
#   firm year industry dummy dimension
#1     1 2000       20     0      2120
#21    5 2000       20     1      2189
#
#$`2000.20`[[2]]
#   firm year industry dummy dimension
#16    4 2000       20     0      3178
#31    7 2000       20     1      3245
#
#$`2000.20`[[3]]
#   firm year industry dummy dimension
#11    3 2000       20     1      4532
#6     2 2000       20     0      4890
A.K.





________________________________
From: Cecilia Carmo <cecilia.carmo at ua.pt>
To: "r-help at r-project.org" <r-help at r-project.org>
Cc: "smartpink111 at yahoo.com" <smartpink111 at yahoo.com>
Sent: Friday, June 7, 2013 9:56 AM
Subject: Re: [R] matched samples, dataframe, panel data




Again my problem, better explained.

#I have a panel data set of thousands of firms, by year and industry, and
#one dummy variable that identifies one kind of firm (1 if the firm has an
#auditor; 0 if not),
#and another variable that represents the firm dimension (total assets in
#thousands of euros).
#I need to create two separate samples with the same number of firms, where
#each firm in the first has a corresponding firm in the second with the same
#year, industry and dimension (the dimension doesn't need to be exactly the
#same; it could vary in an interval of +/- 10%, for example)

#My reproducible example
# Piece 1 - industry 20, firms 1..10, years 2000..2004
firm1 <- rep(1:10, times = rep(5, 10))
year1 <- rep(2000:2004, 10)
industry1 <- rep(20, 50)
dummy1 <- c(0,0,1,1,0,0,1,1,0,1,1,1,0,0,0,0,0,0,1,1,1,1,0,0,0,
            0,0,0,0,0,1,0,1,0,1,1,1,1,1,0,0,1,0,0,0,0,0,1,1,1)
dimension1 <- c(2120,345,2341,5678,10900,4890,2789,3412,9500,8765,4532,6593,12900,
                123,2345,3178,2678,6666,647,23789,2189,4289,8543,637,23456,781,
                35489,2345,5754,8976,3245,1234,25,1200,2345,2765,389,23456,2367,
                3892,5438,37824,23,2897,3456,7690,6022,3678,9431,2890)
data1 <- data.frame(firm1, year1, industry1, dummy1, dimension1)
data1
names(data1) <- c("firm","year","industry","dummy","dimension")

# Piece 2 - industry 30, firms 11..15, years 2001..2003
firm2 <- rep(11:15, times = rep(3, 5))
year2 <- rep(2001:2003, 5)
industry2 <- rep(30, 15)
dummy2 <- c(0,0,0,0,0,0,1,1,1,1,1,1,1,0,1)
dimension2 <- c(12456,781,32489,2345,5754,8976,3245,2120,345,2341,5678,10900,
                12900,123,2345)
data2 <- data.frame(firm2, year2, industry2, dummy2, dimension2)
data2
names(data2) <- c("firm","year","industry","dummy","dimension")

# Piece 3 - industry 40, firms 16..20, years 2001..2004
firm3 <- rep(16:20, times = rep(4, 5))
year3 <- rep(2001:2004, 5)
industry3 <- rep(40, 20)
dummy3 <- c(0,0,1,0,1,0,1,0,1,1,1,1,1,0,0,0,0,1,0,0)
dimension3 <- c(23456,1181,32489,2345,6754,8976,3245,1234,1288,1200,2345,2765,
                389,23456,2367,3892,6438,24824,23,2897)
data3 <- data.frame(firm3, year3, industry3, dummy3, dimension3)
data3
names(data3) <- c("firm","year","industry","dummy","dimension")

# Stack the pieces; final3 is the stacked panel sorted by year, industry, dimension.
final1 <- rbind(data1, data2)
final2 <- rbind(final1, data3)
final2
final3 <- final2[order(final2[["year"]], final2[["industry"]], final2[["dimension"]]), ]
final3


#So my data is final3 is like this:
?  firm year industry dummy dimension
26? ? 6 2000? ? ?  20? ?  0? ? ?  781
1? ?  1 2000? ? ?  20? ?  0? ? ? 2120
21? ? 5 2000? ? ?  20? ?  1? ? ? 2189
36? ? 8 2000? ? ?  20? ?  1? ? ? 2765
16? ? 4 2000? ? ?  20? ?  0? ? ? 3178
31? ? 7 2000? ? ?  20? ?  1? ? ? 3245
11? ? 3 2000? ? ?  20? ?  1? ? ? 4532
6? ?  2 2000? ? ?  20? ?  0? ? ? 4890
41? ? 9 2000? ? ?  20? ?  0? ? ? 5438
46?  10 2000? ? ?  20? ?  0? ? ? 7690
2? ?  1 2001? ? ?  20? ?  0? ? ?  345
37? ? 8 2001? ? ?  20? ?  1? ? ?  389
32? ? 7 2001? ? ?  20? ?  0? ? ? 1234
17? ? 4 2001? ? ?  20? ?  0? ? ? 2678
7? ?  2 2001? ? ?  20? ?  1? ? ? 2789
22? ? 5 2001? ? ?  20? ?  1? ? ? 4289
47?  10 2001? ? ?  20? ?  0? ? ? 6022
12? ? 3 2001? ? ?  20? ?  1? ? ? 6593
27? ? 6 2001? ? ?  20? ?  0? ?  35489
42? ? 9 2001? ? ?  20? ?  1? ?  37824
60?  14 2001? ? ?  30? ?  1? ? ? 2341
54?  12 2001? ? ?  30? ?  0? ? ? 2345
57?  13 2001? ? ?  30? ?  1? ? ? 3245
51?  11 2001? ? ?  30? ?  0? ?  12456
63?  15 2001? ? ?  30? ?  1? ?  12900
78?  19 2001? ? ?  40? ?  1? ? ?  389
74?  18 2001? ? ?  40? ?  1? ? ? 1288
82?  20 2001? ? ?  40? ?  0? ? ? 6438
70?  17 2001? ? ?  40? ?  1? ? ? 6754
66?  16 2001? ? ?  40? ?  0? ?  23456
43? ? 9 2002? ? ?  20? ?  0? ? ? ? 23
33? ? 7 2002? ? ?  20? ?  1? ? ? ? 25
3? ?  1 2002? ? ?  20? ?  1? ? ? 2341
28? ? 6 2002? ? ?  20? ?  0? ? ? 2345
8? ?  2 2002? ? ?  20? ?  1? ? ? 3412
48?  10 2002? ? ?  20? ?  1? ? ? 3678
18? ? 4 2002? ? ?  20? ?  0? ? ? 6666
23? ? 5 2002? ? ?  20? ?  0? ? ? 8543
13? ? 3 2002? ? ?  20? ?  0? ?  12900
38? ? 8 2002? ? ?  20? ?  1? ?  23456
64?  15 2002? ? ?  30? ?  0? ? ?  123
52?  11 2002? ? ?  30? ?  0? ? ?  781
58?  13 2002? ? ?  30? ?  1? ? ? 2120
61?  14 2002? ? ?  30? ?  1? ? ? 5678
55?  12 2002? ? ?  30? ?  0? ? ? 5754
67?  16 2002? ? ?  40? ?  0? ? ? 1181
75?  18 2002? ? ?  40? ?  1? ? ? 1200
71?  17 2002? ? ?  40? ?  0? ? ? 8976
79?  19 2002? ? ?  40? ?  0? ?  23456
83?  20 2002? ? ?  40? ?  1? ?  24824
14? ? 3 2003? ? ?  20? ?  0? ? ?  123
24? ? 5 2003? ? ?  20? ?  0? ? ?  637
19? ? 4 2003? ? ?  20? ?  1? ? ?  647
34? ? 7 2003? ? ?  20? ?  0? ? ? 1200
39? ? 8 2003? ? ?  20? ?  1? ? ? 2367
44? ? 9 2003? ? ?  20? ?  0? ? ? 2897
4? ?  1 2003? ? ?  20? ?  1? ? ? 5678
29? ? 6 2003? ? ?  20? ?  0? ? ? 5754
49?  10 2003? ? ?  20? ?  1? ? ? 9431
9? ?  2 2003? ? ?  20? ?  0? ? ? 9500
59?  13 2003? ? ?  30? ?  1? ? ?  345
65?  15 2003? ? ?  30? ?  1? ? ? 2345
56?  12 2003? ? ?  30? ?  0? ? ? 8976
62?  14 2003? ? ?  30? ?  1? ?  10900
53?  11 2003? ? ?  30? ?  0? ?  32489
84?  20 2003? ? ?  40? ?  0? ? ? ? 23
76?  18 2003? ? ?  40? ?  1? ? ? 2345
80?  19 2003? ? ?  40? ?  0? ? ? 2367
72?  17 2003? ? ?  40? ?  1? ? ? 3245
68?  16 2003? ? ?  40? ?  1? ?  32489
15? ? 3 2004? ? ?  20? ?  0? ? ? 2345
35? ? 7 2004? ? ?  20? ?  1? ? ? 2345
50?  10 2004? ? ?  20? ?  1? ? ? 2890
45? ? 9 2004? ? ?  20? ?  0? ? ? 3456
40? ? 8 2004? ? ?  20? ?  0? ? ? 3892
10? ? 2 2004? ? ?  20? ?  1? ? ? 8765
30? ? 6 2004? ? ?  20? ?  0? ? ? 8976
5? ?  1 2004? ? ?  20? ?  0? ?  10900
25? ? 5 2004? ? ?  20? ?  0? ?  23456
20? ? 4 2004? ? ?  20? ?  1? ?  23789
73?  17 2004? ? ?  40? ?  0? ? ? 1234
69?  16 2004? ? ?  40? ?  0? ? ? 2345
77?  18 2004? ? ?  40? ?  1? ? ? 2765
85?  20 2004? ? ?  40? ?  0? ? ? 2897
81?  19 2004? ? ?  40? ?  0? ? ? 3892

I want to keep pairs of firms, one with dummy=1 and the other with dummy=0, that
match in year, industry and dimension.

But dimension doesn't need to be exactly the same, it is why I refer an
interval of + or - 10%.

For example firm 1 matchs with firm 5, because they have the same year,
industry, dimension (10% x 2120 = 212 and 2189-2120<212)
and firm 1 is dummy=0 and firm 5 is dummy=1.

So I want to delete firm 6 because it doesn't macth with any firm, and keep
firm 1 and 5.

? ?  firm year industry dummy dimension
26? ? 6 2000? ? ?  20? ?  0? ? ?  781
1? ?  1 2000? ? ?  20? ?  0? ? ? 2120
21? ? 5 2000? ? ?  20? ?  1? ? ? 2189

Next,

Now I can match firm 4 with firm 7 and delete firm 8.
36? ? 8 2000? ? ?  20? ?  1? ? ? 2765
16? ? 4 2000? ? ?  20? ?  0? ? ? 3178
31? ? 7 2000? ? ?  20? ?  1? ? ? 3245

And so on...

At the end I want to keep only pairs of firms, matched by year, industry and
dimension.

If I separate firms with dummy=1 from firms with dummy=0 in two separated
dataframes, I have two matched samples
with the same number of observations. That's what I want.

Thank you,
Cec?lia Carmo
Universidade de Aveiro - Portugal

arun

2013-Jun-14 22:05 UTC

head link

[R] matched samples, dataframe, panel data

Hi,
I changed the fun1().  Now, it should be possible to get all the possible
combinations within each group.


# Load the two data sets used below. Both paths are relative, so the files
# are looked up in the current working directory.
final3New <- read.table(file = "real_data_cecilia.txt", sep = "\t", header = TRUE)
final3New1 <- read.csv("real_data_cecilia_new.csv")
# fun1New: list every candidate matched pair inside each year/industry cell.
#
# Arguments:
#   dat     - data frame with columns year, industry, dummy and dimension.
#   percent - relative tolerance: the dimension of the first row of a pair
#             must lie strictly inside dimension[2] * (1 - percent) and
#             dimension[2] * (1 + percent). Only this direction is tested.
#   number  - absolute tolerance: the difference between the two dimensions
#             must be smaller than this value.
#
# Returns a data frame with the columns of `dat` in which rows 2k-1 and 2k
# form one pair (one row with dummy == 1, one with dummy == 0), renumbered
# from 1. A row is repeated once for every pair it takes part in. When no
# pair qualifies, a zero-row slice of `dat` is returned.
fun1New <- function(dat, percent, number) {
  # One data frame per year/industry cell; a cell needs at least two rows.
  lst1 <- split(dat, list(dat$year, dat$industry))
  lst2 <- lst1[vapply(lst1, nrow, integer(1)) > 1]

  # Every two-row combination of a cell. The scalar condition is recycled,
  # so a combination is kept whole when its dummies sum to 1 and becomes a
  # zero-row data frame otherwise.
  lst3 <- lapply(lst2, function(x) {
    CombN1 <- combn(seq_len(nrow(x)), 2)
    lapply(split(CombN1, col(CombN1)), function(y) {
      x1 <- x[y, ]
      x1[sum(x1$dummy) == 1, ]
    })
  })
  lst4 <- lapply(lst3, function(x) x[vapply(x, nrow, integer(1)) > 0])
  lst5 <- lst4[vapply(lst4, length, integer(1)) > 0]

  # Apply the absolute and the relative tolerance to each remaining pair.
  lst6 <- lapply(lst5, function(x) {
    lapply(x, function(y) {
      x1 <- abs(diff(y$dimension)) < number
      x2 <- y$dimension[2] + (y$dimension[2] * percent)
      x3 <- y$dimension[2] - (y$dimension[2] * percent)
      x4 <- (y$dimension[1] < x2) & (y$dimension[1] > x3)
      y[x4 & x1, ]
    })
  })
  lst7 <- lapply(lst6, function(x) x[vapply(x, nrow, integer(1)) > 0])
  lst8 <- lst7[vapply(lst7, length, integer(1)) > 0]

  # Nothing matched: rbind() over an empty list gives NULL, which cannot be
  # renumbered, so return an empty data frame with the input's columns.
  if (length(lst8) == 0) {
    return(dat[0, , drop = FALSE])
  }

  res <- do.call(rbind, lapply(lst8, function(x) do.call(rbind, x)))
  row.names(res) <- seq_len(nrow(res))
  res
}
##Applying fun1New
# Each call is followed by the dimensions of its result; the commented
# numbers are the outputs reported in the original post.
res5Percent <- fun1New(final3New, 0.05, 50)
dim(res5Percent)
#[1] 718   5
res5PercentHigh <- fun1New(final3New, 0.05, 500000)
dim(res5PercentHigh)
#[1] 2788    5

res5Percent1 <- fun1New(final3New1, 0.05, 50)
dim(res5Percent1)
#[1] 870   5
res5Percent1High <- fun1New(final3New1, 0.05, 500000)
dim(res5Percent1High)
#[1] 2902    5

res10Percent <- fun1New(final3New, 0.10, 200)
dim(res10Percent)
#[1] 2928    5
res10Percent1 <- fun1New(final3New1, 0.10, 200)
dim(res10Percent1)
#[1] 3092    5

# fun3: remove repeated rows from a pair table, together with their partners.
#
# `dat` is expected in the layout fun1New() returns: rows 2k-1 and 2k form
# one matched pair. Every row that duplicates an earlier row is dropped along
# with the other row of its pair, so only complete pairs remain. The partner
# is found from the row position, so it does not matter whether the
# dummy == 1 row is first or second in a pair. When nothing is duplicated
# `dat` is returned unchanged.
fun3 <- function(dat) {
  dup <- which(duplicated(dat))
  if (length(dup) == 0) {
    # Negative indexing with an empty vector would select zero rows rather
    # than all of them, so return early.
    return(dat)
  }
  # Partner of an odd row is the next row; of an even row, the previous one.
  partner <- ifelse(dup %% 2 == 1, dup + 1, dup - 1)
  dat[-unique(c(dup, partner)), ]
}




#Applying fun3()
# Commented numbers are the outputs reported in the original post.
res5F3 <- fun3(res5Percent)
dim(res5F3)
#[1] 278   5

res5F3High <- fun3(res5PercentHigh)
dim(res5F3High)
#[1] 546   5

res5F3_1 <- fun3(res5Percent1)
dim(res5F3_1)
#[1] 302   5
res5F3High_1 <- fun3(res5Percent1High)
dim(res5F3High_1)
#[1] 570   5

res10F3 <- fun3(res10Percent)
dim(res10F3)
#[1] 462   5
res10F3_1 <- fun3(res10Percent1)
dim(res10F3_1)
#[1] 474   5

# Both halves of each result should have the same number of rows.
nrow(subset(res5F3, dummy == 0))
#[1] 139
nrow(subset(res5F3, dummy == 1))
#[1] 139


nrow(subset(res5F3High, dummy == 1))
#[1] 273
nrow(subset(res5F3High, dummy == 0))
#[1] 273


nrow(subset(res10F3, dummy == 0))
#[1] 231
nrow(subset(res10F3, dummy == 1))
#[1] 231
nrow(subset(res10F3_1, dummy == 1))
#[1] 237
nrow(subset(res10F3_1, dummy == 0))
#[1] 237

# No repeated rows are left.
dim(unique(res5F3))
#[1] 278   5
dim(unique(res5F3High))
#[1] 546   5

dim(unique(res10F3_1))
#[1] 474   5
dim(unique(res10F3))
#[1] 462   5
A.K.



________________________________
From: Cecilia Carmo <cecilia.carmo at ua.pt>
To: arun <smartpink111 at yahoo.com> 
Sent: Friday, June 14, 2013 10:44 AM
Subject: me again




There are some matches that are missing. That is, it is possible to have more matches.
I'm sending you a script and the data.

Thank you.
Cecília

arun

2013-Jun-17 01:33 UTC

head link

[R] matched samples, dataframe, panel data

Hi,
This gives me more combinations than you got with SAS.  Also, this selects the
one with minimum dimension between duplicates.


# Load the two data sets used below; the relative paths are resolved against
# the current working directory.
final3New <- read.table(file = "real_data_cecilia.txt", sep = "\t", header = TRUE)
final3New1 <- read.csv("real_data_cecilia_new.csv")

# fun3: remove repeated rows from a pair table, together with their partners.
#
# `dat` is expected to hold matched pairs in consecutive rows (rows 2k-1 and
# 2k belong together). Every row that duplicates an earlier row is dropped
# along with the other row of its pair. The partner is found from the row
# position, so the order of the two rows inside a pair does not matter.
#
# Returns `dat` unchanged when it has no duplicated rows; otherwise the
# remaining rows, with row names taken from the positions 1..nrow(dat) the
# rows had in the input.
fun3 <- function(dat) {
  dup <- which(duplicated(dat))
  if (length(dup) == 0) {
    return(dat)
  }
  row.names(dat) <- seq_len(nrow(dat))
  # Partner of an odd row is the next row; of an even row, the previous one.
  partner <- ifelse(dup %% 2 == 1, dup + 1, dup - 1)
  dat[-unique(c(dup, partner)), ]
}

# fun1New: build matched pairs (one dummy == 1 row, one dummy == 0 row) inside
# each year/industry cell, choosing for each dummy == 1 dimension value the
# dummy == 0 candidate with the closest dimension.
#
# Arguments:
#   dat     - data frame with columns year, industry, dummy and dimension.
#   percent - relative tolerance: a pair qualifies when either dimension lies
#             strictly inside the other one * (1 - percent) .. (1 + percent).
#   number  - absolute tolerance: the difference between the two dimensions
#             must be smaller than this value.
#
# Returns a data frame with the columns of `dat`, renumbered from 1, in which
# each dummy == 1 row is followed by its chosen dummy == 0 partner. Repeated
# rows are then removed per cell by fun3(). When no pair qualifies, a zero-row
# slice of `dat` is returned.
#
# NOTE(review): dummy == 1 rows are grouped by their dimension value, so two
# different dummy == 1 firms with an identical dimension in one cell are
# treated as one - confirm this is intended.
fun1New <- function(dat, percent, number) {
  # One data frame per year/industry cell; a cell needs at least two rows.
  lst1 <- split(dat, list(dat$year, dat$industry))
  lst2 <- lst1[vapply(lst1, nrow, integer(1)) > 1]

  # Every two-row combination of a cell, kept whole when its dummies sum to 1
  # and reduced to zero rows otherwise (the scalar condition is recycled).
  lst3 <- lapply(lst2, function(x) {
    CombN1 <- combn(seq_len(nrow(x)), 2)
    lapply(split(CombN1, col(CombN1)), function(y) {
      x1 <- x[y, ]
      x1[sum(x1$dummy) == 1, ]
    })
  })
  lst4 <- lapply(lst3, function(x) x[vapply(x, nrow, integer(1)) > 0])
  lst5 <- lst4[vapply(lst4, length, integer(1)) > 0]

  # Absolute tolerance (x1) plus the relative tolerance tested in both
  # directions (x6: row 1 against row 2, x7: row 2 against row 1).
  lst6 <- lapply(lst5, function(x) {
    lapply(x, function(y) {
      x1 <- abs(diff(y$dimension)) < number
      x2 <- y$dimension[2] + (y$dimension[2] * percent)
      x3 <- y$dimension[2] - (y$dimension[2] * percent)
      x4 <- y$dimension[1] + (y$dimension[1] * percent)
      x5 <- y$dimension[1] - (y$dimension[1] * percent)
      x6 <- (y$dimension[1] < x2) & (y$dimension[1] > x3)
      x7 <- (y$dimension[2] < x4) & (y$dimension[2] > x5)
      y[((x6 & x1) | (x7 & x1)), ]
    })
  })
  lst7 <- lapply(lst6, function(x) x[vapply(x, nrow, integer(1)) > 0])
  lst8 <- lst7[vapply(lst7, length, integer(1)) > 0]

  # Nothing matched: rbind() over an empty list gives NULL, which cannot be
  # renumbered, so return an empty data frame with the input's columns.
  if (length(lst8) == 0) {
    return(dat[0, , drop = FALSE])
  }

  # Per cell: stack the pairs so that rows 2k-1 and 2k belong together.
  lst9 <- lapply(lst8, function(x) do.call(rbind, x))
  lst10 <- lapply(lst9, function(x) {
    row.names(x) <- seq_len(nrow(x))
    x1 <- x[x$dummy == 1, ]
    do.call(rbind, lapply(split(x1, x1$dimension), function(y) {
      # Row positions of this dummy == 1 entry and of its pair partners.
      # The pairs keep the row order of `dat`, so the dummy == 1 row can be
      # either the first or the second row of a pair.
      own <- as.numeric(row.names(y))
      partner <- ifelse(own %% 2 == 1, own + 1, own - 1)
      x3 <- x[partner, ]
      # Keep the partner whose dimension is closest.
      x4 <- x3[which.min(abs(y$dimension[1] - x3$dimension)), ]
      rbind(y[1, ], x4)
    }))
  })
  res <- do.call(rbind, lapply(lst10, fun3))
  row.names(res) <- seq_len(nrow(res))
  res
}


####1st dataset
# Commented numbers are the outputs reported in the original post.

res10PercentHigh <- fun1New(final3New, 0.10, 500000000)
dim(res10PercentHigh)
#[1] 764   5
dim(unique(res10PercentHigh))
#[1] 764   5
nrow(subset(res10PercentHigh, dummy == 0))
#[1] 382
nrow(subset(res10PercentHigh, dummy == 1))
#[1] 382
res10PercentLow <- fun1New(final3New, 0.10, 50)
dim(res10PercentLow)
#[1] 294   5
dim(unique(res10PercentLow))
#[1] 294   5
nrow(subset(res10PercentLow, dummy == 0))
#[1] 147
nrow(subset(res10PercentLow, dummy == 1))
#[1] 147

res5PercentHigh <- fun1New(final3New, 0.05, 500000000)
dim(res5PercentHigh)
#[1] 630   5
dim(unique(res5PercentHigh))
#[1] 630   5
nrow(subset(res5PercentHigh, dummy == 0))
#[1] 315
nrow(subset(res5PercentHigh, dummy == 1))
#[1] 315

res5PercentLow <- fun1New(final3New, 0.05, 50)
dim(res5PercentLow)
#[1] 294   5

dim(unique(res5PercentLow))
#[1] 294   5
nrow(subset(res5PercentLow, dummy == 0))
#[1] 147
nrow(subset(res5PercentLow, dummy == 1))
#[1] 147

#######2nd dataset
# The result names of the first data set are reused (and overwritten) here.
res10PercentHigh <- fun1New(final3New1, 0.10, 500000000)
dim(res10PercentHigh)
#[1] 760   5
dim(unique(res10PercentHigh))
#[1] 760   5

nrow(subset(res10PercentHigh, dummy == 0))
#[1] 380
nrow(subset(res10PercentHigh, dummy == 1))
#[1] 380
res10PercentLow <- fun1New(final3New1, 0.10, 100)
dim(res10PercentLow)
#[1] 418   5

dim(unique(res10PercentLow))
#[1] 418   5
nrow(subset(res10PercentLow, dummy == 0))
#[1] 209
nrow(subset(res10PercentLow, dummy == 1))
#[1] 209


res5PercentHigh <- fun1New(final3New1, 0.05, 500000000)
dim(res5PercentHigh)
#[1] 640   5
dim(unique(res5PercentHigh))
#[1] 640   5

nrow(subset(res5PercentHigh, dummy == 0))
#[1] 320
nrow(subset(res5PercentHigh, dummy == 1))
#[1] 320
res5PercentLow <- fun1New(final3New1, 0.05, 50)
dim(res5PercentLow)
#[1] 310   5

dim(unique(res5PercentLow))
#[1] 310   5
nrow(subset(res5PercentLow, dummy == 0))
#[1] 155
nrow(subset(res5PercentLow, dummy == 1))
#[1] 155

res20PercentHigh <- fun1New(final3New1, 0.20, 500000000)
dim(res20PercentHigh)
#[1] 846   5

dim(unique(res20PercentHigh))
#[1] 846   5

nrow(subset(res20PercentHigh, dummy == 0))
#[1] 423
nrow(subset(res20PercentHigh, dummy == 1))
#[1] 423


A.K.

----- Original Message -----
From: Cecilia Carmo <cecilia.carmo at ua.pt>
To: arun <smartpink111 at yahoo.com>
Cc: 
Sent: Sunday, June 16, 2013 5:57 AM
Subject: RE: matched samples, dataframe, panel data

In the script I sent you, with the file that I sent with it and with the old
functions 1 and 2,
I got 350 combinations, and it was possible to have more.

Now with the new fun1 and fun3 I get fewer, so it is not OK, is it?
> res10Percent<- fun1New(final3New2,0.10,500000000)
> res10F3<- fun3(res10Percent)
> dim(res10F3)
[1] 600   5
> nrow(subset(res10F3,dummy==0))
[1] 300
> nrow(subset(res10F3,dummy==1))
[1] 300

Sorry for making you spending so much time. I thought it could be easier.

Cec?lia

________________________________________
De: arun [smartpink111 at yahoo.com]
Enviado: sexta-feira, 14 de Junho de 2013 23:09
Para: Cecilia Carmo
Assunto: Re: matched samples, dataframe, panel data

One thing I forgot to mention.? I used fun3() because i found fun2() still have
some problems with getting the correct dimensions.? You can check the results of
fun1() and fun3() and see if all the combinations are got.? Then, if I get
chance, I will correct fun2().
"""""
And you conclude that they are the same!
"""""""
Here, also I am not concluding anything.
A.K.


----- Original Message -----
From: arun <smartpink111 at yahoo.com>
To: Cecilia Carmo <cecilia.carmo at ua.pt>
Cc: R help <r-help at r-project.org>
Sent: Friday, June 14, 2013 6:05 PM
Subject: Re: matched samples, dataframe, panel data

Hi,
I changed the fun1().? Now, it should be possible to get all the possible
combinations within each group.


# Load the two data sets used below; the relative paths are resolved against
# the current working directory.
final3New <- read.table(file = "real_data_cecilia.txt", sep = "\t", header = TRUE)
final3New1 <- read.csv("real_data_cecilia_new.csv")
# fun1New: list every candidate matched pair inside each year/industry cell.
#
# Arguments:
#   dat     - data frame with columns year, industry, dummy and dimension.
#   percent - relative tolerance: the dimension of the first row of a pair
#             must lie strictly inside dimension[2] * (1 - percent) and
#             dimension[2] * (1 + percent). Only this direction is tested.
#   number  - absolute tolerance: the difference between the two dimensions
#             must be smaller than this value.
#
# Returns a data frame with the columns of `dat` in which rows 2k-1 and 2k
# form one pair (dummies summing to 1), renumbered from 1; a row is repeated
# once per pair it belongs to. When no pair qualifies, a zero-row slice of
# `dat` is returned.
fun1New <- function(dat, percent, number) {
  # Cells with fewer than two rows cannot contain a pair.
  lst1 <- split(dat, list(dat$year, dat$industry))
  lst2 <- lst1[vapply(lst1, nrow, integer(1)) > 1]

  # All two-row combinations per cell; the recycled scalar condition keeps a
  # combination whole when exactly one dummy is 1, else leaves zero rows.
  lst3 <- lapply(lst2, function(x) {
    CombN1 <- combn(seq_len(nrow(x)), 2)
    lapply(split(CombN1, col(CombN1)), function(y) {
      x1 <- x[y, ]
      x1[sum(x1$dummy) == 1, ]
    })
  })

  lst4 <- lapply(lst3, function(x) x[vapply(x, nrow, integer(1)) > 0])
  lst5 <- lst4[vapply(lst4, length, integer(1)) > 0]

  # Keep the pairs that pass both the absolute and the relative tolerance.
  lst6 <- lapply(lst5, function(x) {
    lapply(x, function(y) {
      x1 <- abs(diff(y$dimension)) < number
      x2 <- y$dimension[2] + (y$dimension[2] * percent)
      x3 <- y$dimension[2] - (y$dimension[2] * percent)
      x4 <- (y$dimension[1] < x2) & (y$dimension[1] > x3)
      y[x4 & x1, ]
    })
  })
  lst7 <- lapply(lst6, function(x) x[vapply(x, nrow, integer(1)) > 0])
  lst8 <- lst7[vapply(lst7, length, integer(1)) > 0]

  # With no surviving pair, rbind() would return NULL and the renumbering
  # below would fail; hand back an empty data frame instead.
  if (length(lst8) == 0) {
    return(dat[0, , drop = FALSE])
  }

  res <- do.call(rbind, lapply(lst8, function(x) {
    do.call(rbind, x)
  }))
  row.names(res) <- seq_len(nrow(res))
  res
}

##Applying fun1New
# Each call is followed by dim() of its result; the commented numbers are the
# outputs reported in the original post.
res5Percent<- fun1New(final3New,0.05,50)
dim(res5Percent)
#[1] 718   5
res5PercentHigh<- fun1New(final3New,0.05,500000)
dim(res5PercentHigh)
#[1] 2788    5

res5Percent1<- fun1New(final3New1,0.05,50)
dim(res5Percent1)
#[1] 870   5
res5Percent1High<- fun1New(final3New1,0.05,500000)
dim(res5Percent1High)
#[1] 2902    5

res10Percent<- fun1New(final3New,0.10,200)
dim(res10Percent)
#[1] 2928    5
res10Percent1<- fun1New(final3New1,0.10,200)
dim(res10Percent1)
#[1] 3092    5

# fun3: remove repeated rows from a pair table, together with their partners.
#
# `dat` is expected in the layout fun1New() returns: rows 2k-1 and 2k form
# one matched pair. A row that duplicates an earlier row is dropped together
# with the other row of its pair, whichever of the two comes first. When
# nothing is duplicated `dat` is returned unchanged.
fun3 <- function(dat) {
  dup <- which(duplicated(dat))
  if (length(dup) == 0) {
    # dat[-integer(0), ] would select zero rows, not all rows.
    return(dat)
  }
  # Odd rows pair with the row below, even rows with the row above.
  partner <- ifelse(dup %% 2 == 1, dup + 1, dup - 1)
  dat[-unique(c(dup, partner)), ]
}




#Applying fun3()
# Commented numbers are the outputs reported in the original post.
res5F3 <- fun3(res5Percent)
dim(res5F3)
#[1] 278   5

res5F3High <- fun3(res5PercentHigh)
dim(res5F3High)
#[1] 546   5

res5F3_1 <- fun3(res5Percent1)
dim(res5F3_1)
#[1] 302   5
res5F3High_1 <- fun3(res5Percent1High)
dim(res5F3High_1)
#[1] 570   5

res10F3 <- fun3(res10Percent)
dim(res10F3)
#[1] 462   5
res10F3_1 <- fun3(res10Percent1)
dim(res10F3_1)
#[1] 474   5

# Both halves of each result should have the same number of rows.
nrow(subset(res5F3, dummy == 0))
#[1] 139
nrow(subset(res5F3, dummy == 1))
#[1] 139


nrow(subset(res5F3High, dummy == 1))
#[1] 273
nrow(subset(res5F3High, dummy == 0))
#[1] 273


nrow(subset(res10F3, dummy == 0))
#[1] 231
nrow(subset(res10F3, dummy == 1))
#[1] 231
nrow(subset(res10F3_1, dummy == 1))
#[1] 237
nrow(subset(res10F3_1, dummy == 0))
#[1] 237

# No repeated rows are left.
dim(unique(res5F3))
#[1] 278   5
dim(unique(res5F3High))
#[1] 546   5

dim(unique(res10F3_1))
#[1] 474   5
dim(unique(res10F3))
#[1] 462   5
A.K.



________________________________
From: Cecilia Carmo <cecilia.carmo at ua.pt>
To: arun <smartpink111 at yahoo.com>
Sent: Friday, June 14, 2013 10:44 AM
Subject: me again




There are some matches that are missing. That is, it is possible to have more matches.
I'm sending you a script and the data.

Thank you.
Cecília

Reasonably Related Threads

Search for more reasonably related threads

R help - Jun 2013 - matched samples, dataframe, panel data

[R] matched samples, dataframe, panel data

[R] matched samples, dataframe, panel data

[R] matched samples, dataframe, panel data

[R] matched samples, dataframe, panel data

[R] matched samples, dataframe, panel data

Reasonably Related Threads