Hello
I have the following problem : group the lines of a dataframe when no
information change (Matricule, Nom, Sexe, DateNaissance, Contrat, Pays)
and when the value of Debut of lines i = value Fin of lines i-1
I can obtain it with a do loop. Is it possible to avoid the loop ?
The dataframe initial is df1
dput(df1)
structure(list(Matricule = c(1L, 1L, 1L, 6L, 6L, 6L, 6L, 6L,
6L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L), Nom = c("VERON", "VERON",
"VERON", "BENARD",
"BENARD", "BENARD", "BENARD", "BENARD",
"BENARD", "DALNIC", "DALNIC",
"DALNIC", "DALNIC", "DALNIC", "DALNIC",
"DALNIC", "DALNIC", "DALNIC",
"FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
"FORNI", "FORNI"), Sexe = c("F?minin",
"F?minin", "F?minin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"F?minin", "F?minin", "F?minin",
"F?minin", "F?minin", "F?minin",
"F?minin", "F?minin", "F?minin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin"), DateNaissance
= c("02/09/1935",
"02/09/1935", "02/09/1935", "01/04/1935",
"01/04/1935", "01/04/1935",
"01/04/1935", "01/04/1935", "01/04/1935",
"19/02/1940", "19/02/1940",
"19/02/1940", "19/02/1940", "19/02/1940",
"19/02/1940", "19/02/1940",
"19/02/1940", "19/02/1940", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961"), contrat = c("CDI
commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
"CDI commun", "CDD d?tach? ext. Cirad", "CDD d?tach?
ext. Cirad",
"CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad",
"CDD d?tach? ext.
Cirad",
"CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad",
"CDD d?tach? ext.
Cirad",
"CDD d?tach? ext. Cirad", "CDI D?tach?s Autres", "CDI
D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
"CDI commun"), Pays = c("France", "France",
"France", "Philippines",
"Philippines", "Philippines", "France",
"France", "France", "France",
"France", "Martinique", "Martinique",
"Martinique", "Martinique",
"Martinique", "Martinique", "Martinique",
"Cameroun", "Cameroun",
"Cameroun", "Cameroun", "Cameroun",
"Cameroun", "Cameroun", "Cameroun",
"Cameroun", "France", "France",
"France", "France", "France",
"France", "France", "Congo", "Congo",
"Congo", "Congo", "Congo",
"Congo", "Gabon", "Gabon", "Gabon",
"Gabon", "Gabon", "Gabon",
"Congo", "Congo"), Debut = c("24/01/1995",
"01/05/1997", "31/12/1997",
"02/02/1995", "28/02/1995", "01/03/1995",
"13/03/1995", "01/01/1996",
"31/01/1996", "24/01/1995", "01/07/1995",
"01/09/1995", "01/07/1997",
"01/01/1998", "01/08/1998", "01/01/2000",
"17/01/2000", "29/02/2000",
"26/01/1995", "01/07/1996", "16/09/1997",
"01/01/1998", "01/07/1998",
"04/11/1999", "01/01/2001", "01/04/2001",
"31/08/2001", "01/09/2001",
"02/09/2001", "01/12/2001", "01/02/2003",
"01/04/2003", "01/01/2004",
"01/03/2004", "01/09/2004", "01/01/2005",
"01/04/2005", "28/10/2006",
"01/01/2007", "01/04/2007", "01/09/2007",
"01/01/2009", "01/04/2009",
"01/01/2010", "01/01/2011", "01/04/2011",
"05/09/2012", "01/01/2013"
), Fin = c("30/04/1997", "30/12/1997",
"31/12/1997", "27/02/1995",
"28/02/1995", "12/03/1995", "30/06/1995",
"30/01/1996", "31/01/1996",
"30/06/1995", "31/08/1995", "30/06/1997",
"31/12/1997", "31/07/1998",
"31/12/1999", "16/01/2000", "28/02/2000",
"29/02/2000", "30/06/1996",
"15/09/1997", "31/12/1997", "30/06/1998",
"03/11/1999", "31/12/2000",
"31/03/2001", "30/08/2001", "31/08/2001",
"01/09/2001", "30/11/2001",
"31/01/2003", "31/03/2003", "31/12/2003",
"29/02/2004", "31/08/2004",
"31/12/2004", "31/03/2005", "27/10/2006",
"31/12/2006", "31/03/2007",
"31/08/2007", "31/12/2008", "31/03/2009",
"31/12/2009", "31/12/2010",
"31/03/2011", "04/09/2012", "31/12/2012",
"31/12/4712")), .Names =
c("Matricule",
"Nom", "Sexe", "DateNaissance",
"contrat", "Pays", "Debut", "Fin"
), class = "data.frame", row.names = c(NA, -48L))
The dataframe to be obtained is df2
dput(df2)
structure(list(Mat = c(1L, 6L, 6L, 6L, 8L, 8L, 934L, 934L, 934L,
934L, 934L), Nom = c("VERON", "BENARD", "BENARD",
"BENARD", "DALNIC",
"DALNIC", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI"), Sexe = c("F?minin",
"Masculin", "Masculin", "Masculin",
"F?minin", "F?minin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin"), DateNaissance =
c("02/09/1935",
"01/04/1935", "01/04/1935", "01/04/1935",
"19/02/1940", "19/02/1940",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961"
), contrat = c("CDI commun", "CDI commun", "CDI
commun", "CDI commun",
"CDI commun", "CDI commun", "CDD d?tach? ext.
Cirad", "CDI D?tach?s
Autres",
"CDI D?tach?s Autres", "CDI commun", "CDI
commun"), Pays = c("France",
"Philippines", "France", "France",
"France", "Martinique", "Cameroun",
"France", "Congo", "Gabon", "Congo"),
Debut = c("24/01/1995",
"02/02/1995", "13/03/1995", "01/01/1996",
"24/01/1995", "01/09/1995",
"26/01/1995", "01/09/2001", "01/09/2004",
"01/09/2007", "05/09/2012"
), Fin = c("31/12/1997", "12/03/1995",
"30/06/1995", "31/01/1996",
"31/08/1995", "29/02/2000", "31/08/2001",
"31/08/2004", "31/08/2007",
"04/09/2012", "31/12/4712")), .Names = c("Mat",
"Nom", "Sexe",
"DateNaissance", "contrat", "Pays",
"Debut", "Fin"), class =
"data.frame", row.names = c(NA,
-11L))
Thank you for your help
--
Michel ARNAUD
Charg? de mission aupr?s du DRH
DGDRD-Drh - TA 174/04
Av Agropolis 34398 Montpellier cedex 5
tel : 04.67.61.75.38
fax : 04.67.61.57.87
port: 06.47.43.55.31
Hello,
My solution is missing a row, but maybe you can find some inspiration.
cols <- c("Matricule", "Nom", "Sexe",
"DateNaissance", "contrat", "Pays")
irow1 <- duplicated(df1[, cols])
irow2 <- c(FALSE, df1$Debut[-1] == df1$Fin[-nrow(df1)])
df3 <- df1[!irow1 & !irow2, ]
dim(df2); dim(df3) # df3 has one row less
df2; df3
Hope this helps,
Rui Barradas
Em 12-07-2013 20:45, Arnaud Michel escreveu:> Hello
>
> I have the following problem : group the lines of a dataframe when no
> information change (Matricule, Nom, Sexe, DateNaissance, Contrat, Pays)
> and when the value of Debut of lines i = value Fin of lines i-1
> I can obtain it with a do loop. Is it possible to avoid the loop ?
>
> The dataframe initial is df1
> dput(df1)
> structure(list(Matricule = c(1L, 1L, 1L, 6L, 6L, 6L, 6L, 6L,
> 6L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 934L, 934L, 934L, 934L,
> 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L,
> 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L,
> 934L, 934L, 934L, 934L), Nom = c("VERON", "VERON",
"VERON", "BENARD",
> "BENARD", "BENARD", "BENARD",
"BENARD", "BENARD", "DALNIC", "DALNIC",
> "DALNIC", "DALNIC", "DALNIC",
"DALNIC", "DALNIC", "DALNIC", "DALNIC",
> "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
> "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
> "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
> "FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
> "FORNI", "FORNI"), Sexe = c("F?minin",
"F?minin", "F?minin",
> "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
> "F?minin", "F?minin", "F?minin",
"F?minin", "F?minin", "F?minin",
> "F?minin", "F?minin", "F?minin",
"Masculin", "Masculin", "Masculin",
> "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
> "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
> "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
> "Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
> "Masculin", "Masculin", "Masculin"),
DateNaissance = c("02/09/1935",
> "02/09/1935", "02/09/1935", "01/04/1935",
"01/04/1935", "01/04/1935",
> "01/04/1935", "01/04/1935", "01/04/1935",
"19/02/1940", "19/02/1940",
> "19/02/1940", "19/02/1940", "19/02/1940",
"19/02/1940", "19/02/1940",
> "19/02/1940", "19/02/1940", "10/07/1961",
"10/07/1961", "10/07/1961",
> "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
> "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
> "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
> "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
> "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
> "10/07/1961", "10/07/1961"), contrat = c("CDI
commun", "CDI commun",
> "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
> "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
> "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
> "CDI commun", "CDD d?tach? ext. Cirad", "CDD
d?tach? ext. Cirad",
> "CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad",
"CDD d?tach? ext.
> Cirad",
> "CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad",
"CDD d?tach? ext.
> Cirad",
> "CDD d?tach? ext. Cirad", "CDI D?tach?s Autres",
"CDI D?tach?s Autres",
> "CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
D?tach?s Autres",
> "CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
D?tach?s Autres",
> "CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
D?tach?s Autres",
> "CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
commun", "CDI commun",
> "CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
> "CDI commun"), Pays = c("France", "France",
"France", "Philippines",
> "Philippines", "Philippines", "France",
"France", "France", "France",
> "France", "Martinique", "Martinique",
"Martinique", "Martinique",
> "Martinique", "Martinique", "Martinique",
"Cameroun", "Cameroun",
> "Cameroun", "Cameroun", "Cameroun",
"Cameroun", "Cameroun", "Cameroun",
> "Cameroun", "France", "France",
"France", "France", "France",
> "France", "France", "Congo",
"Congo", "Congo", "Congo", "Congo",
> "Congo", "Gabon", "Gabon", "Gabon",
"Gabon", "Gabon", "Gabon",
> "Congo", "Congo"), Debut = c("24/01/1995",
"01/05/1997", "31/12/1997",
> "02/02/1995", "28/02/1995", "01/03/1995",
"13/03/1995", "01/01/1996",
> "31/01/1996", "24/01/1995", "01/07/1995",
"01/09/1995", "01/07/1997",
> "01/01/1998", "01/08/1998", "01/01/2000",
"17/01/2000", "29/02/2000",
> "26/01/1995", "01/07/1996", "16/09/1997",
"01/01/1998", "01/07/1998",
> "04/11/1999", "01/01/2001", "01/04/2001",
"31/08/2001", "01/09/2001",
> "02/09/2001", "01/12/2001", "01/02/2003",
"01/04/2003", "01/01/2004",
> "01/03/2004", "01/09/2004", "01/01/2005",
"01/04/2005", "28/10/2006",
> "01/01/2007", "01/04/2007", "01/09/2007",
"01/01/2009", "01/04/2009",
> "01/01/2010", "01/01/2011", "01/04/2011",
"05/09/2012", "01/01/2013"
> ), Fin = c("30/04/1997", "30/12/1997",
"31/12/1997", "27/02/1995",
> "28/02/1995", "12/03/1995", "30/06/1995",
"30/01/1996", "31/01/1996",
> "30/06/1995", "31/08/1995", "30/06/1997",
"31/12/1997", "31/07/1998",
> "31/12/1999", "16/01/2000", "28/02/2000",
"29/02/2000", "30/06/1996",
> "15/09/1997", "31/12/1997", "30/06/1998",
"03/11/1999", "31/12/2000",
> "31/03/2001", "30/08/2001", "31/08/2001",
"01/09/2001", "30/11/2001",
> "31/01/2003", "31/03/2003", "31/12/2003",
"29/02/2004", "31/08/2004",
> "31/12/2004", "31/03/2005", "27/10/2006",
"31/12/2006", "31/03/2007",
> "31/08/2007", "31/12/2008", "31/03/2009",
"31/12/2009", "31/12/2010",
> "31/03/2011", "04/09/2012", "31/12/2012",
"31/12/4712")), .Names > c("Matricule",
> "Nom", "Sexe", "DateNaissance",
"contrat", "Pays", "Debut", "Fin"
> ), class = "data.frame", row.names = c(NA, -48L))
>
> The dataframe to be obtained is df2
> dput(df2)
> structure(list(Mat = c(1L, 6L, 6L, 6L, 8L, 8L, 934L, 934L, 934L,
> 934L, 934L), Nom = c("VERON", "BENARD",
"BENARD", "BENARD", "DALNIC",
> "DALNIC", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI"), Sexe =
c("F?minin",
> "Masculin", "Masculin", "Masculin",
"F?minin", "F?minin", "Masculin",
> "Masculin", "Masculin", "Masculin",
"Masculin"), DateNaissance > c("02/09/1935",
> "01/04/1935", "01/04/1935", "01/04/1935",
"19/02/1940", "19/02/1940",
> "10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961"
> ), contrat = c("CDI commun", "CDI commun", "CDI
commun", "CDI commun",
> "CDI commun", "CDI commun", "CDD d?tach? ext.
Cirad", "CDI D?tach?s
> Autres",
> "CDI D?tach?s Autres", "CDI commun", "CDI
commun"), Pays = c("France",
> "Philippines", "France", "France",
"France", "Martinique", "Cameroun",
> "France", "Congo", "Gabon",
"Congo"), Debut = c("24/01/1995",
> "02/02/1995", "13/03/1995", "01/01/1996",
"24/01/1995", "01/09/1995",
> "26/01/1995", "01/09/2001", "01/09/2004",
"01/09/2007", "05/09/2012"
> ), Fin = c("31/12/1997", "12/03/1995",
"30/06/1995", "31/01/1996",
> "31/08/1995", "29/02/2000", "31/08/2001",
"31/08/2004", "31/08/2007",
> "04/09/2012", "31/12/4712")), .Names =
c("Mat", "Nom", "Sexe",
> "DateNaissance", "contrat", "Pays",
"Debut", "Fin"), class > "data.frame",
row.names = c(NA,
> -11L))
>
> Thank you for your help
>
Here is how you can do it with the 'data.table' package:> require(data.table) > df1 <- data.table(df1) > result <- df1[+ , list(Debut = Debut[1L] # first entry + , Fin = Fin[1L] + ) + , keyby = c("Matricule", "Nom", "Sexe", "DateNaissance", "contrat", "Pays") + ]> resultMatricule Nom Sexe DateNaissance contrat Pays Debut Fin 1: 1 VERON Féminin 02/09/1935 CDI commun France 24/01/1995 30/04/1997 2: 6 BENARD Masculin 01/04/1935 CDI commun France 13/03/1995 30/06/1995 3: 6 BENARD Masculin 01/04/1935 CDI commun Philippines 02/02/1995 27/02/1995 4: 8 DALNIC Féminin 19/02/1940 CDI commun France 24/01/1995 30/06/1995 5: 8 DALNIC Féminin 19/02/1940 CDI commun Martinique 01/09/1995 30/06/1997 6: 934 FORNI Masculin 10/07/1961 CDD détaché ext. Cirad Cameroun 26/01/1995 30/06/1996 7: 934 FORNI Masculin 10/07/1961 CDI Détachés Autres Congo 01/09/2004 31/12/2004 8: 934 FORNI Masculin 10/07/1961 CDI Détachés Autres France 01/09/2001 01/09/2001 9: 934 FORNI Masculin 10/07/1961 CDI commun Congo 05/09/2012 31/12/2012 10: 934 FORNI Masculin 10/07/1961 CDI commun Gabon 01/09/2007 31/12/2008> >On Fri, Jul 12, 2013 at 3:45 PM, Arnaud Michel <michel.arnaud@cirad.fr>wrote:> Hello > > I have the following problem : group the lines of a dataframe when no > information change (Matricule, Nom, Sexe, DateNaissance, Contrat, Pays) and > when the value of Debut of lines i = value Fin of lines i-1 > I can obtain it with a do loop. Is it possible to avoid the loop ? > > The dataframe initial is df1 > dput(df1) > structure(list(Matricule = c(1L, 1L, 1L, 6L, 6L, 6L, 6L, 6L, > 6L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 934L, 934L, 934L, 934L, > 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, > 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, > 934L, 934L, 934L, 934L), Nom = c("VERON", "VERON", "VERON", "BENARD", > "BENARD", "BENARD", "BENARD", "BENARD", "BENARD", "DALNIC", "DALNIC", > "DALNIC", "DALNIC", "DALNIC", "DALNIC", "DALNIC", "DALNIC", "DALNIC", > "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", > "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", > "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", > "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI", > "FORNI", "FORNI"), Sexe = c("Féminin", "Féminin", "Féminin", > "Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin", > "Féminin", "Féminin", "Féminin", "Féminin", "Féminin", "Féminin", > "Féminin", "Féminin", "Féminin", "Masculin", "Masculin", "Masculin", > "Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin", > "Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin", > "Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin", > "Masculin", "Masculin", "Masculin", "Masculin", "Masculin", "Masculin", > "Masculin", "Masculin", "Masculin"), DateNaissance = c("02/09/1935", > "02/09/1935", "02/09/1935", "01/04/1935", "01/04/1935", "01/04/1935", > "01/04/1935", "01/04/1935", "01/04/1935", "19/02/1940", "19/02/1940", > "19/02/1940", "19/02/1940", "19/02/1940", "19/02/1940", "19/02/1940", > "19/02/1940", "19/02/1940", "10/07/1961", "10/07/1961", "10/07/1961", > "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", > "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", > "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", > "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", > "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", > "10/07/1961", "10/07/1961"), contrat = c("CDI commun", "CDI commun", > "CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun", > "CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun", > "CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun", > "CDI commun", "CDD détaché ext. Cirad", "CDD détaché ext. Cirad", > "CDD détaché ext. Cirad", "CDD détaché ext. Cirad", "CDD détaché ext. > Cirad", > "CDD détaché ext. Cirad", "CDD détaché ext. Cirad", "CDD détaché ext. > Cirad", > "CDD détaché ext. Cirad", "CDI Détachés Autres", "CDI Détachés Autres", > "CDI Détachés Autres", "CDI Détachés Autres", "CDI Détachés Autres", > "CDI Détachés Autres", "CDI Détachés Autres", "CDI Détachés Autres", > "CDI Détachés Autres", "CDI Détachés Autres", "CDI Détachés Autres", > "CDI Détachés Autres", "CDI Détachés Autres", "CDI commun", "CDI commun", > "CDI commun", "CDI commun", "CDI commun", "CDI commun", "CDI commun", > "CDI commun"), Pays = c("France", "France", "France", "Philippines", > "Philippines", "Philippines", "France", "France", "France", "France", > "France", "Martinique", "Martinique", "Martinique", "Martinique", > "Martinique", "Martinique", "Martinique", "Cameroun", "Cameroun", > "Cameroun", "Cameroun", "Cameroun", "Cameroun", "Cameroun", "Cameroun", > "Cameroun", "France", "France", "France", "France", "France", > "France", "France", "Congo", "Congo", "Congo", "Congo", "Congo", > "Congo", "Gabon", "Gabon", "Gabon", "Gabon", "Gabon", "Gabon", > "Congo", "Congo"), Debut = c("24/01/1995", "01/05/1997", "31/12/1997", > "02/02/1995", "28/02/1995", "01/03/1995", "13/03/1995", "01/01/1996", > "31/01/1996", "24/01/1995", "01/07/1995", "01/09/1995", "01/07/1997", > "01/01/1998", "01/08/1998", "01/01/2000", "17/01/2000", "29/02/2000", > "26/01/1995", "01/07/1996", "16/09/1997", "01/01/1998", "01/07/1998", > "04/11/1999", "01/01/2001", "01/04/2001", "31/08/2001", "01/09/2001", > "02/09/2001", "01/12/2001", "01/02/2003", "01/04/2003", "01/01/2004", > "01/03/2004", "01/09/2004", "01/01/2005", "01/04/2005", "28/10/2006", > "01/01/2007", "01/04/2007", "01/09/2007", "01/01/2009", "01/04/2009", > "01/01/2010", "01/01/2011", "01/04/2011", "05/09/2012", "01/01/2013" > ), Fin = c("30/04/1997", "30/12/1997", "31/12/1997", "27/02/1995", > "28/02/1995", "12/03/1995", "30/06/1995", "30/01/1996", "31/01/1996", > "30/06/1995", "31/08/1995", "30/06/1997", "31/12/1997", "31/07/1998", > "31/12/1999", "16/01/2000", "28/02/2000", "29/02/2000", "30/06/1996", > "15/09/1997", "31/12/1997", "30/06/1998", "03/11/1999", "31/12/2000", > "31/03/2001", "30/08/2001", "31/08/2001", "01/09/2001", "30/11/2001", > "31/01/2003", "31/03/2003", "31/12/2003", "29/02/2004", "31/08/2004", > "31/12/2004", "31/03/2005", "27/10/2006", "31/12/2006", "31/03/2007", > "31/08/2007", "31/12/2008", "31/03/2009", "31/12/2009", "31/12/2010", > "31/03/2011", "04/09/2012", "31/12/2012", "31/12/4712")), .Names > c("Matricule", > "Nom", "Sexe", "DateNaissance", "contrat", "Pays", "Debut", "Fin" > ), class = "data.frame", row.names = c(NA, -48L)) > > The dataframe to be obtained is df2 > dput(df2) > structure(list(Mat = c(1L, 6L, 6L, 6L, 8L, 8L, 934L, 934L, 934L, > 934L, 934L), Nom = c("VERON", "BENARD", "BENARD", "BENARD", "DALNIC", > "DALNIC", "FORNI", "FORNI", "FORNI", "FORNI", "FORNI"), Sexe = c("Féminin", > "Masculin", "Masculin", "Masculin", "Féminin", "Féminin", "Masculin", > "Masculin", "Masculin", "Masculin", "Masculin"), DateNaissance > c("02/09/1935", > "01/04/1935", "01/04/1935", "01/04/1935", "19/02/1940", "19/02/1940", > "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961", "10/07/1961" > ), contrat = c("CDI commun", "CDI commun", "CDI commun", "CDI commun", > "CDI commun", "CDI commun", "CDD détaché ext. Cirad", "CDI Détachés > Autres", > "CDI Détachés Autres", "CDI commun", "CDI commun"), Pays = c("France", > "Philippines", "France", "France", "France", "Martinique", "Cameroun", > "France", "Congo", "Gabon", "Congo"), Debut = c("24/01/1995", > "02/02/1995", "13/03/1995", "01/01/1996", "24/01/1995", "01/09/1995", > "26/01/1995", "01/09/2001", "01/09/2004", "01/09/2007", "05/09/2012" > ), Fin = c("31/12/1997", "12/03/1995", "30/06/1995", "31/01/1996", > "31/08/1995", "29/02/2000", "31/08/2001", "31/08/2004", "31/08/2007", > "04/09/2012", "31/12/4712")), .Names = c("Mat", "Nom", "Sexe", > "DateNaissance", "contrat", "Pays", "Debut", "Fin"), class = "data.frame", > row.names = c(NA, > -11L)) > > Thank you for your help > > -- > Michel ARNAUD > Chargé de mission auprès du DRH > DGDRD-Drh - TA 174/04 > Av Agropolis 34398 Montpellier cedex 5 > tel : 04.67.61.75.38 > fax : 04.67.61.57.87 > port: 06.47.43.55.31 > > ______________________________**________________ > R-help@r-project.org mailing list > https://stat.ethz.ch/mailman/**listinfo/r-help<https://stat.ethz.ch/mailman/listinfo/r-help> > PLEASE do read the posting guide http://www.R-project.org/** > posting-guide.html <http://www.R-project.org/posting-guide.html> > and provide commented, minimal, self-contained, reproducible code. >-- Jim Holtman Data Munger Guru What is the problem that you are trying to solve? Tell me what you want to do, not how you want to do it. [[alternative HTML version deleted]]
Hi,
"when the value of Debut of lines i = value Fin of lines i-1"
That part is not clear esp. when it is looked upon with the expected output
(df2).? Also, in your example dataset:
df1$contrat[grep("^CDD",df1$contrat)]
#[1] "CDD d?tach? ext. Cirad" "CDD d?tach? ext. Cirad"
"CDD d?tach? ext. Cirad"
#[4] "CDD d?tach? ext. Cirad" "CDD d?tach? ext.Cirad"?
"CDD d?tach? ext. Cirad"
#[7] "CDD d?tach? ext. Cirad" "CDD d?tach? ext.Cirad"?
"CDD d?tach? ext. Cirad"
##Looks like there are extra spaces in some of them.? I guess these are the same
df1$contrat[grep("^CDD",df1$contrat)]<- "CDD d?tach? ext.
Cirad"
I tried this:
indx<-as.numeric(interaction(df1[,1:6],drop=FALSE))
?df1New<- df1
res2<-unique(within(df1New,{Debut<-ave(seq_along(indx),indx,FUN=function(x)
Debut[head(x,1)]);Fin<- ave(seq_along(indx),indx,FUN=function(x)
Fin[tail(x,1)])}))
?row.names(res2)<- 1:nrow(res2)
res2[,c(1,2,7:8)]
?? Matricule??? Nom????? Debut??????? Fin
1????????? 1? VERON 24/01/1995 31/12/1997
2????????? 6 BENARD 02/02/1995 12/03/1995
3????????? 6 BENARD 13/03/1995 31/01/1996 ###here not correct
4????????? 8 DALNIC 24/01/1995 31/08/1995
5????????? 8 DALNIC 01/09/1995 29/02/2000
6??????? 934? FORNI 26/01/1995 31/08/2001
7??????? 934? FORNI 01/09/2001 31/08/2004
8??????? 934? FORNI 01/09/2004 31/08/2007
9??????? 934? FORNI 01/09/2007 04/09/2012
10?????? 934? FORNI 05/09/2012 31/12/4712
df2[,c(1,2,7:8)]
?? Mat??? Nom????? Debut??????? Fin
1??? 1? VERON 24/01/1995 31/12/1997
2??? 6 BENARD 02/02/1995 12/03/1995
3??? 6 BENARD 13/03/1995 30/06/1995
4??? 6 BENARD 01/01/1996 31/01/1996 #missing this row
5??? 8 DALNIC 24/01/1995 31/08/1995
6??? 8 DALNIC 01/09/1995 29/02/2000
7? 934? FORNI 26/01/1995 31/08/2001
8? 934? FORNI 01/09/2001 31/08/2004
9? 934? FORNI 01/09/2004 31/08/2007
10 934? FORNI 01/09/2007 04/09/2012
11 934? FORNI 05/09/2012 31/12/4712
Here, the dates look similar to the ones on df2 except for one row in df2.
A.K.
----- Original Message -----
From: Arnaud Michel <michel.arnaud at cirad.fr>
To: R help <r-help at r-project.org>
Cc:
Sent: Friday, July 12, 2013 3:45 PM
Subject: [R] simplify a dataframe
Hello
I have the following problem : group the lines of a dataframe when no
information change (Matricule, Nom, Sexe, DateNaissance, Contrat, Pays)
and when the value of Debut of lines i = value Fin of lines i-1
I can obtain it with a do loop. Is it possible to avoid the loop ?
The dataframe initial is df1
dput(df1)
structure(list(Matricule = c(1L, 1L, 1L, 6L, 6L, 6L, 6L, 6L,
6L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L, 934L,
934L, 934L, 934L, 934L), Nom = c("VERON", "VERON",
"VERON", "BENARD",
"BENARD", "BENARD", "BENARD", "BENARD",
"BENARD", "DALNIC", "DALNIC",
"DALNIC", "DALNIC", "DALNIC", "DALNIC",
"DALNIC", "DALNIC", "DALNIC",
"FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI", "FORNI",
"FORNI", "FORNI"), Sexe = c("F?minin",
"F?minin", "F?minin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"F?minin", "F?minin", "F?minin",
"F?minin", "F?minin", "F?minin",
"F?minin", "F?minin", "F?minin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin", "Masculin", "Masculin"), DateNaissance
= c("02/09/1935",
"02/09/1935", "02/09/1935", "01/04/1935",
"01/04/1935", "01/04/1935",
"01/04/1935", "01/04/1935", "01/04/1935",
"19/02/1940", "19/02/1940",
"19/02/1940", "19/02/1940", "19/02/1940",
"19/02/1940", "19/02/1940",
"19/02/1940", "19/02/1940", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961"), contrat = c("CDI
commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
"CDI commun", "CDD d?tach? ext. Cirad", "CDD d?tach?
ext. Cirad",
"CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad",
"CDD d?tach? ext.
Cirad",
"CDD d?tach? ext. Cirad", "CDD d?tach? ext. Cirad",
"CDD d?tach? ext.
Cirad",
"CDD d?tach? ext. Cirad", "CDI D?tach?s Autres", "CDI
D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
D?tach?s Autres",
"CDI D?tach?s Autres", "CDI D?tach?s Autres", "CDI
commun", "CDI commun",
"CDI commun", "CDI commun", "CDI commun",
"CDI commun", "CDI commun",
"CDI commun"), Pays = c("France", "France",
"France", "Philippines",
"Philippines", "Philippines", "France",
"France", "France", "France",
"France", "Martinique", "Martinique",
"Martinique", "Martinique",
"Martinique", "Martinique", "Martinique",
"Cameroun", "Cameroun",
"Cameroun", "Cameroun", "Cameroun",
"Cameroun", "Cameroun", "Cameroun",
"Cameroun", "France", "France",
"France", "France", "France",
"France", "France", "Congo", "Congo",
"Congo", "Congo", "Congo",
"Congo", "Gabon", "Gabon", "Gabon",
"Gabon", "Gabon", "Gabon",
"Congo", "Congo"), Debut = c("24/01/1995",
"01/05/1997", "31/12/1997",
"02/02/1995", "28/02/1995", "01/03/1995",
"13/03/1995", "01/01/1996",
"31/01/1996", "24/01/1995", "01/07/1995",
"01/09/1995", "01/07/1997",
"01/01/1998", "01/08/1998", "01/01/2000",
"17/01/2000", "29/02/2000",
"26/01/1995", "01/07/1996", "16/09/1997",
"01/01/1998", "01/07/1998",
"04/11/1999", "01/01/2001", "01/04/2001",
"31/08/2001", "01/09/2001",
"02/09/2001", "01/12/2001", "01/02/2003",
"01/04/2003", "01/01/2004",
"01/03/2004", "01/09/2004", "01/01/2005",
"01/04/2005", "28/10/2006",
"01/01/2007", "01/04/2007", "01/09/2007",
"01/01/2009", "01/04/2009",
"01/01/2010", "01/01/2011", "01/04/2011",
"05/09/2012", "01/01/2013"
), Fin = c("30/04/1997", "30/12/1997",
"31/12/1997", "27/02/1995",
"28/02/1995", "12/03/1995", "30/06/1995",
"30/01/1996", "31/01/1996",
"30/06/1995", "31/08/1995", "30/06/1997",
"31/12/1997", "31/07/1998",
"31/12/1999", "16/01/2000", "28/02/2000",
"29/02/2000", "30/06/1996",
"15/09/1997", "31/12/1997", "30/06/1998",
"03/11/1999", "31/12/2000",
"31/03/2001", "30/08/2001", "31/08/2001",
"01/09/2001", "30/11/2001",
"31/01/2003", "31/03/2003", "31/12/2003",
"29/02/2004", "31/08/2004",
"31/12/2004", "31/03/2005", "27/10/2006",
"31/12/2006", "31/03/2007",
"31/08/2007", "31/12/2008", "31/03/2009",
"31/12/2009", "31/12/2010",
"31/03/2011", "04/09/2012", "31/12/2012",
"31/12/4712")), .Names =
c("Matricule",
"Nom", "Sexe", "DateNaissance",
"contrat", "Pays", "Debut", "Fin"
), class = "data.frame", row.names = c(NA, -48L))
The dataframe to be obtained is df2
dput(df2)
structure(list(Mat = c(1L, 6L, 6L, 6L, 8L, 8L, 934L, 934L, 934L,
934L, 934L), Nom = c("VERON", "BENARD", "BENARD",
"BENARD", "DALNIC",
"DALNIC", "FORNI", "FORNI", "FORNI",
"FORNI", "FORNI"), Sexe = c("F?minin",
"Masculin", "Masculin", "Masculin",
"F?minin", "F?minin", "Masculin",
"Masculin", "Masculin", "Masculin",
"Masculin"), DateNaissance =
c("02/09/1935",
"01/04/1935", "01/04/1935", "01/04/1935",
"19/02/1940", "19/02/1940",
"10/07/1961", "10/07/1961", "10/07/1961",
"10/07/1961", "10/07/1961"
), contrat = c("CDI commun", "CDI commun", "CDI
commun", "CDI commun",
"CDI commun", "CDI commun", "CDD d?tach? ext.
Cirad", "CDI D?tach?s
Autres",
"CDI D?tach?s Autres", "CDI commun", "CDI
commun"), Pays = c("France",
"Philippines", "France", "France",
"France", "Martinique", "Cameroun",
"France", "Congo", "Gabon", "Congo"),
Debut = c("24/01/1995",
"02/02/1995", "13/03/1995", "01/01/1996",
"24/01/1995", "01/09/1995",
"26/01/1995", "01/09/2001", "01/09/2004",
"01/09/2007", "05/09/2012"
), Fin = c("31/12/1997", "12/03/1995",
"30/06/1995", "31/01/1996",
"31/08/1995", "29/02/2000", "31/08/2001",
"31/08/2004", "31/08/2007",
"04/09/2012", "31/12/4712")), .Names = c("Mat",
"Nom", "Sexe",
"DateNaissance", "contrat", "Pays",
"Debut", "Fin"), class =
"data.frame", row.names = c(NA,
-11L))
Thank you for your help
--
Michel ARNAUD
Charg? de mission aupr?s du DRH
DGDRD-Drh - TA 174/04
Av Agropolis 34398 Montpellier cedex 5
tel : 04.67.61.75.38
fax : 04.67.61.57.87
port: 06.47.43.55.31
______________________________________________
R-help at r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.