Martin Møller Skarbiniks Pedersen
2017-Nov-30 14:27 UTC
[R] Help avoiding setting column type two times
Hi,
I think and hope this is a good place to ask for a code review as an R
beginner?
I have made an R script which generates a dataset based on the 2009 Danish
referendum, and it does work.
But I think the code could be better, and I would appreciate any comments on
how the code can be improved.
At the very least, I would like to know how I can avoid converting several of
the columns to factors at the end of the code.
Description of the code:
It reads a lot of xml-files from ../raw/ and saves a data.frame with
information
from these xml-files.
In the ../raw/ directory I have placed the xml-files which I got from
"Statistics Denmark".
I have also put these xml-files on my website, and they can be downloaded
freely from http://20dage.dk/R/referendum-2009/raw.tar.gz
The code is below but I have also put the code at this place:
http://20dage.dk/R/referendum-2009/convert_from_xml.R
Best Regards
Martin M. S. Pedersen
-------
library(xml2)
#' Convert one result XML file into a data frame
#'
#' Reads the XML document at `url`, takes the first `Sted` node (its `Type`
#' attribute and its text), every `Parti` node (attributes `Id`, `Bogstav`,
#' `Navn`, `StemmerAntal`) and the first `Stemmeberettigede`,
#' `DeltagelsePct`, `IAltGyldigeStemmer`, `BlankeStemmer` and
#' `AndreUgyldigeStemmer` nodes.
#'
#' @param url Path or URL of the XML file; passed unchanged to `read_xml()`.
#' @return A data frame with one row per `Parti` node. The per-file values
#'   (everything not read from a `Parti` node) are repeated on every row.
#'   `StedType`, `StedTekst`, `PartiId`, `PartiBogstav` and `PartiNavn` are
#'   character, `DeltagelsePct` is double, the remaining columns are integer.
#' @details Signals an error when the document contains no `Parti` node.
convert_one_file <- function(url) {
  x <- read_xml(url)

  Sted <- xml_find_first(x, ".//Sted")
  Parti <- xml_find_all(x, ".//Parti")

  # Without any Parti node there are no rows to build; fail loudly instead of
  # returning a frame whose columns do not match those of the other files.
  if (length(Parti) == 0L) {
    stop("no Parti nodes found in ", url, call. = FALSE)
  }

  # Build the data frame directly from named vectors. Going through cbind()
  # would create a matrix first and thereby coerce every column to character.
  # The length-one per-file values are recycled to one row per Parti node.
  data.frame(
    StedType = xml_attr(Sted, "Type"),
    StedTekst = xml_text(Sted),
    PartiId = xml_attr(Parti, "Id"),
    PartiBogstav = xml_attr(Parti, "Bogstav"),
    PartiNavn = xml_attr(Parti, "Navn"),
    # xml_attr() always returns character, so parse the count explicitly.
    StemmerAntal = as.integer(xml_attr(Parti, "StemmerAntal")),
    Stemmeberettigede = xml_integer(
      xml_find_first(x, ".//Stemmeberettigede")
    ),
    DeltagelsePct = xml_double(xml_find_first(x, ".//DeltagelsePct")),
    IAltGyldigeStemmer = xml_integer(
      xml_find_first(x, ".//IAltGyldigeStemmer")
    ),
    BlankeStemmer = xml_integer(xml_find_first(x, ".//BlankeStemmer")),
    AndreUgyldigeStemmer = xml_integer(
      xml_find_first(x, ".//AndreUgyldigeStemmer")
    ),
    stringsAsFactors = FALSE
  )
}
# Driver: convert every "fintal_*" file found in ../raw with
# convert_one_file(), combine the per-file data frames, fix the column types
# and save the result.
raw_path <- "../raw"
filenames <- dir(path = raw_path, pattern = "fintal_.*", full.names = TRUE)

# Column groups, so that each column's type is stated exactly once.
factor_cols <- c("StedType", "PartiId", "PartiBogstav", "PartiNavn")
integer_cols <- c(
  "StemmerAntal", "Stemmeberettigede", "IAltGyldigeStemmer",
  "BlankeStemmer", "AndreUgyldigeStemmer"
)
numeric_cols <- "DeltagelsePct"

# Convert each file on its own. A file that fails is reported and yields NULL
# so that the remaining files are still processed. Iterating over the vector
# itself (rather than 1:length(filenames)) does nothing when no file matches.
per_file <- lapply(filenames, function(path) {
  tryCatch(
    convert_one_file(path),
    error = function(e) {
      cat(paste0(path, " failed:\n", e, "\n"))
      NULL
    }
  )
})
per_file <- Filter(Negate(is.null), per_file)

if (length(per_file) > 0) {
  # Bind once at the end instead of growing the data frame inside a loop.
  result <- do.call(rbind, per_file)
} else {
  # Nothing could be converted: keep an empty frame with the expected columns.
  # (Such a zero-row frame is only useful as a fallback; rbind() drops
  # zero-row data frames, so it cannot impose column types on later rows.)
  result <- data.frame(
    StedType = factor(),
    StedTekst = character(),
    PartiId = factor(),
    PartiBogstav = factor(),
    PartiNavn = factor(),
    StemmerAntal = integer(),
    Stemmeberettigede = integer(),
    DeltagelsePct = numeric(),
    IAltGyldigeStemmer = integer(),
    BlankeStemmer = integer(),
    AndreUgyldigeStemmer = integer(),
    stringsAsFactors = FALSE
  )
}

# Set the final column types in one place. These conversions are harmless
# when a column already has the target type.
result[factor_cols] <- lapply(result[factor_cols], as.factor)
result[integer_cols] <- lapply(result[integer_cols], as.integer)
result[numeric_cols] <- lapply(result[numeric_cols], as.numeric)

str(result)
save(result, file = "folkeafstemning2009.Rdata")
[[alternative HTML version deleted]]
See below. Am 30.11.2017 um 15:27 schrieb Martin M?ller Skarbiniks Pedersen:> Hi, > I think and hope this a good place to ask for code review for a R > beginners? > > I have made a R script which generates a dataset based on 2009 danish > referendum and it does work. > > But I think the code could be better and I would any comments how the > code can be improved. > At least I would like to know how I avoid converting several of the > columns to factors in the end of the code? > > Description of the code: > > It reads a lot of xml-files from ../raw/ and saves a data.frame with > information > from these xml-files. > > In the ../raw/ directiory I have placed the xml-files which I got from > "Statistics Denmark" > I have also put these xml-files on my website and they can be download > freely from http://20dage.dk/R/referendum-2009/raw.tar.gz > > The code is below but I have also put the code at this place: > http://20dage.dk/R/referendum-2009/convert_from_xml.R > > Best Regards > Martin M. S. 
Pedersen > > ------- > library(xml2) > > convert_one_file <- function(url) { > x <- read_xml(url) > > Sted <- xml_find_first(x, ".//Sted") > StedType <- xml_attr(Sted, "Type") > StedTekst <- xml_text(Sted) > > Parti <- xml_find_all(x, ".//Parti") > PartiId <- xml_attr(Parti, "Id") > PartiBogstav <- xml_attr(Parti, "Bogstav") > PartiNavn <- xml_attr(Parti, "Navn") > > > StemmerAntal <- xml_attr(Parti, "StemmerAntal") > Stemmeberettigede <- xml_integer(xml_find_first(x, > ".//Stemmeberettigede")) > DeltagelsePct <- xml_double(xml_find_first(x, ".//DeltagelsePct")) > IAltGyldigeStemmer <- xml_integer(xml_find_first(x, > ".//IAltGyldigeStemmer")) > BlankeStemmer <- xml_integer(xml_find_first(x, ".//BlankeStemmer")) > AndreUgyldigeStemmer <- xml_integer(xml_find_first(x, > ".//AndreUgyldigeStemmer")) > > data.frame(cbind(StedType, StedTekst, PartiId, PartiBogstav, PartiNavn, > StemmerAntal, Stemmeberettigede, DeltagelsePct, > IAltGyldigeStemmer, > BlankeStemmer, AndreUgyldigeStemmer), stringsAsFactors = FALSE) > } > > raw_path <- "../raw" > filenames <- dir(path = raw_path, pattern = "fintal_.*", full.names = T) > > result <- data.frame(StedType = factor(), > StedTekst = character(), > PartiId = factor(), > PartiBogstav = factor(), > PartiNavn = factor(), > StemmerAntal = integer(), > Stemmeberettigede = integer(), > DeltagelsePct = numeric(), > IAltGyldigeStemmer = integer(), > BlankeStemmer = integer(), > AndreUgyldigeStemmer = integer(), > stringsAsFactors = FALSE) > > for (i in 1:length(filenames)) { > #cat(paste0(filenames[i],"\n")) > returnCode <- tryCatch({ > result <- rbind(result, convert_one_file(filenames[i])) > }, error = function(e) { > cat(paste0(filenames[i]," failed:\n",e,"\n")) > }) > } > > result$StedType <- as.factor(result$StedType) > result$PartiId <- as.factor(result$PartiId) > result$PartiBogstav <- as.factor(result$PartiBogstav) > result$PartiNavn <- as.factor(result$PartiNavn) > result$StemmerAntal <- as.integer(result$StemmerAntal) > 
result$Stemmeberettigede <- as.integer(result$Stemmeberettigede) > result$DeltagelsePct <- as.numeric(result$DeltagelsePct) > result$IAltGyldigeStemmer <- as.integer(result$IAltGyldigeStemmer) > result$BlankeStemmer <- as.integer(result$BlankeStemmer) > result$AndreUgyldigeStemmer <- as.integer(result$AndreUgyldigeStemmer) > str(result) > save(result, file = "folkeafstemning2009.Rdata")Maybe two loops simplify this a little bit for you (not tested): for(v in c("StedType", <etc.>)) result[[v]] <- factor(result[[v]]) for(v in c("StemmerAntal", <etc.>)) result[[v]] <- as.integer(result[[v]]) Hth -- Gerrit --------------------------------------------------------------------- Dr. Gerrit Eichner Mathematical Institute, Room 212 gerrit.eichner at math.uni-giessen.de Justus-Liebig-University Giessen Tel: +49-(0)641-99-32104 Arndtstr. 2, 35392 Giessen, Germany Fax: +49-(0)641-99-32109 http://www.uni-giessen.de/eichner ---------------------------------------------------------------------> > [[alternative HTML version deleted]] > > ______________________________________________ > R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code. >
> On Nov 30, 2017, at 6:27 AM, Martin M?ller Skarbiniks Pedersen <traxplayer at gmail.com> wrote: > > Hi, > I think and hope this a good place to ask for code review for a R > beginners? > > I have made a R script which generates a dataset based on 2009 danish > referendum and it does work. > > But I think the code could be better and I would any comments how the > code can be improved. > At least I would like to know how I avoid converting several of the > columns to factors in the end of the code? > > Description of the code: > > It reads a lot of xml-files from ../raw/ and saves a data.frame with > information > from these xml-files. > > In the ../raw/ directiory I have placed the xml-files which I got from > "Statistics Denmark" > I have also put these xml-files on my website and they can be download > freely from http://20dage.dk/R/referendum-2009/raw.tar.gz > > The code is below but I have also put the code at this place: > http://20dage.dk/R/referendum-2009/convert_from_xml.R > > Best Regards > Martin M. S. 
Pedersen > > ------- > library(xml2) > > convert_one_file <- function(url) { > x <- read_xml(url) > > Sted <- xml_find_first(x, ".//Sted") > StedType <- xml_attr(Sted, "Type") > StedTekst <- xml_text(Sted) > > Parti <- xml_find_all(x, ".//Parti") > PartiId <- xml_attr(Parti, "Id") > PartiBogstav <- xml_attr(Parti, "Bogstav") > PartiNavn <- xml_attr(Parti, "Navn") > > > StemmerAntal <- xml_attr(Parti, "StemmerAntal") > Stemmeberettigede <- xml_integer(xml_find_first(x, > ".//Stemmeberettigede")) > DeltagelsePct <- xml_double(xml_find_first(x, ".//DeltagelsePct")) > IAltGyldigeStemmer <- xml_integer(xml_find_first(x, > ".//IAltGyldigeStemmer")) > BlankeStemmer <- xml_integer(xml_find_first(x, ".//BlankeStemmer")) > AndreUgyldigeStemmer <- xml_integer(xml_find_first(x, > ".//AndreUgyldigeStemmer")) > > data.frame(cbind(StedType, StedTekst, PartiId, PartiBogstav, PartiNavn, > StemmerAntal, Stemmeberettigede, DeltagelsePct, > IAltGyldigeStemmer, > BlankeStemmer, AndreUgyldigeStemmer), stringsAsFactors = FALSE)The construction `data.frame(cbind( ...` is a serious source of potential error. The cbind coerces to matrix class which also then coerces to a single atomic class, either numeric or character. Factors loose all their meaning. Dates get messed up. Error ensues. 
Better would be: data.frame( StedType, StedTekst, PartiId, PartiBogstav, PartiNavn, StemmerAntal, Stemmeberettigede, DeltagelsePct, IAltGyldigeStemmer, BlankeStemmer, AndreUgyldigeStemmer, stringsAsFactors = FALSE) -- David.> } > > raw_path <- "../raw" > filenames <- dir(path = raw_path, pattern = "fintal_.*", full.names = T) > > result <- data.frame(StedType = factor(), > StedTekst = character(), > PartiId = factor(), > PartiBogstav = factor(), > PartiNavn = factor(), > StemmerAntal = integer(), > Stemmeberettigede = integer(), > DeltagelsePct = numeric(), > IAltGyldigeStemmer = integer(), > BlankeStemmer = integer(), > AndreUgyldigeStemmer = integer(), > stringsAsFactors = FALSE) > > for (i in 1:length(filenames)) { > #cat(paste0(filenames[i],"\n")) > returnCode <- tryCatch({ > result <- rbind(result, convert_one_file(filenames[i])) > }, error = function(e) { > cat(paste0(filenames[i]," failed:\n",e,"\n")) > }) > } > > result$StedType <- as.factor(result$StedType) > result$PartiId <- as.factor(result$PartiId) > result$PartiBogstav <- as.factor(result$PartiBogstav) > result$PartiNavn <- as.factor(result$PartiNavn) > result$StemmerAntal <- as.integer(result$StemmerAntal) > result$Stemmeberettigede <- as.integer(result$Stemmeberettigede) > result$DeltagelsePct <- as.numeric(result$DeltagelsePct) > result$IAltGyldigeStemmer <- as.integer(result$IAltGyldigeStemmer) > result$BlankeStemmer <- as.integer(result$BlankeStemmer) > result$AndreUgyldigeStemmer <- as.integer(result$AndreUgyldigeStemmer) > str(result) > save(result, file = "folkeafstemning2009.Rdata") > > [[alternative HTML version deleted]] > > ______________________________________________ > R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code.David Winsemius Alameda, CA, USA 'Any technology 
distinguishable from magic is insufficiently advanced.' -Gehm's Corollary to Clarke's Third Law