I keep getting these fixed-width format (FWF) data files where variables
for a single subject are spread across multiple lines and each line has
a different format (apparently created with SAS or SPSS). To read them
I stole some stuff from `read.fwf' in base R and threw together the
function included below.
Please feel free to test this function with your own data and let me
know of any problems.
Also, is there another (better) way to do this in R?
--
___________________________________________________________
Neil E. Klepeis -- School of Public Health, UC Berkeley and
Lawrence Berkeley National Laboratory, Berkeley, CA USA
http://socrates.berkeley.edu/~nklepeis/R_PACKAGE/library/heR.Base/html/read.fwf.mult.html
----
read.fwf.mult <-
function (file, widths, rows, sep = "\t", as.is = FALSE, skip = 0,
row.names = NULL, col.names, n = -1, blank.lines.skip = FALSE,
...)
{
if (!is.list(widths) | length(widths) != rows | (!missing(col.names)
&
(length(col.names) != rows | !is.list(col.names))))
stop("`widths' and `col.names' (optional) should be lists
containing column widths and names corresponding to each line of a
group")
if (!missing(widths) & !missing(col.names))
for (i in 1:rows) if (length(widths[[i]]) !length(col.names[[i]])) {
cat("Mismatch between `widths' and `col.names' at line
",
i, "\n")
stop("Check `widths' and `col.names'.")
}
doone <- function(x) {
x <- substring(x, first, last)
x[nchar(x) == 0] <- "NA"
x
}
blanks <- function(x) {
b <- character(length = length(x))
for (i in 1:length(x)) if (x[i] > 0)
b[i] <- paste(rep(" ", x[i]), collapse = "")
b
}
FILE <- tempfile("Rfwf.")
on.exit(unlink(FILE))
raw <- scan(file, what = "", sep = "\n", quote =
"", quiet = TRUE,
n = n, skip = skip, blank.lines.skip = blank.lines.skip)
ngroups <- length(raw)%/%rows
cat("Rows per group:", rows, "\n")
cat("Number of total rows:", length(raw), "\n")
cat("Number of groups:", ngroups, "\n")
if (ngroups < length(raw)/rows)
stop("Incomplete multi-line groups. Check input file.")
f <- expand.grid(1:rows, 1:ngroups)[[2]]
group.widths <- sapply(widths, sum)
fix.widths <- function(y) {
p <- group.widths - nchar(y)
bp <- blanks(p)
for (i in 1:length(y)) {
if (p[i] > 0) {
y[i] <- paste(y[i], bp[i], sep = "")
}
else if (p[i] < 0) {
y[i] <- substr(y[i], 1, group.widths[i])
}
}
y
}
raw <- tapply(raw, INDEX = f, FUN = "fix.widths")
raw <- lapply(raw, FUN = "paste", sep = "", collapse
= "")
widths <- unlist(widths)
st <- c(1, 1 + cumsum(widths))
first <- st[-length(st)]
last <- cumsum(widths)
cat(file = FILE, sapply(raw, doone), sep = c(rep(sep, ,
length(widths) -
1), "\n"))
col.names <- unlist(col.names)
read.table(file = FILE, header = FALSE, sep = sep, as.is = as.is,
row.names = row.names, col.names = col.names, quote = "",
...)
}
-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-
r-help mailing list -- Read http://www.ci.tuwien.ac.at/~hornik/R/R-FAQ.html
Send "info", "help", or "[un]subscribe"
(in the "body", not the subject !) To: r-help-request at
stat.math.ethz.ch
_._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._