I keep getting these fixed-width format (FWF) data files where variables for a single subject are spread across multiple lines and each line has a different format (apparently created with SAS or SPSS). To read them I stole some stuff from `read.fwf' in base R and threw together the function included below. Please feel free to test this function with your own data and let me know of any problems. Also, is there another (better) way to do this in R? -- ___________________________________________________________ Neil E. Klepeis -- School of Public Health, UC Berkeley and Lawrence Berkeley National Laboratory, Berkeley, CA USA http://socrates.berkeley.edu/~nklepeis/R_PACKAGE/library/heR.Base/html/read.fwf.mult.html ---- read.fwf.mult <- function (file, widths, rows, sep = "\t", as.is = FALSE, skip = 0, row.names = NULL, col.names, n = -1, blank.lines.skip = FALSE, ...) { if (!is.list(widths) | length(widths) != rows | (!missing(col.names) & (length(col.names) != rows | !is.list(col.names)))) stop("`widths' and `col.names' (optional) should be lists containing column widths and names corresponding to each line of a group") if (!missing(widths) & !missing(col.names)) for (i in 1:rows) if (length(widths[[i]]) !length(col.names[[i]])) { cat("Mismatch between `widths' and `col.names' at line ", i, "\n") stop("Check `widths' and `col.names'.") } doone <- function(x) { x <- substring(x, first, last) x[nchar(x) == 0] <- "NA" x } blanks <- function(x) { b <- character(length = length(x)) for (i in 1:length(x)) if (x[i] > 0) b[i] <- paste(rep(" ", x[i]), collapse = "") b } FILE <- tempfile("Rfwf.") on.exit(unlink(FILE)) raw <- scan(file, what = "", sep = "\n", quote = "", quiet = TRUE, n = n, skip = skip, blank.lines.skip = blank.lines.skip) ngroups <- length(raw)%/%rows cat("Rows per group:", rows, "\n") cat("Number of total rows:", length(raw), "\n") cat("Number of groups:", ngroups, "\n") if (ngroups < length(raw)/rows) stop("Incomplete multi-line groups. Check input file.") f <- expand.grid(1:rows, 1:ngroups)[[2]] group.widths <- sapply(widths, sum) fix.widths <- function(y) { p <- group.widths - nchar(y) bp <- blanks(p) for (i in 1:length(y)) { if (p[i] > 0) { y[i] <- paste(y[i], bp[i], sep = "") } else if (p[i] < 0) { y[i] <- substr(y[i], 1, group.widths[i]) } } y } raw <- tapply(raw, INDEX = f, FUN = "fix.widths") raw <- lapply(raw, FUN = "paste", sep = "", collapse = "") widths <- unlist(widths) st <- c(1, 1 + cumsum(widths)) first <- st[-length(st)] last <- cumsum(widths) cat(file = FILE, sapply(raw, doone), sep = c(rep(sep, , length(widths) - 1), "\n")) col.names <- unlist(col.names) read.table(file = FILE, header = FALSE, sep = sep, as.is = as.is, row.names = row.names, col.names = col.names, quote = "", ...) } -.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.- r-help mailing list -- Read http://www.ci.tuwien.ac.at/~hornik/R/R-FAQ.html Send "info", "help", or "[un]subscribe" (in the "body", not the subject !) To: r-help-request at stat.math.ethz.ch _._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._._