Hello, we have 80 text files with matrices. Each matrix represents a map (rows for latitude and columns for longitude), the 80 maps represent steps in time. In addition, we have a vector x of length 80. We would like to compute a regression between matrices (response through time) and x and create maps representing coefficients, r2 etc. Problem: the 80 matrices are of the size 4000 x 3500 and we were running out of memory. We computed line by line and the results for each line were appended to output grids. This works. But - for each line, 80 text files must be scanned and output must be written. And there are several for-loops involved. This takes a lot of time (about a week). I read the contributions related to speeding up code and maybe vectorizing parts of the procedure could help a bit. However, I am a neophyte (as you may see from the code below) and did not find a way by now. I would appreciate very much any suggestions for speeding up the procedure. Thanks, Zarza The code (running but sloooooow): ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ regrid <- function (infolder, x, outfolder) { # List of input files setwd (infolder) filelist <- dir (pattern=".*.asc$", full.names = F) # Dimensions (making use of the header information coming with # the .asc-input files, ESRI-format) hd <- read.table (filelist [1], nrows = 6) cols <- hd[1,2] rows <- hd[2,2] times <- length (filelist) items <- 4 + ncol (x) # Prepare output out1 <- matrix (numeric (times * cols), ncol = cols) out2 <- matrix (numeric (items * cols), ncol = items) out3 <- as.numeric (items) # Prepare .asc-files filenames <- c("R2", "adj.R2", "p", "b0", colnames (x)) for (i in 1:items) { write.table (hd, file = paste (outfolder, filenames [i],".asc",sep =""), quote=F, row.names=F, col.names=F) } rm (hd) # Prepare regression xnam <- paste ("x[,", 1:(ncol(x)),"]", sep="") form <- paste("y ~ ", paste(xnam, collapse="+")) rm (xnam) # Loop through rows for (j in 1:rows) { getgrid <- function (j) { print (paste ("Row",j,"/",rows),quote = F) # Read out multi-temporal response values for one grid-row of cells for (k in 1:times) { getslice <- function (k) { values <- scan (filelist [k], what=0, na.strings = "-9999", skip = (5 + j), nlines = 1, nmax = cols, quiet=T) values } out1[k,] <- getslice (k) } # Regression for (l in 1:cols) { y <- as.vector (out1 [,l]) if (length (y) > length (na.omit (y))) { setNA <- function (l) { NAs <- rep (NA, length (out3)) NAs } out2[l,] <- setNA (l) } else { regression <- function (l) { model <- lm (as.formula(form)) out3[1] <- summary (model)$r.squared out3[2] <- summary (model)$adj.r.squared f <- summary (model)$fstatistic out3[3] <- 1-pf(f[1],f[2],f[3]) out3[4:items] <- coef(model)[1:(1 + ncol(x))] out3 } out2[l,] <- regression (l) } } out2 } fillrow <- getgrid (j) # Append results to output files for (m in 1:items) { write.table (t(fillrow [,m]), file = paste (outfolder, filenames [m], ".asc", sep =""), append=T, quote=F, na = as.character (-9999), row.names = F, col.names = F, dec=".") } } } -- View this message in context: http://www.nabble.com/Lots-of-huge-matrices%2C-for-loops%2C-speed-tp18303230p18303230.html Sent from the R help mailing list archive at Nabble.com.
Open all 80 files at once and then start reading a line from each one in turn. On my Windows machine I can have 80 open at once. You are spending all your time 'skipping' records. Code might look something like this: # open them all files <- list() for (i in fileList) files[[i]] <- file(i, 'r') # now process each line for (lines in 1:3500){ input <- list() for (i in 1:80) input[[i]] <- scan(files[[i]], what=0, n=1) # now you have the 80 lines, process them ...... } On Sun, Jul 6, 2008 at 11:39 AM, Zarza <s.schmidtlein at uni-bonn.de> wrote:> > Hello, > we have 80 text files with matrices. Each matrix represents a map (rows for > latitude and columns for longitude), the 80 maps represent steps in time. In > addition, we have a vector x of length 80. We would like to compute a > regression between matrices (response through time) and x and create maps > representing coefficients, r2 etc. Problem: the 80 matrices are of the size > 4000 x 3500 and we were running out of memory. We computed line by line and > the results for each line were appended to output grids. This works. But - > for each line, 80 text files must be scanned and output must be written. And > there are several for-loops involved. This takes a lot of time (about a > week). I read the contributions related to speeding up code and maybe > vectorizing parts of the procedure could help a bit. However, I am a > neophyte (as you may see from the code below) and did not find a way by now. > I would appreciate very much any suggestions for speeding up the procedure. > Thanks, Zarza > > > > > The code (running but sloooooow): > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > regrid <- function (infolder, x, outfolder) { > > # List of input files > setwd (infolder) > filelist <- dir (pattern=".*.asc$", full.names = F) > > # Dimensions (making use of the header information coming with > # the .asc-input files, ESRI-format) > hd <- read.table (filelist [1], nrows = 6) > cols <- hd[1,2] > rows <- hd[2,2] > times <- length (filelist) > items <- 4 + ncol (x) > > # Prepare output > out1 <- matrix (numeric (times * cols), ncol = cols) > out2 <- matrix (numeric (items * cols), ncol = items) > out3 <- as.numeric (items) > > # Prepare .asc-files > filenames <- c("R2", "adj.R2", "p", "b0", colnames (x)) > for (i in 1:items) { > write.table (hd, file = paste (outfolder, filenames [i],".asc",sep =""), > quote=F, row.names=F, col.names=F) } > rm (hd) > > # Prepare regression > xnam <- paste ("x[,", 1:(ncol(x)),"]", sep="") > form <- paste("y ~ ", paste(xnam, collapse="+")) > rm (xnam) > > # Loop through rows > for (j in 1:rows) { > getgrid <- function (j) { > print (paste ("Row",j,"/",rows),quote = F) > > # Read out multi-temporal response values for one grid-row of cells > for (k in 1:times) > { > getslice <- function (k) { > values <- scan (filelist [k], what=0, na.strings = "-9999", > skip = (5 + j), nlines = 1, nmax = cols, quiet=T) > values } > out1[k,] <- getslice (k) > } > > # Regression > for (l in 1:cols) > { > y <- as.vector (out1 [,l]) > if (length (y) > length (na.omit (y))) > { > setNA <- function (l) { > NAs <- rep (NA, length (out3)) > NAs } > out2[l,] <- setNA (l) > } > else > { > regression <- function (l) { > model <- lm (as.formula(form)) > out3[1] <- summary (model)$r.squared > out3[2] <- summary (model)$adj.r.squared > f <- summary (model)$fstatistic > out3[3] <- 1-pf(f[1],f[2],f[3]) > out3[4:items] <- coef(model)[1:(1 + ncol(x))] > out3 } > out2[l,] <- regression (l) > } > } > out2 > } > fillrow <- getgrid (j) > > # Append results to output files > for (m in 1:items) { > write.table (t(fillrow [,m]), file = paste (outfolder, filenames [m], > ".asc", sep =""), append=T, quote=F, na = as.character (-9999), > row.names = F, col.names = F, dec=".") } > } > } > -- > View this message in context: http://www.nabble.com/Lots-of-huge-matrices%2C-for-loops%2C-speed-tp18303230p18303230.html > Sent from the R help mailing list archive at Nabble.com. > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code. >-- Jim Holtman Cincinnati, OH +1 513 646 9390 What is the problem you are trying to solve?
You are doing a lot of unnecessary work inside each loop. For example the function regression() is defined as many as rows*cols times, but only needs to be defined once. Similarly getgrid() and getslice() (which does need to be defined inside the j loop, but not inside the k loop). HTH Ray Brownrigg On Mon, 07 Jul 2008, Zarza wrote:> Hello, > we have 80 text files with matrices. Each matrix represents a map (rows for > latitude and columns for longitude), the 80 maps represent steps in time. > In addition, we have a vector x of length 80. We would like to compute a > regression between matrices (response through time) and x and create maps > representing coefficients, r2 etc. Problem: the 80 matrices are of the size > 4000 x 3500 and we were running out of memory. We computed line by line and > the results for each line were appended to output grids. This works. But - > for each line, 80 text files must be scanned and output must be written. > And there are several for-loops involved. This takes a lot of time (about a > week). I read the contributions related to speeding up code and maybe > vectorizing parts of the procedure could help a bit. However, I am a > neophyte (as you may see from the code below) and did not find a way by > now. I would appreciate very much any suggestions for speeding up the > procedure. Thanks, Zarza > > > > > The code (running but sloooooow): > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > regrid <- function (infolder, x, outfolder) { > > # List of input files > setwd (infolder) > filelist <- dir (pattern=".*.asc$", full.names = F) > > # Dimensions (making use of the header information coming with > # the .asc-input files, ESRI-format) > hd <- read.table (filelist [1], nrows = 6) > cols <- hd[1,2] > rows <- hd[2,2] > times <- length (filelist) > items <- 4 + ncol (x) > > # Prepare output > out1 <- matrix (numeric (times * cols), ncol = cols) > out2 <- matrix (numeric (items * cols), ncol = items) > out3 <- as.numeric (items) > > # Prepare .asc-files > filenames <- c("R2", "adj.R2", "p", "b0", colnames (x)) > for (i in 1:items) { > write.table (hd, file = paste (outfolder, filenames [i],".asc",sep =""), > quote=F, row.names=F, col.names=F) } > rm (hd) > > # Prepare regression > xnam <- paste ("x[,", 1:(ncol(x)),"]", sep="") > form <- paste("y ~ ", paste(xnam, collapse="+")) > rm (xnam) > > # Loop through rows > for (j in 1:rows) { > getgrid <- function (j) { > print (paste ("Row",j,"/",rows),quote = F) > > # Read out multi-temporal response values for one grid-row of cells > for (k in 1:times) > { > getslice <- function (k) { > values <- scan (filelist [k], what=0, na.strings = "-9999", > skip = (5 + j), nlines = 1, nmax = cols, quiet=T) > values } > out1[k,] <- getslice (k) > } > > # Regression > for (l in 1:cols) > { > y <- as.vector (out1 [,l]) > if (length (y) > length (na.omit (y))) > { > setNA <- function (l) { > NAs <- rep (NA, length (out3)) > NAs } > out2[l,] <- setNA (l) > } > else > { > regression <- function (l) { > model <- lm (as.formula(form)) > out3[1] <- summary (model)$r.squared > out3[2] <- summary (model)$adj.r.squared > f <- summary (model)$fstatistic > out3[3] <- 1-pf(f[1],f[2],f[3]) > out3[4:items] <- coef(model)[1:(1 + ncol(x))] > out3 } > out2[l,] <- regression (l) > } > } > out2 > } > fillrow <- getgrid (j) > > # Append results to output files > for (m in 1:items) { > write.table (t(fillrow [,m]), file = paste (outfolder, filenames [m], > ".asc", sep =""), append=T, quote=F, na = as.character (-9999), > row.names = F, col.names = F, dec=".") } > } > }
Another possibility is to use explicit formula, i.e. if you are doing linear regression like y = a*x + b then the explicit formulae are: a = (meanXY - meanX*meanY)/(meanX2 - meanX^2) b = (meanY*meanX2 - meanX*meanXY)/(meanX2 - meanX^2) where meanX is mean(x), meanXY is mean(x*y), meanX2 is mean(x^2), etc. So if x is your time vector you could do something like: meanY <- matrix(0,nrow=4000,ncol=3500) meanXY <- matrix(0,nrow=4000,ncol=3500) meanY2 <- matrix(0,nrow=4000,ncol=3500) for (i in 1:80) { A <- read.table(file[i]) meanY = meanY +A meanXY <- meanXY + x[i]*A meanY2 <- meanY2 + A*A } now: meanY <- meanY/80 meanXY <- meanXY/80 meanY2 <- meanY2/80 meanX <- mean(x) meanX2 <- mean(x*x) and now use the above formulae to compute regression coefficients. You will need space for 4 4000x3500 matrices and this should not be a problem. Once a and b are found you can use meanX,meanX2,meanXY,meanY,meanY2 to compute R2 as well. --- On Mon, 7/7/08, Zarza <s.schmidtlein at uni-bonn.de> wrote:> From: Zarza <s.schmidtlein at uni-bonn.de> > Subject: [R] Lots of huge matrices, for-loops, speed > To: r-help at r-project.org > Received: Monday, 7 July, 2008, 1:39 AM > Hello, > we have 80 text files with matrices. Each matrix represents > a map (rows for > latitude and columns for longitude), the 80 maps represent > steps in time. In > addition, we have a vector x of length 80. We would like to > compute a > regression between matrices (response through time) and x > and create maps > representing coefficients, r2 etc. Problem: the 80 matrices > are of the size > 4000 x 3500 and we were running out of memory. We computed > line by line and > the results for each line were appended to output grids. > This works. But - > for each line, 80 text files must be scanned and output > must be written. And > there are several for-loops involved. This takes a lot of > time (about a > week). I read the contributions related to speeding up code > and maybe > vectorizing parts of the procedure could help a bit. > However, I am a > neophyte (as you may see from the code below) and did not > find a way by now. > I would appreciate very much any suggestions for speeding > up the procedure. > Thanks, Zarza > > > > > The code (running but sloooooow): > ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > regrid <- function (infolder, x, outfolder) { > > # List of input files > setwd (infolder) > filelist <- dir (pattern=".*.asc$", full.names > = F) > > # Dimensions (making use of the header information coming > with > # the .asc-input files, ESRI-format) > hd <- read.table (filelist [1], nrows = 6) > cols <- hd[1,2] > rows <- hd[2,2] > times <- length (filelist) > items <- 4 + ncol (x) > > # Prepare output > out1 <- matrix (numeric (times * cols), ncol = cols) > out2 <- matrix (numeric (items * cols), ncol = items) > out3 <- as.numeric (items) > > # Prepare .asc-files > filenames <- c("R2", "adj.R2", > "p", "b0", colnames (x)) > for (i in 1:items) { > write.table (hd, file = paste (outfolder, filenames > [i],".asc",sep =""), > quote=F, row.names=F, col.names=F) } > rm (hd) > > # Prepare regression > xnam <- paste ("x[,", > 1:(ncol(x)),"]", sep="") > form <- paste("y ~ ", paste(xnam, > collapse="+")) > rm (xnam) > > # Loop through rows > for (j in 1:rows) { > getgrid <- function (j) { > print (paste > ("Row",j,"/",rows),quote = F) > > # Read out multi-temporal response values for one > grid-row of cells > for (k in 1:times) > { > getslice <- function (k) { > values <- scan (filelist [k], what=0, > na.strings = "-9999", > skip = (5 + j), nlines = 1, nmax = cols, > quiet=T) > values } > out1[k,] <- getslice (k) > } > > # Regression > for (l in 1:cols) > { > y <- as.vector (out1 [,l]) > if (length (y) > length (na.omit (y))) > { > setNA <- function (l) { > NAs <- rep (NA, length (out3)) > NAs } > out2[l,] <- setNA (l) > } > else > { > regression <- function (l) { > model <- lm (as.formula(form)) > out3[1] <- summary (model)$r.squared > out3[2] <- summary (model)$adj.r.squared > f <- summary (model)$fstatistic > out3[3] <- 1-pf(f[1],f[2],f[3]) > out3[4:items] <- coef(model)[1:(1 + > ncol(x))] > out3 } > out2[l,] <- regression (l) > } > } > out2 > } > fillrow <- getgrid (j) > > # Append results to output files > for (m in 1:items) { > write.table (t(fillrow [,m]), file = paste (outfolder, > filenames [m], > ".asc", sep =""), append=T, > quote=F, na = as.character (-9999), > row.names = F, col.names = F, dec=".") } > } > } > -- > View this message in context: > http://www.nabble.com/Lots-of-huge-matrices%2C-for-loops%2C-speed-tp18303230p18303230.html > Sent from the R help mailing list archive at Nabble.com. > > ______________________________________________ > R-help at r-project.org mailing list > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide > http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, > reproducible code.