David Terk
2012-Jul-22 19:54 UTC
[Rd] Reading many large files causes R to crash - Possible Bug in R 2.15.1 64-bit Ubuntu
I am reading several hundred files, anywhere from 50k to 400k in size. It
appears that when I read these files with R 2.15.1, the process hangs or
segfaults on the scan() call. This does not happen with R 2.14.1.

This is happening on the Precise (12.04) build of Ubuntu.

I have included everything, but the issue appears to arise when performing
the scan in the function parseTickData.

Below is the code. Hopefully this is the right place to post.

library(xts)  # provides xts(), as.xts(), na.locf(), to.period(), make.index.unique()

parseTickDataFromDir = function(tickerDir, per, subper, fun) {
  tickerAbsFilenames = list.files(tickerDir, full.names = TRUE)
  tickerNames = list.files(tickerDir, full.names = FALSE)
  # Strip the "_<suffix>.csv" part to recover the ticker names
  tickerNames = gsub("_[a-zA-Z0-9]+\\.csv", "", tickerNames)
  pb <- txtProgressBar(min = 0, max = length(tickerAbsFilenames), style = 3)

  for (i in 1:length(tickerAbsFilenames)) {
    # Grab raw tick data
    dat.i = parseTickData(tickerAbsFilenames[i])
    #Sys.sleep(1)

    # Create template: one timestamp per second from each session open
    # (09:30:00 through 16:00:00)
    dates <- unique(substr(as.character(index(dat.i)), 1, 10))
    times <- rep("09:30:00", length(dates))
    openDateTimes <- strptime(paste(dates, times), "%F %H:%M:%S")
    templateTimes <- NULL

    for (j in 1:length(openDateTimes)) {
      if (is.null(templateTimes)) {
        templateTimes <- openDateTimes[j] + 0:23400
      } else {
        templateTimes <- c(templateTimes, openDateTimes[j] + 0:23400)
      }
    }

    # Convert templateTimes to xts, merge with the data, and handle NAs
    templateTimes <- as.xts(templateTimes)
    dat.i <- merge(dat.i, templateTimes, all = TRUE)
    # If there is no data in the first print, we will have leading NAs,
    # so set them to -1, since we do not want these values removed by to.period
    if (is.na(dat.i[1])) {
      dat.i[1] <- -1
    }
    # Fill remaining NAs
    dat.i <- na.locf(dat.i)
    # Convert to desired bucket size
    dat.i <- to.period(dat.i, period = per, k = subper, name = NULL)
    # Always use the templated index, otherwise merge fails with other symbols
    index(dat.i) <- index(to.period(templateTimes, period = per, k = subper))
    # If there was missing data at the open, set Close to NA
    valsToChange <- which(dat.i[, "Open"] == -1)
    if (length(valsToChange) != 0) {
      dat.i[valsToChange, "Close"] <- NA
    }
    if (i == 1) {
      DAT = fun(dat.i)
    } else {
      DAT = merge(DAT, fun(dat.i))
    }
    setTxtProgressBar(pb, i)
  }
  close(pb)
  colnames(DAT) = tickerNames
  return(DAT)
}

parseTickData <- function(inputFile) {
  DAT.list <- scan(file = inputFile, sep = ",", skip = 1,
                   what = list(Date = "", Time = "", Close = 0, Volume = 0),
                   quiet = TRUE)
  index <- as.POSIXct(paste(DAT.list$Date, DAT.list$Time),
                      format = "%m/%d/%Y %H:%M:%S")
  DAT.xts <- xts(DAT.list$Close, index)
  DAT.xts <- make.index.unique(DAT.xts)
  return(DAT.xts)
}
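For context, the scan() template in parseTickData implies input files with a
header row and four comma-separated columns, roughly like this (the values
here are invented for illustration):

    Date,Time,Close,Volume
    07/20/2012,09:30:00,92.61,14200
    07/20/2012,09:30:01,92.63,500
    07/20/2012,09:30:03,92.60,1100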
Joshua Ulrich
2012-Jul-22 20:47 UTC
[Rd] Reading many large files causes R to crash - Possible Bug in R 2.15.1 64-bit Ubuntu
Cross-posted on Stack Overflow: http://stackoverflow.com/q/11596747/271616

--
Joshua Ulrich  |  about.me/joshuaulrich
FOSS Trading   |  www.fosstrading.com

On Sun, Jul 22, 2012 at 2:54 PM, David Terk <david.terk at gmail.com> wrote:
> I am reading several hundred files. Anywhere from 50k-400k in size. It
> appears that when I read these files with R 2.15.1 the process will hang or
> seg fault on the scan() call. This does not happen on R 2.14.1.
> [... rest of quoted message and code trimmed; see the original post above ...]
Duncan Murdoch
2012-Jul-22 20:47 UTC
[Rd] Reading many large files causes R to crash - Possible Bug in R 2.15.1 64-bit Ubuntu
On 12-07-22 3:54 PM, David Terk wrote:
> I am reading several hundred files. Anywhere from 50k-400k in size. It
> appears that when I read these files with R 2.15.1 the process will hang or
> seg fault on the scan() call. This does not happen on R 2.14.1.

The code below doesn't do anything other than define a couple of functions.
Please simplify it to code that creates a file (or multiple files), reads it
or them, and shows a bug. If you can't do that, then gradually add the rest
of the stuff from these functions into the mix until you figure out what is
really causing the bug.

If you don't post code that allows us to reproduce the crash, it's really
unlikely that we'll be able to fix it.

Duncan Murdoch

> [... rest of quoted message and code trimmed; see the original post above ...]
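Along the lines Duncan suggests, a self-contained reproduction might look
like the sketch below. The file count, row count, and file names are
illustrative guesses, not taken from the original report; only the scan()
call mirrors parseTickData.

    # Hypothetical minimal reproduction: write a few hundred CSV files shaped
    # like the tick data, then read each one back with scan() alone.
    dir <- file.path(tempdir(), "tickdata")
    dir.create(dir, showWarnings = FALSE)

    n <- 5000                              # rows per file (arbitrary)
    for (i in 1:200) {                     # number of files (arbitrary)
      dat <- data.frame(
        Date   = "07/20/2012",
        Time   = format(as.POSIXct("2012-07-20 09:30:00") + seq_len(n), "%H:%M:%S"),
        Close  = round(100 + cumsum(rnorm(n, 0, 0.01)), 2),
        Volume = sample(100:10000, n, replace = TRUE)
      )
      write.csv(dat, file.path(dir, sprintf("SYM%03d_X.csv", i)), row.names = FALSE)
    }

    # Isolate the suspect call: if scan() by itself hangs or segfaults here,
    # the xts/to.period machinery is ruled out.
    for (f in list.files(dir, full.names = TRUE)) {
      x <- scan(file = f, sep = ",", skip = 1,
                what = list(Date = "", Time = "", Close = 0, Volume = 0),
                quiet = TRUE)
    }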