Hello,
This is what I am trying to do: I wrote a little function that takes
addresses (coordinates) as input, and returns the road distance between
every two points using Google Maps. Catch is, there are 2000 addresses, so I
have to get around 2x10^6 distances. On my first go, this is what I did:
#########################################
# Pairwise road distances (in miles) between coordinates, scraped from the
# Google Maps directions page.
#
# Args:
#   X:        matrix or data frame with one point per row; the first two
#             columns are pasted together as "lat,lon".
#   complete: if FALSE (default) only the strict lower triangle holds
#             distances and every other cell is NA; if TRUE the diagonal is
#             set to 0 and the lower triangle is mirrored into the upper one.
#   fetch:    function(url) returning the page body as one string. Defaults
#             to RCurl::getURL; pass your own to throttle, cache or retry.
#
# Returns:
#   Numeric n x n matrix (n = nrow(X)); its dimnames are the row names of X
#   when it has any. Cell [i, j] with i > j is the distance from point i to
#   point j, or NA when the page has no "<div><b>... mi</b>" fragment.
getRoadDist <- function(X, complete = FALSE, fetch = RCurl::getURL) {
  coords <- as.matrix(X)
  if (ncol(coords) < 2L) {
    stop("X must have at least two columns (lat, lon)", call. = FALSE)
  }
  n <- nrow(coords)
  points <- paste0(coords[, 1L], ",", coords[, 2L])
  labels <- rownames(coords)

  # Fetch one directions page and pull out the number that sits between
  # "<div><b>" and " mi</b>".
  road_miles <- function(from, to) {
    page <- fetch(
      paste0("http://maps.google.com/maps?saddr=", from, "&daddr=", to)
    )
    after_marker <- strsplit(page, "<div><b>", fixed = TRUE)[[1]][2]
    miles <- strsplit(after_marker, " mi</b>", fixed = TRUE)[[1]][1]
    # Drop thousands separators ("1,234") before converting.
    as.numeric(gsub(",", "", miles, fixed = TRUE))
  }

  # Fill a numeric matrix in place instead of first materialising an n x n
  # grid of URL strings; only the strict lower triangle is ever requested.
  dists <- matrix(NA_real_, nrow = n, ncol = n, dimnames = list(labels, labels))
  for (j in seq_len(max(n - 1L, 0L))) {
    for (i in (j + 1L):n) {
      dists[i, j] <- road_miles(points[i], points[j])
    }
  }

  if (complete) {
    diag(dists) <- 0
    # Mirror through the transpose: lower.tri() and upper.tri() enumerate
    # their cells in different orders, so copying one straight onto the
    # other misplaces values once n >= 4.
    upper <- upper.tri(dists)
    dists[upper] <- t(dists)[upper]
  }
  dists
}
#########################################
But R was crashing after 1 hour or so -- it either said "Reached total
allocation of 1535Mb" or, became unresponsive. Then, I tried to modify the
procedure to avoid big matrices. What I did was, I got the distances
and, one by one, appended them to a file in the hope that this would use
less memory:
##########################################
# X is the matrix of addresses, as before
require(RCurl)
# One "lat,lon" string per row of X; names are X's row names when present.
Y <- apply(X, 1, function(pt) paste0(pt[1], ",", pt[2]))
# Character matrix whose [i, j] cell is "<point i>&daddr=<point j>".
# rep() reproduces expand.grid()'s ordering (first factor varies fastest)
# without building an n^2-row data frame and then apply()-ing over its rows.
grid <- matrix(
  paste0(rep(Y, times = length(Y)), "&daddr=", rep(Y, each = length(Y))),
  ncol = length(Y),
  dimnames = list(names(Y), names(Y))
)
# Blank the diagonal and the upper triangle so that each unordered pair of
# points is looked up only once (row > column).
grid[upper.tri(grid, TRUE)] <- NA
# Look up a single road distance (in miles) on the Google Maps directions
# page.
#
# Args:
#   x: query fragment of the form "<lat,lon>&daddr=<lat,lon>", or NA.
#
# Returns:
#   NA when x is NA; otherwise the number found between "<div><b>" and
#   " mi</b>" in the returned page (NA if that fragment is absent).
#
# Uses getURL(), so RCurl has to be attached by the calling script.
Distances <- function(x) {
  if (is.na(x)) {
    return(NA)
  }
  # The start-address parameter needs its "=": "...?saddr=<lat,lon>&daddr=..."
  page <- getURL(paste0("http://maps.google.com/maps?saddr=", x))
  after_marker <- strsplit(page, "<div><b>", fixed = TRUE)[[1]][2]
  miles <- strsplit(after_marker, " mi</b>", fixed = TRUE)[[1]][1]
  # Drop thousands separators ("1,234") before converting.
  as.numeric(gsub(",", "", miles, fixed = TRUE))
}
# Keep only the cells of grid that still hold a query string.
grid2 <- grid[!is.na(grid)]
n <- length(grid2)
# seq_len() instead of 1:n: with an empty grid2, 1:0 would iterate over
# c(1, 0) and call Distances() on entries that do not exist.
for (i in seq_len(n)) {
  temp <- Distances(grid2[i])
  # Append each result to the file as soon as it is available, one value
  # per line, rather than accumulating results in memory.
  write.table(temp, "distances.csv", col.names = FALSE, row.names = FALSE,
              append = TRUE)
}
##########################################
But R still crashes after 2 hours (all I got was around 20,000 distances).
It doesn't really matter how long this will take me (I can always use more
than one machine), but I'd really like to get this done. Any thoughts?
Many many thanks,
Dimitri
[[alternative HTML version deleted]]