You can also do it with rvest & httr (but that does involve some
"parsing"):
library(httr)
library(rvest)

url <- "http://nwis.waterdata.usgs.gov/nwis/peak?site_no=12144500&agency_cd=USGS&format=img"

read_html(url) %>%                                  # html() in older rvest
  html_nodes("img") %>%                             # find the <img> tag
  html_attr("src") %>%                              # pull its relative path
  paste0("http://nwis.waterdata.usgs.gov", .) %>%   # make it absolute
  GET(write_disk("12144500.gif")) -> status         # fetch and save to disk
Very readable, and it can be made programmatic pretty easily, too (a
sketch follows). Plus, it avoids direct use of the XML library; newer
rvest versions swap in xml2 for XML as well, which is why read_html()
replaces the old html().
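Here's a minimal sketch of the programmatic version, assuming a vector
of site numbers (the helper name fetch_peak_img is mine, not part of
any package):

library(httr)
library(rvest)

# hypothetical helper: download the peak-flow graphic for one site number
fetch_peak_img <- function(site) {
  url <- paste0("http://nwis.waterdata.usgs.gov/nwis/peak?site_no=",
                site, "&agency_cd=USGS&format=img")
  src <- read_html(url) %>% html_nodes("img") %>% html_attr("src")
  # take the first <img> on the page and resolve it against the server
  GET(paste0("http://nwis.waterdata.usgs.gov", src[1]),
      write_disk(paste0(site, ".gif"), overwrite = TRUE))
}

sites <- c("12142000", "12134500", "12149000")
lapply(sites, fetch_peak_img)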
-Bob
On Mon, Jun 8, 2015 at 2:09 PM, Curtis DeGasperi
<curtis.degasperi at gmail.com> wrote:
> Thanks to Jim's prompting, I think I came up with a fairly painless
> way to parse the HTML without having to write any parsing code myself,
> using the function getHTMLExternalFiles in the XML package (see the
> filtering note after the code). A working version of the code follows:
>
> ## Code to process USGS peak flow data
>
> require(dataRetrieval)
> require(XML)
>
> ## Need to start with list of gauge ids to process
>
> siteno <- c('12142000','12134500','12149000')
>
> lstas <- length(siteno) # length of locator list
>
> print(paste('Processing...', siteno[1], sep = ""))
>
> datall <- readNWISpeak(siteno[1])
>
> for (a in 2:lstas) {
> # Print station being processed
> print(paste('Processing...', siteno[a], sep = ""))
>
> dat<- readNWISpeak(siteno[a])
>
> datall <- rbind(datall,dat)
>
> }
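>
> A side note: seeding datall with the first site and then looping over
> 2:lstas breaks when there is only one site, since 2:1 counts down. A
> sketch of an equivalent call that avoids the seed entirely:
>
> datall <- do.call(rbind, lapply(siteno, readNWISpeak))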
>
> write.csv(datall, file = "usgs_peaks.csv")
>
> # Retrieve ascii text files and graphics
> for (a in 1:lstas) {
>
> print(paste('Processing...', siteno[a], sep = ""))
>
> graphic.url <- paste('http://nwis.waterdata.usgs.gov/nwis/peak?site_no=',
>                      siteno[a], '&agency_cd=USGS&format=img', sep = "")
> usgs.img <- getHTMLExternalFiles(graphic.url)
> graphic.img <- paste('http://nwis.waterdata.usgs.gov', usgs.img, sep = "")
>
> peakfq.url <- paste('http://nwis.waterdata.usgs.gov/nwis/peak?site_no=',
>                     siteno[a], '&agency_cd=USGS&format=hn2', sep = "")
> tab.url <- paste('http://nwis.waterdata.usgs.gov/nwis/peak?site_no=',
>                  siteno[a], '&agency_cd=USGS&format=rdb', sep = "")
>
> graphic.fn <- paste('graphic_', siteno[a], '.gif', sep = "")
> peakfq.fn <- paste('peakfq_', siteno[a], '.txt', sep = "")
> tab.fn <- paste('tab_', siteno[a], '.txt', sep = "")
> download.file(graphic.img,graphic.fn,mode='wb')
> download.file(peakfq.url,peakfq.fn)
> download.file(tab.url,tab.fn)
> }
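>
> One caveat: getHTMLExternalFiles returns the paths of all externally
> referenced files it finds on the page (images, scripts, style sheets),
> not just the plot, so it can be worth filtering the result down to the
> .gif before pasting the server URL onto it. A minimal sketch of that
> filter (the pattern is my assumption about how the page links the plot):
>
> usgs.img <- getHTMLExternalFiles(graphic.url)
> usgs.img <- grep('\\.gif$', usgs.img, value = TRUE)[1]  # keep only the peak-flow plot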
>
>> ------------------------------
>>
>> Message: 34
>> Date: Fri, 5 Jun 2015 08:59:04 +1000
>> From: Jim Lemon <drjimlemon at gmail.com>
>> To: Curtis DeGasperi <curtis.degasperi at gmail.com>
>> Cc: r-help mailing list <r-help at r-project.org>
>> Subject: Re: [R] web scraping image
>> Message-ID: <CA+8X3fV0aJw+E22JayV1GfM6JR_taZuA5FwGD3T_mfGfQy2nFA at mail.gmail.com>
>> Content-Type: text/plain; charset=UTF-8
>>
>> Hi Curtis,
>> I don't have the packages you are using, but tracing this indicates
>> that the page source contains the relative path of the graphic, in
>> this case:
>>
>> /nwisweb/data/img/USGS.12144500.19581112.20140309..0.peak.pres.gif
>>
>> and you already have the server URL:
>>
>> nwis.waterdata.usgs.gov
>>
>> Getting the path out of the page source isn't difficult: just split
>> the text at double quotes and take the token following "img src=".
>> Paste the server URL and that relative path together, and display the
>> result to make sure it matches the image you want; the full URL is
>> then the first argument to "download.file" and the local file name is
>> the second (a base-R sketch follows). When I did this, the correct
>> image appeared in my browser. I'm using Google Chrome, so I don't
>> have to prepend the http://
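>>
>> A minimal base-R sketch of that token-splitting approach (no scraping
>> packages; the exact grep pattern is my reading of the suggestion
>> above, not tested against the live page):
>>
>> src <- readLines("http://nwis.waterdata.usgs.gov/nwis/peak?site_no=12144500&agency_cd=USGS&format=img")
>> tokens <- strsplit(paste(src, collapse = ""), '"')[[1]]
>> # the token after the one ending in 'img src=' is the relative path
>> rel <- tokens[grep("img src=$", tokens)[1] + 1]
>> download.file(paste0("http://nwis.waterdata.usgs.gov", rel),
>>               "12144500.gif", mode = "wb")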
>>
>> Jim
>>
>> On Fri, Jun 5, 2015 at 2:31 AM, Curtis DeGasperi
>> <curtis.degasperi at gmail.com> wrote:
>>> I'm working on a script that downloads data from the USGS NWIS
>>> server. dataRetrieval makes it easy to quickly get the data in a neat
>>> tabular format, but I was also interested in getting the tabular text
>>> files - also fairly easy for me using download.file.
>>>
>>> However, I'm not skilled enough to work out how to download the nice
>>> graphic files that can be produced dynamically from the USGS NWIS
>>> server (for example:
>>> http://nwis.waterdata.usgs.gov/nwis/peak?site_no=12144500&agency_cd=USGS&format=img )
>>>
>>> My question is: how do I get the image from this web page and save
>>> it to a local directory? scrapeR returns the information from the
>>> page, and I suspect this is a possible solution path, but I don't
>>> know what the next step is.
>>>
>>> My code provided below works from a list I've created of USGS flow
>>> gauging stations.
>>>
>>> Curtis
>>>
>>> ## Code to process USGS daily flow data for high and low flow analysis
>>> ## Need to start with list of gauge ids to process
>>> ## Can't figure out how to automate download of images
>>>
>>> require(dataRetrieval)
>>> require(data.table)
>>> require(scrapeR)
>>>
>>> df <- read.csv("usgs_stations.csv", header=TRUE)
>>>
>>> lstas <- length(df$siteno) # length of locator list
>>>
>>> print(paste('Processing...', df$name[1], ' ', df$siteno[1], sep = ""))
>>>
>>> datall <- readNWISpeak(df$siteno[1])
>>>
>>> for (a in 2:lstas) {
>>> # Print station being processed
>>> print(paste('Processing...', df$name[a], ' ', df$siteno[a], sep = ""))
>>>
>>> dat<- readNWISpeak(df$siteno[a])
>>>
>>> datall <- rbind(datall,dat)
>>>
>>> }
>>>
>>> write.csv(datall, file = "usgs_peaks.csv")
>>>
>>> # Retrieve ascii text files and graphics
>>>
>>> for (a in 1:lstas) {
>>>
>>> print(paste('Processing...', df$name[a], ' ', df$siteno[a], sep = ""))
>>>
>>> graphic.url <- paste('http://nwis.waterdata.usgs.gov/nwis/peak?site_no=',
>>>                      df$siteno[a], '&agency_cd=USGS&format=img', sep = "")
>>> peakfq.url <- paste('http://nwis.waterdata.usgs.gov/nwis/peak?site_no=',
>>>                     df$siteno[a], '&agency_cd=USGS&format=hn2', sep = "")
>>> tab.url <- paste('http://nwis.waterdata.usgs.gov/nwis/peak?site_no=',
>>>                  df$siteno[a], '&agency_cd=USGS&format=rdb', sep = "")
>>>
>>> graphic.fn <- paste('graphic_', df$siteno[a], '.gif', sep = "")
>>> peakfq.fn <- paste('peakfq_', df$siteno[a], '.txt', sep = "")
>>> tab.fn <- paste('tab_', df$siteno[a], '.txt', sep = "")
>>>
>>> # This apparently doesn't work - the downloaded file is empty
>>> download.file(graphic.url, graphic.fn, mode = 'wb')
>>> download.file(peakfq.url,peakfq.fn)
>>> download.file(tab.url,tab.fn)
>>> }
>>>
>>> # scrapeR
>>> pageSource <- scrape(url = "http://nwis.waterdata.usgs.gov/nwis/peak?site_no=12144500&agency_cd=USGS&format=img",
>>>                      headers = TRUE, parse = FALSE)
>>> page <- scrape(object = "pageSource")