class WashingtonHikes::Scraper
Public Class Methods
get_page(url)
click to toggle source
Get HTML for a webpage
# File lib/washington_hikes/scraper.rb, line 3 def self.get_page(url) Nokogiri::HTML(open(url)) end
scrape_wta_hike_description(url)
click to toggle source
Scrape hike details from a hike's webpage
# File lib/washington_hikes/scraper.rb, line 44 def self.scrape_wta_hike_description(url) # Scrape description from hike detail page scraped_details = get_page(url).css("#hike-wrapper") # Return a hash with the desired details {description: scraped_details.css("#hike-body-text p").size == 0 ? "" : scraped_details.css("#hike-body-text p")[0].text} end
scrape_wta_hike_list()
click to toggle source
Scrape hike attributes from list of hikes on WTA
# File lib/washington_hikes/scraper.rb, line 9 def self.scrape_wta_hike_list number_of_pages = 2 # Number of WTA pages to scrape (they always show 30 hikes / page) hikes = [] # Create an empty erray to shovel hikes into page_index = 0 # 1st hike in WTA list that will be scaped - feeds into URL # Iterate through the number of hike pages you wish to scrape number_of_pages.times do url = "https://www.wta.org/go-outside/hikes?b_start:int=#{page_index}" page_of_hikes = get_page(url).css("div#search-result-listing .search-result-item") # Collect a hash of hike attributes for each hike on a page page_of_hikes.each do |hike| hike_attributes = { name: hike.css(".item-header span").text.split(" - ")[0].strip, region: hike.css(".item-header h3.region").text.split(" -- ")[0].strip, url: hike.css(".item-header a.listitem-title").attribute("href").value.strip, length: hike.css(".hike-detail .hike-stats .hike-length").size == 0 ? "unknown" : hike.css(".hike-detail .hike-stats .hike-length span").children.text.split(",")[0].split(" ")[0].to_i, type: hike.css(".hike-detail .hike-stats .hike-length").size == 0 ? "unknown" : hike.css(".hike-detail .hike-stats .hike-length span").children.text.split(",")[-1].strip, rating: hike.css(".hike-detail .hike-stats .hike-rating .Rating .AverageRating .star-rating .current-rating").text.split(" ")[0].to_f, features: hike.css(".hike-detail .hike-stats .trip-features").size == 0 ? [] : hike.css(".hike-detail .hike-stats .trip-features")[0].children.css("img").collect{|feature| feature.attribute("title").value}, elevation_gain: hike.css(".hike-detail .hike-stats .hike-gain").size == 0 ? "unknown" : hike.css(".hike-detail .hike-stats .hike-gain span").children.text.strip.to_i } hikes << hike_attributes end # Update URL index -- WTA always shows 30 hikes / page, starting with page_index page_index += 30 end # Return an array of hashes, each containing attributes for a scraped hike hikes end