class Scraper
Public Class Methods
scrape_player(url)
click to toggle source
Scrapes the player data from a URL or a local HTML file.
# File lib/scraper.rb, line 13

# Scrapes every player found at +url+ and returns one merged hash per player.
#
# The page is scraped in two passes because the markup between a player's
# headline block and the heading-container holding the measurements is
# irregular: the number of elements in between varies, and sometimes there is
# a second heading-container holding only an NBSP. Walking from the name block
# to the matching measurement block is therefore unreliable, so each kind of
# block is collected on its own and the two lists are paired by position.
#
# @param url [String] passed unchanged to scrape_page_head and scrape_phys_meas
# @return [Array<Hash>] one hash per entry returned by scrape_page_head, with
#   the keys of the measurement hash at the same position merged in
def self.scrape_player(url)
  # First pass: name, position, school/club, class year and rank.
  player_id_arr = scrape_page_head(url)
  # Second pass: height, weight, age, last rank and scouting blurb.
  player_info_arr = scrape_phys_meas(url)

  # zip pads with nil when the second list is shorter; merge an empty hash in
  # that case so a player without a measurement block keeps the headline data
  # instead of raising TypeError.
  player_id_arr.zip(player_info_arr).map do |identity, measurements|
    identity.merge(measurements || {})
  end
end
Private Class Methods
scrape_page_head(url)
click to toggle source
The page has to be scraped in two passes because of too many irregularities in the HTML/CSS code. This first pass scrapes the names.
# File lib/scraper.rb, line 38

# First scraping pass: reads every player headline block on the page.
#
# Each "div.component.headline-block" produces one hash with
#   :name       - text of div.head up to the first comma
#   :position   - subhead text before the first comma
#   :schoolclub - text after the last comma, before the "|"
#   :class_year - text after the "|" (key omitted when it equals :schoolclub)
#   :rank       - 1-based position of the block on the page
# Values are nil when the corresponding text is missing.
#
# @param url [String] URL or path of a local HTML file
# @return [Array<Hash>] one hash per headline block, in page order
def scrape_page_head(url)
  # Kernel#open stopped fetching URLs in Ruby 3.0; URI.open handles URLs and
  # still falls back to plain files. Required here because the file's own
  # require list is not visible from this method.
  require "open-uri"

  # Block form closes the handle as soon as the document has been parsed.
  playerindex = URI.open(url) { |html| Nokogiri::HTML(html) }

  # Scrape all the player headline blocks (player name, position, school/club).
  headline_blocks = playerindex.css("div.component.headline-block")

  headline_blocks.each_with_index.map do |phrow, index|
    playhashrow = {}

    # split(",").first removes a comma that follows the last name on the list.
    playhashrow[:name] = phrow.css("div.text-block div.head").text.split(",").first

    # The subhead reads "position, school/club | year".
    subhead_parts = phrow.css("div.text-block div.subhead").text.split(",")
    playhashrow[:position] = subhead_parts.first

    # to_s / &. keep an empty subhead from raising NoMethodError on nil.
    school_and_year = subhead_parts.last.to_s.split("|")
    playhashrow[:schoolclub] = school_and_year.first&.strip

    # Without a "|" the first and last pieces are the same text, so the
    # "year" equals the school/club and there is no class year to record.
    class_year = school_and_year.last&.strip
    playhashrow[:class_year] = class_year if class_year != playhashrow[:schoolclub]

    # Store the current rank in the hash to avoid having to search later.
    playhashrow[:rank] = index + 1
    playhashrow
  end
end
scrape_phys_meas(url)
click to toggle source
The page has to be scraped in two passes because of odd extra div nodes/lines in the HTML.
# File lib/scraper.rb, line 86

# Second scraping pass: reads the physical measurements and the blurb that
# follows them for every player on the page.
#
# Only "div.component.heading-container" nodes whose text contains "Height"
# are used; other heading-containers are skipped. The node text is a
# "|"-separated list of "Label: value" fields read by position: height,
# weight, age and, optionally, last rank.
#
# Each resulting hash has
#   :height, :weight - value strings (nil when the field is missing)
#   :age             - Integer (nil when the field is missing)
#   :last_rank       - only present when a fourth field exists
#   :blurb           - text of the element following the container (nil if none)
#
# @param url [String] URL or path of a local HTML file
# @return [Array<Hash>] one hash per matching heading-container, in page order
def scrape_phys_meas(url)
  # Kernel#open stopped fetching URLs in Ruby 3.0; URI.open handles URLs and
  # still falls back to plain files. Required here because the file's own
  # require list is not visible from this method.
  require "open-uri"

  # Block form closes the handle as soon as the document has been parsed.
  playerindex = URI.open(url) { |html| Nokogiri::HTML(html) }

  measurement_nodes = playerindex.css("div.component.heading-container")
                                 .select { |node| node.text.include?("Height") }

  # Text after the last ":" of a "Label: value" field; nil for a missing field.
  field_value = ->(field) { field&.split(":")&.last&.strip }

  measurement_nodes.map do |pmeas_node|
    # NBSP is a constant defined elsewhere in this file.
    fields = pmeas_node.text.gsub(NBSP, "").strip.split("|")

    row_hash = {}
    row_hash[:height] = field_value.call(fields[0])
    row_hash[:weight] = field_value.call(fields[1])
    row_hash[:age] = field_value.call(fields[2])&.to_i
    # Most entries without a previous rank say "Last Rank: NR", but one has
    # no fourth field at all, so the key is only set when the field exists.
    row_hash[:last_rank] = field_value.call(fields[3]) if fields[3]
    row_hash[:blurb] = pmeas_node.next_element&.text
    row_hash
  end
end