class Scraper
A simple, reusable scraper class that calls Nokogiri and dumps the requested site.
Constants
- CACHE_LOCATIONS
- CHORUS_CHAMPS_SITE
- LOCAL_SITES
- LOCATIONS
site storage
- QUARTET_CHAMPS_SITE
Public Class Methods
load_cache()
click to toggle source
The scraper should know what it is scraping, but should not need to know about the architecture of the data classes.
# File lib/barbershop_contestants/scraper.rb, line 57
# Placeholder for a page cache. Nothing is implemented yet, so this always
# returns nil and any `load_cache || <fetch>` expression falls through to the
# fetch.
#
# Sketch left by the original author (not working):
#   loaded = {}
#   CACHE_LOCATIONS.each do |key, loc|
#     # load loc
#     # loaded[key] = fopen(loc)
#   end
#
# Nokogiri calls to try when implementing this:
#   doc = Nokogiri(string_or_io)
#   node.write_to(io, *options)
#   or node.to_s / .to_html / .to_xml
def self.load_cache
  nil # I'll have to figure this out later :/
end
scrape_and_create_chorus_champs(source)
click to toggle source
# File lib/barbershop_contestants/scraper.rb, line 114
# Builds one attribute hash per row returned by scrape_chorus_champs(source)
# and hands it to Performance.find_or_create with the type "chorus".
#
# Each row's text is split on newlines; element 0 is ignored and elements 1-6
# are read positionally as year, name, hometown/district, director, number on
# stage and score.
#
# Afterwards runs the `clear` shell command, falling back to `cls` when that
# does not succeed; the result of that call is the method's return value.
def self.scrape_and_create_chorus_champs(source)
  scrape_chorus_champs(source).each do |row|
    _lead, year, name, hometown, director, on_stage, score = row.text.split("\n")
    attrs = {
      year: year.to_i,
      name: name,
      hometown_and_district: hometown,
      director: director,
      number_on_stage: on_stage,
      score: score,
      place: 1, # champions definitionally are first place
      type: "chorus"
    }
    Performance.find_or_create(attrs, "chorus")
  end
  system("clear") || system("cls")
end
scrape_and_create_quartet_champs(source)
click to toggle source
# File lib/barbershop_contestants/scraper.rb, line 85
# Builds one attribute hash per row returned by scrape_quartet_champs(source)
# and hands it to Performance.find_or_create with the type "quartet".
#
# Each row's text is split on newlines; element 0 is ignored, elements 1-5 are
# read positionally as year, name, score, district and comments, element 6 is
# skipped, and element 7 is read as the members.
#
# Afterwards runs the `clear` shell command, falling back to `cls` when that
# does not succeed; the result of that call is the method's return value.
def self.scrape_and_create_quartet_champs(source)
  scrape_quartet_champs(source).each do |row|
    _lead, year, name, score, district, comments, _skipped, members = row.text.split("\n")
    attrs = {
      year: year.to_i,
      name: name,
      score: score,
      district: district,
      comments: comments,
      members: members,
      place: 1, # champions definitionally are first place
      type: "quartet"
    }
    Performance.find_or_create(attrs, "quartet")
  end
  system("clear") || system("cls")
end
scrape_and_create_year(source, year, type)
click to toggle source
# File lib/barbershop_contestants/scraper.rb, line 135
# Scrapes one contest year of the given type and passes every result row to
# Performance.find_or_create.
#
# Returns true immediately when +year+ is already listed in
# @years_scraped[type]. Otherwise the year is recorded first, then each row of
# each table from scrape_year is split on newlines and read positionally
# (element 0 ignored; 1-4 are place, name, district, score; element 5 is the
# number on stage and is only stored for the "chorus" type).
#
# Finishes by running the `clear` shell command, falling back to `cls`; the
# result of that call is the return value on this path.
#
# NOTE(review): the year is marked as scraped before the scrape runs, so a
# scrape that raises still leaves the year marked — confirm this is intended.
def self.scrape_and_create_year(source, year, type)
  scraped = @years_scraped[type]
  return true if scraped.include?(year)

  scraped << year
  scrape_year(source, year, type).each do |table_rows|
    table_rows.each do |tr|
      _lead, place, name, district, score, on_stage = tr.text.split("\n")
      attrs = {
        year: year,
        place: place,
        name: name,
        district: district,
        score: score
      }
      attrs[:number_on_stage] = on_stage if type == "chorus"
      Performance.find_or_create(attrs, type)
    end
  end
  system("clear") || system("cls")
end
scrape_chorus_champs(source)
click to toggle source
# File lib/barbershop_contestants/scraper.rb, line 105
# Prints a progress message, loads the chorus champions page for +source+ and
# returns the "tr" nodes of the second ".wikitable" on that page with the
# first (header) row removed.
#
# The page location is LOCATIONS[:base][source] followed by
# LOCATIONS[:c_champs][source].
def self.scrape_chorus_champs(source)
  puts "Scraping Chorus Champs"
  base = LOCATIONS[:base][source]
  page = LOCATIONS[:c_champs][source]
  doc = load_cache || scrape_or_load(base + page)
  # [1] selects the second .wikitable; shift drops its header line in place.
  doc.css(".wikitable")[1].css("tr").tap(&:shift)
end
scrape_or_load(page)
click to toggle source
# File lib/barbershop_contestants/scraper.rb, line 50
# Returns the document for +page+: whatever load_cache returns when that is
# truthy, otherwise +page+ opened and parsed with Nokogiri::HTML.
#
# page - String location to open. URI.open handles "scheme://..." strings
#        itself and hands anything else to Kernel#open, so a plain file path
#        still works (presumably what the local copies use — verify against
#        LOCATIONS).
#
# Uses URI.open rather than bare `open`: open-uri stopped patching Kernel#open
# in Ruby 3.0, so `open("http://...")` no longer fetches remote pages there.
# The block form closes the underlying handle once parsing is done.
def self.scrape_or_load(page)
  cached = load_cache
  return cached if cached

  require "open-uri" # no-op when already loaded; makes URI.open available
  URI.open(page) { |io| Nokogiri::HTML(io) }
end
scrape_quartet_champs(source)
click to toggle source
# File lib/barbershop_contestants/scraper.rb, line 72
# Prints a progress message, loads the quartet champions page for +source+ and
# returns the nodes matching ".wikitable tbody tr" with the first one removed.
#
# The page location is LOCATIONS[:base][source] followed by
# LOCATIONS[:q_champs][source].
#
# TODO: reinstate real scraping functionality when in wifi
def self.scrape_quartet_champs(source)
  puts "Scraping Quartet Champs"
  base = LOCATIONS[:base][source]
  page = LOCATIONS[:q_champs][source]
  doc = load_cache || scrape_or_load(base + page)
  # The header row is not distinguishable via the CSS selector, so it is
  # dropped positionally.
  doc.css(".wikitable tbody tr").tap(&:shift)
end
scrape_year(source, year, type)
click to toggle source
# File lib/barbershop_contestants/scraper.rb, line 160
# Prints a progress message, loads the contest page for +year+ and +type+ and
# returns an Array with one entry per kept ".wikitable": that table's "tr"
# nodes minus the first row.
#
# A table is left out when the text of its first row includes "Admin".
#
# The page location is LOCATIONS[:base][source] followed by the pieces in
# LOCATIONS[:"<first letter of type>_year"][source] joined with the year.
def self.scrape_year(source, year, type)
  puts "Scraping #{type.capitalize} Contest for #{year}"
  year_key = :"#{type[0]}_year"
  location = LOCATIONS[:base][source] + LOCATIONS[year_key][source].join(year.to_s)
  doc = load_cache || scrape_or_load(location)
  doc.css(".wikitable").each_with_object([]) do |table, kept|
    rows = table.css("tr")
    next if rows.first.text.include?("Admin")

    kept << rows.drop(1) # drop(1) discards the table's first row
  end
end