class HouseFloorBills::Scraper
Attributes
doc_schedule[RW]
schedule[RW]
Public Class Methods
new(week = "")
click to toggle source
# File lib/house_floor_bills/scraper.rb, line 4 def initialize(week = "") # Format of week must be "2017-03-27" @schedule = HouseFloorBills::Schedule.new @doc_schedule = Nokogiri::HTML(open("https://docs.house.gov/floor/Default.aspx?date=#{week}")) end
Public Instance Methods
get_congress()
click to toggle source
# File lib/house_floor_bills/scraper.rb, line 21 def get_congress current_DC_time = Time.now.getlocal('-04:00') start_of_congress_116 = Time.utc(2019,"jan",3,04,00,0) start_of_congress_117 = Time.utc(2021,"jan",3,04,00,0) start_of_congress_118 = Time.utc(2023,"jan",3,04,00,0) if current_DC_time < start_of_congress_117 return "116th-congress" elsif current_DC_time < start_of_congress_118 return "117th-congress" elsif current_DC_time >= start_of_congress_118 return "118th-congress" end end
scrape()
click to toggle source
# File lib/house_floor_bills/scraper.rb, line 9 def scrape scrape_schedule scrape_bills @schedule end
scrape_bills()
click to toggle source
# File lib/house_floor_bills/scraper.rb, line 37 def scrape_bills print "\nLoading bills " @doc_schedule.search("table.floorItems > tr.floorItem").collect do |floor_item| # Instantiate the bill b = HouseFloorBills::Bill.new # Scrape the data b.number = floor_item.css("td.legisNum").text.strip b.name = floor_item.css("td.floorText").text.strip b.pdf = floor_item.css("td.files a").attr("href").text print "." # Set URL conditionally, based on type of bill: if b.number.split.include? "H.R." b.url = "https://www.congress.gov/bill/#{get_congress}/house-bill/#{b.number.split.last}" elsif b.number.split.include? "H.Res." b.url = "https://www.congress.gov/bill/#{get_congress}/house-resolution/#{b.number.split.last}" elsif b.number.split.include? "S." b.url = "https://www.congress.gov/bill/#{get_congress}/senate-bill/#{b.number.split.last}" else b.url = "" end # Handle error if couldn't get bill URL if b.url == "" # Set all to blank b.sponsor = "" b.committees = "" b.status = "" b.summary = "" else doc_bill ||= Nokogiri::HTML(open(b.url)) b.sponsor = doc_bill.search("table.standard01 > tr:first-child a").text.strip b.committees = doc_bill.search("table.standard01 > tr:nth-child(2) td").text.strip b.status = doc_bill.search("ol.bill_progress li.selected > text()").text.strip b.summary = doc_bill.search("div#bill-summary > p, div#bill-summary li").to_s.gsub("</p>","\n\n").gsub("</li>","\n\n").gsub(/<\/.+>/,"").gsub(/<.+>/,"") if b.summary == "" b.summary = doc_bill.search("div#main > p").text end end # Add the bill to the schedule @schedule.add_bill(b) end end
scrape_schedule()
click to toggle source
# File lib/house_floor_bills/scraper.rb, line 15 def scrape_schedule # Populate @schedule with more data from the schedule page @schedule.title = @doc_schedule.search("div#primaryContent h1 > text()").text.strip.gsub("\r\n ", " ") @schedule.week = @doc_schedule.search("div#primaryContent h1 > text()").text.split("\n").last.strip end