class EFFScraper
Public Class Methods
new(url)
click to toggle source
# File lib/effscraper.rb, line 7 def initialize(url) @url = url @casearray = Array.new end
Public Instance Methods
scrapeCase()
click to toggle source
Scrapes all documents in the case, across every pager page, and returns them as pretty-printed JSON.
# File lib/effscraper.rb, line 13
#
# Scrapes every page of documents in the case and returns the
# collected document hashes as a pretty-printed JSON string.
#
# The pager element "li.pager-current" reads like "Page 1 of N",
# so the third whitespace-separated token is the page count; when
# no pager is present the case has a single page.
#
# NOTE(review): `open(@url)` relies on open-uri extending
# Kernel#open to accept URLs — removed in Ruby 3.0, where this
# must become `URI.open`. Confirm the target Ruby version.
def scrapeCase
  html = Nokogiri::HTML(open(@url))

  # Determine how many pages of results exist (default 1).
  pager = html.css("li.pager-current")[0]
  n = pager ? pager.text.split(" ")[2].to_i : 1

  # Visit each page in turn; pages after the first are reached via
  # the "next" link of the page just parsed. `each` is used instead
  # of `for`, which would leak the loop variable into this scope.
  (1..n).each do |i|
    if i > 1
      link = "https://eff.org" + html.css("li.pager-next")[0].css("a")[0]["href"]
      html = Nokogiri::HTML(open(link))
    end
    scrapePage(html)
  end

  JSON.pretty_generate(@casearray)
end
scrapePage(html)
click to toggle source
Scrapes each page of documents
# File lib/effscraper.rb, line 38
#
# Scrapes a single results page: for each listed document, downloads
# the PDF via wget into public/uploads, records its URL, local path,
# date and title, extracts PDF metadata and text with UploadConvert,
# and appends the resulting Hash to @casearray.
#
# html - Nokogiri::HTML document for one pager page.
def scrapePage(html)
  require "shellwords" # lazy-load: only needed for the shell-out below

  items = html.css("div.view-content")[0]
  items.css("li").each do |l|
    dochash = Hash.new

    # Find the "[PDF]" link, download the file and remember its path.
    l.css("a").each do |a|
      next unless a.text == "[PDF]"

      dochash[:url] = a["href"]
      # Escape before interpolating into a shell command — the href
      # comes from a scraped page and must not be trusted raw
      # (previously this was a shell-injection vector).
      `wget -P public/uploads #{Shellwords.escape(dochash[:url])}`
      path = dochash[:url].split("/")
      dochash[:path] = "public/uploads/" + path[path.length - 1].chomp.strip
    end

    # Get date and title
    dochash[:doc_date] = l.css("span.date-display-single").text
    dochash[:title] = l.css("a")[1].text

    # No "[PDF]" link means nothing was downloaded; skip conversion
    # (previously UploadConvert raised on the nil path and the bare
    # rescue silently discarded the entry).
    next unless dochash[:path]

    # Extraction stays best-effort — a failed conversion skips the
    # entry — but the error is now reported instead of swallowed.
    begin
      u = UploadConvert.new(dochash[:path])
      u.extractMetadataPDF.each { |k, v| dochash[k] = v }
      dochash[:text] = u.detectPDFType
      @casearray.push(dochash)
    rescue StandardError => e
      warn "scrapePage: skipping #{dochash[:url]}: #{e.message}"
    end
  end
end