class Rubyscholar::Parser

Attributes

crossRefEmail[RW]
parsedPapers[RW]

Public Class Methods

new(url, crossRefEmail = "") click to toggle source
# File lib/rubyscholar-main.rb, line 21
# Creates a parser and immediately scrapes the given page.
#
# url           - address of the page handed to #parse
# crossRefEmail - email passed on to #getDoi for DOI lookups; when nil,
#                 no DOI is returned
def initialize(url, crossRefEmail = "")
  @crossRefEmail = crossRefEmail
  @parsedPapers  = []
  parse(url)
end

Public Instance Methods

getDoi(lastNameFirstAuthor, title, crossRefEmail) click to toggle source

Google Scholar doesn’t provide DOIs. But if you are registered at CrossRef (it’s free), the DOI can be retrieved.

# File lib/rubyscholar-main.rb, line 64
# Looks up the DOI of a paper through the CrossRef OpenURL service.
#
# lastNameFirstAuthor - last name of the paper's first author
# title               - title of the paper
# crossRefEmail       - email sent to CrossRef as the `pid` parameter;
#                       nil or empty skips the lookup entirely
#
# Returns the DOI as a String, or '' when the lookup is skipped or the
# response contains no <doi> content. Errors raised while fetching the
# URL are not rescued and propagate to the caller.
def getDoi(lastNameFirstAuthor, title, crossRefEmail)
  # Guard on the argument that is actually interpolated into the query;
  # without an email there is nothing useful to ask CrossRef.
  return '' if crossRefEmail.to_s.empty?
  sleep(1) # to reduce risk (presumably of being throttled by CrossRef)
  STDERR << "Getting DOI for paper by #{lastNameFirstAuthor}: #{title}.\n"
  # Every value is percent-encoded; URI.escape no longer exists in Ruby 3.
  url = 'http://www.crossref.org/openurl?redirect=false' +
    '&pid='    + URI.encode_www_form_component(crossRefEmail) +
    '&aulast=' + URI.encode_www_form_component(lastNameFirstAuthor) +
    '&atitle=' + URI.encode_www_form_component(title)
  # Block form closes the response handle once the XML has been parsed.
  crossRefXML = URI.open(url) { |io| Nokogiri::XML(io) }
  doiNode = crossRefXML.search("doi").children.first
  doiNode ? doiNode.content : ''
end
parse(url) click to toggle source
# File lib/rubyscholar-main.rb, line 27
# Scrapes a Google Scholar page and appends one Paper per publication
# row (".gsc_a_tr") to @parsedPapers.
#
# url - address of the page to fetch
#
# Progress messages are written to STDERR. Field extraction is best
# effort: a field that cannot be located in a row falls back to ''.
# Errors raised while fetching the page propagate to the caller.
def parse(url)
  STDERR << "Will check #{url}.\n"
  userAgent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2'
  # URI.open instead of Kernel#open: the latter no longer opens URLs on
  # Ruby 3.0+. The block form closes the response handle after parsing.
  page = URI.open(url, 'User-Agent' => userAgent) { |io| Nokogiri::HTML(io, nil, 'utf-8') }
  papers = page.css(".gsc_a_tr")
  STDERR << "Found #{papers.length} papers.\n"
  papers.each do |paper|
    title          = paper.css(".gsc_a_at").text rescue ''
    title.gsub!(/\.$/, '') # drop a trailing period

    # The first cell holds, in order: title link, authors, journal line.
    # NOTE(review): `clean` is a String helper defined elsewhere in the project.
    googleUrl      = paper.children[0].children[0].attribute('href').text rescue ''
    authors        = paper.children[0].children[1].text.clean rescue ''
    authors.gsub!("...", "et al")

    journal        = paper.children[0].children[2].text rescue ''
    journalName    = journal.split(/,|\d/).first.clean  rescue ''
    journalDetails = journal.gsub(journalName, '').clean
    # A trailing ", <digits>" on the journal line is taken as the year.
    year           = journalDetails.match(/, \d+$/)[0]  rescue ''
    journalDetails = journalDetails.gsub(year, '').clean
    year           = year.clean

    #citations
    citeInfo      = paper.css('.gsc_a_ac')
    citationCount = citeInfo.text
    citationUrl   = citationCount.empty?  ? nil : citeInfo.attribute('href').to_s

    # get DOI: needs last name of first author, no funny chars.
    # to_s guards against an empty authors cell, where split yields no
    # elements and first/last would return nil.
    firstAuthor         = authors.split(',').first.to_s
    lastNameFirstAuthor = firstAuthor.split(' ').last.to_s.gsub(/[^A-Za-z\-]/, '')
    doi                 = getDoi( lastNameFirstAuthor, title, @crossRefEmail)

    @parsedPapers.push(Paper.new( title, googleUrl, authors, journalName, journalDetails, year, citationCount, citationUrl, doi))
  end
  STDERR << "Scraped #{parsedPapers.length} from Google Scholar.\n"
end