class RayyanScrapers::PubMedHealthScraper
Public Class Methods
max_pages_to_scrape()
click to toggle source
# File lib/rayyan-scrapers/pub_med_health_scraper.rb, line 24
# Upper bound on the number of list pages this scraper visits.
#
# create_search_url turns page numbers into letters starting at "a", so
# this limit amounts to one list page per letter "a".."z".
#
# @return [Integer] always 26
def self.max_pages_to_scrape
  ('a'..'z').count
end
new()
click to toggle source
Calls superclass method
# File lib/rayyan-scrapers/pub_med_health_scraper.rb, line 3
# Runs the superclass initializer, then records the endpoints used by this
# scraper:
#
# * @base_url          - root of the PubMed Health site
# * @detail_url        - prefix for detail pages (the same string as @base_url)
# * @search_url        - prefix for the list pages
# * @pubmed_detail_url - efetch URL (db=pubmed, retmode=xml); an id parameter
#                        is appended by callers
def initialize
  super

  pubmed_health_root = 'http://www.ncbi.nlm.nih.gov/pubmedhealth'
  @base_url = pubmed_health_root
  @detail_url = pubmed_health_root
  @search_url = "#{pubmed_health_root}/s/full_text_reviews_medrev"
  @pubmed_detail_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml"

  # @logger is not assigned here; presumably the superclass sets it up.
  @logger.debug "PubMedHealth scraper initialized"
end
Public Instance Methods
create_search_url(page)
click to toggle source
# File lib/rayyan-scrapers/pub_med_health_scraper.rb, line 13
# Builds the URL of a list page. Pages are addressed by letter: page 1 maps
# to "a", page 2 to "b", and so on.
#
# @param page [Integer] 1-based page number
# @return [String] @search_url followed by "/" and the page's letter
def create_search_url(page)
  letter = ('a'.ord + (page - 1)).chr
  "#{@search_url}/#{letter}"
end
extract_pmh_abstracts(doc, mArticle)
click to toggle source
# File lib/rayyan-scrapers/pub_med_health_scraper.rb, line 71
# Adds one abstract to +mArticle+ for every labelled section found in +doc+.
#
# A section is a div with a non-empty id inside the
# "body-content whole_rhythm" container. Its h4 text becomes the label
# (with colons removed), its id the category and its p text the content.
# Sections for which either text lookup returns nil are skipped.
#
# @param doc [Object] parsed detail page supporting the "/" search operator
# @param mArticle [Object] article whose +abstracts+ collection receives the new entries
def extract_pmh_abstracts(doc, mArticle)
  sections = doc / '//div[@class="body-content whole_rhythm"]/div[@id!=""]'

  sections.each do |section|
    #@logger.debug "Found abstract: #{section} with h4: #{section.at('./h4')} and text: #{section.text}"
    heading = ScraperBase.node_text(section, './h4')
    body = ScraperBase.node_text(section, './p')
    next if heading.nil? || body.nil?

    mArticle.abstracts.build :label => heading.gsub(/:/, ''),
                             :category => section['id'],
                             :content => body
  end
end
get_start_page()
click to toggle source
# File lib/rayyan-scrapers/pub_med_health_scraper.rb, line 18
# Fetches the first list page (page 1, i.e. letter "a") and returns it as a
# parsed HTML document whose URL is the escaped list-page URL.
#
# @return [Object] result of Nokogiri::HTML.parse for the fetched body
def get_start_page
  url = create_search_url(1)
  page = @agent.get url
  # URI.escape was removed in Ruby 3.0; the default parser's #escape performs
  # the same escaping and is available on older Rubies as well.
  Nokogiri::HTML.parse(page.body, URI::DEFAULT_PARSER.escape(url))
end
process_list_page(page) { |article, true| ... }
click to toggle source
# File lib/rayyan-scrapers/pub_med_health_scraper.rb, line 28
# Walks one search-result list page and yields every article it references.
#
# +page+ may be an already parsed document (it answers +url+) or a fetched
# page exposing +body+ and +uri+, in which case it is parsed first.
#
# For each result link that contains a PMH id, the article is looked up by
# its detail URL. Unknown articles are fetched and built through
# process_pmh_detail_page and yielded with +true+; known ones are yielded
# with +false+. An error raised while handling a single result is logged and
# the remaining results are still processed. Links without a PMH id are
# logged and skipped.
#
# @param page [Object] list page, parsed or raw
# @yield [article, is_new] the article and whether it was unknown before this call
# @return [Boolean] true when at least one result had no stored article
def process_list_page(page)
  begin
    page.url # succeeds for parsed html
  rescue
    page = Nokogiri::HTML.parse(page.body, page.uri.to_s)
  end
  @logger.info("Processing list page with URL: #{page.url}")

  new_items_found = false # don't stop before max_pages
  items = page / '//ul[@class="resultList"]/li/a'

  items.each do |anchor|
    href = anchor['href']
    pmhid = href.to_s[/PMH\d+/]

    if pmhid.nil?
      # An unmatched link used to raise NoMethodError here, outside the
      # per-item rescue, which aborted the whole page.
      @logger.warn "Skipping result without a PMH id: #{href.inspect}"
    else
      link = "#{@detail_url}/#{pmhid}"
      title = anchor.text
      @logger.info "Got result with id #{pmhid} and link #{link} with title #{title}"

      # get detailed info
      begin
        article = Article.find_by_url(link)
        if article.nil?
          new_items_found = true
          article = process_pmh_detail_page(@agent.get(link), pmhid, title, link)
          yield article, true
        else
          yield article, false
        end
      rescue => exception
        @logger.error "Error processing #{link}:"
        @logger.error exception
        @logger.error exception.backtrace.join("\n")
      end
    end

    @curr_property += 1
  end

  new_items_found
end
process_pmh_detail_page(page, pmhid, title, link)
click to toggle source
# File lib/rayyan-scrapers/pub_med_health_scraper.rb, line 84
# Builds, saves and returns the article described by a PubMed Health detail
# page.
#
# When the page links to a PubMed record, the record is fetched from
# @pubmed_detail_url and the article is built by process_detail_page.
# Otherwise the article is built from the detail page itself: source, sid
# and title are set, the "Book" publication type is attached and abstracts
# are extracted via extract_pmh_abstracts.
#
# In both cases the article's url is set to +link+ before saving. A failed
# save is logged; the article is returned either way.
#
# @param page [Object] fetched detail page (exposes +body+ and +uri+)
# @param pmhid [String] PubMed Health id, used as sid and in log messages
# @param title [String] title taken from the list page
# @param link [String] detail page URL stored on the article
# @return [Object] the article, whether or not saving succeeded
def process_pmh_detail_page(page, pmhid, title, link)
  pmid = ScraperBase.node_text(page, '//a[@title="PubMed record of this title"]')

  if pmid.blank?
    @logger.warn "No PMID record for #{pmhid}"
    doc = Nokogiri::HTML.parse(page.body, page.uri.to_s)
    article = Article.new
    article.source = @source
    article.sid = pmhid
    article.title = title
    article.publication_types << PublicationType.where(name: "Book").first_or_create
    extract_pmh_abstracts doc, article
  else
    pmlink = "#{@pubmed_detail_url}&id=#{pmid}"
    @logger.debug "Now processing PMID #{pmid} with url #{pmlink}"
    article = process_detail_page(@agent.get(pmlink), pmid)
  end

  article.url = link
  # The result of save used to be discarded, so a rejected record went
  # unnoticed. Assumes save returns a falsy value on failure (ActiveRecord
  # convention) -- TODO confirm against Article.
  @logger.error "Could not save article #{pmhid} (#{link})" unless article.save
  article
end