class RayyanScrapers::ScraperBase

Constants

DEFAULT_MAX_PARALLEL_ARTICLES
DEFAULT_MAX_PARALLEL_REFPAGES

Public Class Methods

max_pages_to_scrape() click to toggle source

Functions to override in subclasses.

# File lib/rayyan-scrapers/scraper_base.rb, line 85
# Upper bound on the number of list pages to scrape.
#
# The base value is 0; +enough_pages+ only enforces the bound when it is
# greater than zero, so 0 acts as "no limit" there.
#
# @return [Integer] 0 in this base implementation
def self.max_pages_to_scrape
  0
end
max_parallel_articles() click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 75
# Concurrency setting for article requests.
#
# Reads the SCRAPERS_MAX_PARALLEL_ARTICLES environment variable and falls
# back to DEFAULT_MAX_PARALLEL_ARTICLES when the variable is not set. The
# chosen value is coerced with +to_i+.
#
# @return [Integer]
def self.max_parallel_articles
  configured = ENV.fetch('SCRAPERS_MAX_PARALLEL_ARTICLES') { DEFAULT_MAX_PARALLEL_ARTICLES }
  configured.to_i
end
max_parallel_refpages() click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 79
# Concurrency setting for reference-page requests.
#
# Reads the SCRAPERS_MAX_PARALLEL_REFPAGES environment variable and falls
# back to DEFAULT_MAX_PARALLEL_REFPAGES when the variable is not set. The
# chosen value is coerced with +to_i+.
#
# @return [Integer]
def self.max_parallel_refpages
  configured = ENV.fetch('SCRAPERS_MAX_PARALLEL_REFPAGES') { DEFAULT_MAX_PARALLEL_REFPAGES }
  configured.to_i
end
max_results() click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 86
# Maximum number of results, computed as +max_pages_to_scrape+ multiplied by
# +results_per_page+.
#
# @return [Object] the product of the two class-level settings
def self.max_results
  page_limit = max_pages_to_scrape
  page_limit * results_per_page
end
new(logger = nil, moneta_options = nil) click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 12
# Sets up the scraper's logger, concurrency settings, the two Hercules
# request runners (one for articles, one for reference pages) and the
# default request options.
#
# @param logger [Object, nil] logger to use; a DummyLogger is created when nil
# @param moneta_options [Object, nil] passed through unchanged as the third
#   argument of each Hercules constructor
def initialize(logger = nil, moneta_options = nil)
  @site_id = 'UNKNOWN'
  # During capybara tests (stubbed Typhoeus, no cache), hydra stops processing
  # the queue on reaching @max_parallel_articles!
  scraper_class = self.class
  @max_parallel_articles = scraper_class.max_parallel_articles
  @max_parallel_refpages = scraper_class.max_parallel_refpages

  @logger = logger || DummyLogger.new

  @hercules_articles = Hercules.new(@logger, { max_concurrency: @max_parallel_articles }, moneta_options)
  @hercules_refpages = Hercules.new(@logger, { max_concurrency: @max_parallel_refpages }, moneta_options)

  # Request options handed to Typhoeus when fetching list pages.
  @headers = { headers: { "User-Agent" => "Mozilla/5.0" } }
end
node_html(page, xpath) click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 70
# Looks up +xpath+ on +page+ via +page.at+ and returns the match's inner HTML
# with surrounding whitespace stripped.
#
# @param page [#at] document-like object (presumably a parsed HTML page)
# @param xpath [String] selector passed straight to +page.at+
# @return [String, nil] stripped inner HTML, or nil when +page.at+ finds nothing
def self.node_html(page, xpath)
  node = page.at(xpath)
  return unless node

  node.inner_html.strip
end
node_text(page, xpath) click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 65
# Looks up +xpath+ on +page+ via +page.at+ and returns the match's text with
# surrounding whitespace stripped.
#
# @param page [#at] document-like object (presumably a parsed HTML page)
# @param xpath [String] selector passed straight to +page.at+
# @return [String, nil] stripped text, or nil when +page.at+ finds nothing
def self.node_text(page, xpath)
  node = page.at(xpath)
  return unless node

  node.text.strip
end
results_per_page() click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 89
# Number of results on one list page. The base implementation has no value
# to offer and always raises.
#
# @raise [RuntimeError] always, with the message 'Not implemented'
def self.results_per_page
  raise 'Not implemented'
end

Public Instance Methods

enough_pages(page_id) click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 61
# Tells whether +page_id+ lies beyond the class-level page limit.
#
# A limit of zero (or less) disables the check, so the result is false.
#
# @param page_id [Integer] page number to test against the limit
# @return [Boolean] true when a positive limit exists and +page_id+ exceeds it
def enough_pages(page_id)
  limit = self.class.max_pages_to_scrape
  limit > 0 && page_id > limit
end
get_detail() click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 93
# Placeholder in the base class; always raises.
#
# @raise [RuntimeError] always, with the message 'Not implemented'
def get_detail
  raise 'Not implemented'
end
get_start_page() click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 90
# Placeholder in the base class; always raises. +scrape+ calls this to
# obtain the first list page.
#
# @raise [RuntimeError] always, with the message 'Not implemented'
def get_start_page
  raise 'Not implemented'
end
iterate_list_pages(page, &block) click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 45
# Walks through list pages starting at +page+, handing each one to
# +process_list_page+ together with the caller's block.
#
# The walk ends when a page yields zero items, when +enough_pages+ reports
# that the next page number is past the limit, or when
# +get_next_page_link+ returns no URL.
#
# @param page [Object, nil] first list page; nil means there is nothing to do
# @param block [Proc] forwarded to +process_list_page+
# @return [nil]
def iterate_list_pages(page, &block)
  page_id = 1
  until page.nil?
    @logger.info "Processing page #{page_id}..."
    items_count = process_list_page(page, &block)
    # From here on page_id is the number of the *next* page.
    page_id += 1
    if items_count == 0 || enough_pages(page_id)
      @logger.info "Stopping at this page"
      break
    end

    next_url = get_next_page_link(page, page_id)
    # A missing link leaves page nil, which ends the loop.
    page = (Typhoeus::Request.get(next_url, @headers) if next_url)
  end
end
process_detail_page() click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 94
# Placeholder in the base class; always raises.
#
# @raise [RuntimeError] always, with the message 'Not implemented'
def process_detail_page
  raise 'Not implemented'
end
process_list_page(page) click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 92
# Placeholder in the base class; always raises. +iterate_list_pages+ calls
# this with a block and compares the return value against 0 as an item count.
#
# @param page [Object] list page to process (unused here)
# @raise [RuntimeError] always, with the message 'Not implemented'
def process_list_page(page)
  raise 'Not implemented'
end
scrape() { |item, total| ... } click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 26
# Runs a full scrape: fetches the start page, iterates over the list pages,
# then shuts down both Hercules runners and logs a throughput summary.
#
# @yield [item, total] each item produced while iterating list pages, along
#   with the value returned by +total_pages+ for the start page
# @return [Object] whatever the outer +@hercules_refpages.kill+ call returns
def scrape
  @logger.info "Scraping as #{self.class.name}"
  started_at = Time.now

  start_page = get_start_page
  total = total_pages(start_page)
  @logger.info "Total results: #{total}"
  iterate_list_pages(start_page) do |item|
    yield item, total if block_given?
  end

  # Kill the reference-page runner first, then the article runner; the
  # summary is logged once both have reported their finished request counts.
  @hercules_refpages.kill do |refpage_requests|
    @hercules_articles.kill do |article_requests|
      @logger.info "hercules_articles killed hydra"
      elapsed = Time.now - started_at
      finished = refpage_requests + article_requests
      @logger.info "FINISHED #{finished} requests in #{elapsed.round} seconds (#{finished/elapsed} r/s)"
    end
  end
end
total_pages(page) click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 87
# Total reported for the scrape; +scrape+ logs it and passes it to the
# caller's block. The base implementation has no way to know it.
#
# @param page [Object] start page (unused here)
# @return [String] the literal 'Unknown'
def total_pages(page)
  'Unknown'
end