class RayyanScrapers::ScraperBase
Constants
- DEFAULT_MAX_PARALLEL_ARTICLES
- DEFAULT_MAX_PARALLEL_REFPAGES
Public Class Methods
max_pages_to_scrape()
click to toggle source
functions to override in subclasses
# File lib/rayyan-scrapers/scraper_base.rb, line 85
# Maximum number of list pages to scrape; meant to be overridden in subclasses.
#
# @return [Integer] 0 in this base class (enough_pages only enforces limits
#   that are greater than zero)
def self.max_pages_to_scrape
  0
end
max_parallel_articles()
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 75
# Concurrency setting for article requests.
#
# Reads the SCRAPERS_MAX_PARALLEL_ARTICLES environment variable and falls back
# to DEFAULT_MAX_PARALLEL_ARTICLES when the variable is not set.
#
# @return [Integer] the chosen value coerced with +to_i+ (non-numeric text yields 0)
def self.max_parallel_articles
  # Block form keeps the default lazy: the constant is only read when needed.
  configured = ENV.fetch('SCRAPERS_MAX_PARALLEL_ARTICLES') { DEFAULT_MAX_PARALLEL_ARTICLES }
  configured.to_i
end
max_parallel_refpages()
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 79
# Concurrency setting for reference-page requests.
#
# Reads the SCRAPERS_MAX_PARALLEL_REFPAGES environment variable and falls back
# to DEFAULT_MAX_PARALLEL_REFPAGES when the variable is not set.
#
# @return [Integer] the chosen value coerced with +to_i+ (non-numeric text yields 0)
def self.max_parallel_refpages
  # Block form keeps the default lazy: the constant is only read when needed.
  configured = ENV.fetch('SCRAPERS_MAX_PARALLEL_REFPAGES') { DEFAULT_MAX_PARALLEL_REFPAGES }
  configured.to_i
end
max_results()
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 86
# Upper bound on scraped results: page limit multiplied by results per page.
#
# @return [Integer] max_pages_to_scrape * results_per_page
def self.max_results
  page_limit = max_pages_to_scrape
  page_limit * results_per_page
end
new(logger = nil, moneta_options = nil)
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 12
# Builds a scraper with its logger, two Hercules request queues and default
# request headers.
#
# @param logger [#info, nil] logger to use; a DummyLogger is created when nil
# @param moneta_options [Object, nil] passed through unchanged to both Hercules instances
def initialize(logger = nil, moneta_options = nil)
  @site_id = 'UNKNOWN'

  # During capybara tests (stubbed Typhoeus, no cache), hydra stops processing queue on reaching @max_parallel_articles!
  @max_parallel_articles = self.class.max_parallel_articles
  @max_parallel_refpages = self.class.max_parallel_refpages

  @logger = logger || DummyLogger.new

  # One queue per request kind, each with its own concurrency setting.
  @hercules_articles = Hercules.new(@logger, { max_concurrency: @max_parallel_articles }, moneta_options)
  @hercules_refpages = Hercules.new(@logger, { max_concurrency: @max_parallel_refpages }, moneta_options)

  @headers = { headers: { 'User-Agent' => 'Mozilla/5.0' } }
end
node_html(page, xpath)
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 70
# Inner HTML of the first node matching +xpath+, with surrounding whitespace removed.
#
# @param page [#at] document to search (presumably a Nokogiri document — confirm with callers)
# @param xpath [String] expression handed to +page.at+
# @return [String, nil] stripped inner HTML, or nil when nothing matches
def self.node_html(page, xpath)
  node = page.at(xpath)
  return unless node

  node.inner_html.strip
end
node_text(page, xpath)
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 65
# Text of the first node matching +xpath+, with surrounding whitespace removed.
#
# @param page [#at] document to search (presumably a Nokogiri document — confirm with callers)
# @param xpath [String] expression handed to +page.at+
# @return [String, nil] stripped text, or nil when nothing matches
def self.node_text(page, xpath)
  node = page.at(xpath)
  return unless node

  node.text.strip
end
results_per_page()
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 89
# Number of results on one list page; subclasses must override this.
#
# @raise [RuntimeError] always, in this base class
def self.results_per_page
  raise 'Not implemented'
end
Public Instance Methods
enough_pages(page_id)
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 61
# Tells whether +page_id+ is past the class-level page limit.
#
# A limit of zero (or less) means "no limit", so the answer is always false then.
#
# @param page_id [Integer] page number to check
# @return [Boolean] true when a positive limit exists and page_id exceeds it
def enough_pages(page_id)
  limit = self.class.max_pages_to_scrape
  limit > 0 && page_id > limit
end
get_detail()
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 93
# Placeholder to be overridden by subclasses.
#
# @raise [RuntimeError] always, in this base class
def get_detail
  raise 'Not implemented'
end
get_next_page_link(page, page_id)
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 91
# Placeholder to be overridden by subclasses. iterate_list_pages treats the
# result as the URL of the next list page, with a falsy value meaning "no more pages".
#
# @param page [Object] the list page just processed
# @param page_id [Integer] number of the page to be fetched next
# @raise [RuntimeError] always, in this base class
def get_next_page_link(page, page_id)
  raise 'Not implemented'
end
get_start_page()
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 90
# Placeholder to be overridden by subclasses. scrape uses the result as the
# first list page.
#
# @raise [RuntimeError] always, in this base class
def get_start_page
  raise 'Not implemented'
end
iterate_list_pages(page, &block)
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 45
# Walks list pages starting at +page+, handing each to process_list_page.
#
# Stops when a page yields zero items, when enough_pages reports the limit is
# reached, or when get_next_page_link returns no URL.
#
# @param page [Object, nil] first list page; nil means there is nothing to do
# @param block [Proc] forwarded unchanged to process_list_page
# @return [nil]
def iterate_list_pages(page, &block)
  page_id = 1
  until page.nil?
    @logger.info "Processing page #{page_id}..."
    items_count = process_list_page(page, &block)
    # Advance first: the limit check and the link lookup both refer to the NEXT page.
    page_id += 1

    if items_count != 0 && !enough_pages(page_id)
      next_url = get_next_page_link(page, page_id)
      page = (Typhoeus::Request.get(next_url, @headers) if next_url)
    else
      @logger.info "Stopping at this page"
      page = nil
    end
  end
end
process_detail_page()
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 94
# Placeholder to be overridden by subclasses.
#
# @raise [RuntimeError] always, in this base class
def process_detail_page
  raise 'Not implemented'
end
process_list_page(page)
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 92
# Placeholder to be overridden by subclasses. iterate_list_pages calls it with
# a block and compares the return value against zero as the page's item count.
#
# @param page [Object] list page to process
# @raise [RuntimeError] always, in this base class
def process_list_page(page)
  raise 'Not implemented'
end
scrape() { |item, total| ... }
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 26
# Runs a full scrape: fetches the start page, iterates over list pages, then
# shuts down both Hercules queues and logs throughput.
#
# Elapsed time is measured with the monotonic clock so that wall-clock
# adjustments (NTP, DST, manual changes) cannot produce negative or inflated
# durations and request rates.
#
# @yield [item, total] called for every item produced by iterate_list_pages
# @yieldparam item [Object] the scraped item
# @yieldparam total [Object] whatever total_pages returned for the start page
# @return [Object] the return value of @hercules_refpages.kill
def scrape
  @logger.info "Scraping as #{self.class.name}"
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  page = get_start_page
  total = total_pages(page)
  @logger.info "Total results: #{total}"

  iterate_list_pages(page) { |item| yield item, total if block_given? }

  # Kill refpages first; articles are killed inside its callback so that both
  # request counts are available when the summary is logged.
  @hercules_refpages.kill do |done_requests_refpages|
    @hercules_articles.kill do |done_requests_articles|
      @logger.info "hercules_articles killed hydra"
      tdiff = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
      done_requests = done_requests_refpages + done_requests_articles
      @logger.info "FINISHED #{done_requests} requests in #{tdiff.round} seconds (#{done_requests/tdiff} r/s)"
    end
  end
end
total_pages(page)
click to toggle source
# File lib/rayyan-scrapers/scraper_base.rb, line 87
# Total reported for a scrape; scrape logs it as "Total results" and passes it
# to the caller's block. Meant to be overridden in subclasses.
#
# @param page [Object] the start page (ignored in this base class)
# @return [String] the literal 'Unknown'
def total_pages(page)
  'Unknown'
end