class RayyanScrapers::Hercules
Public Class Methods
new(logger, hydra_options = {}, moneta_options = nil)
click to toggle source
# File lib/support/hercules.rb, line 6
# Sets up the request runner: a Typhoeus hydra, a logger, a Moneta cache and
# the counters used to track queued/finished requests.
#
# logger::         receives debug/warn/error messages; a DummyLogger is
#                  created when this is nil or false.
# hydra_options::  handed unchanged to Typhoeus::Hydra.new.
# moneta_options:: when truthy, splatted into Moneta.new to build the cache;
#                  otherwise Moneta.new(:Null) is used.
def initialize(logger, hydra_options = {}, moneta_options = nil)
  @hydra = Typhoeus::Hydra.new(hydra_options)
  @logger = logger || DummyLogger.new

  # Bookkeeping for in-flight work and the kill handshake.
  @pending_requests = 0
  @done_requests = 0
  @killed = false
  @after_kill = nil

  # Queue length above which strike runs the hydra right away.
  @max_hydra_queue_length = 200

  @cache =
    if moneta_options
      Moneta.new(*moneta_options)
    else
      Moneta.new(:Null)
    end
end
Public Instance Methods
check_killed()
click to toggle source
# File lib/support/hercules.rb, line 80
# Logs the current request counters and, once a kill has been requested and
# no requests remain pending, invokes the stored after-kill callback with the
# number of completed requests.
#
# Returns the callback's result when it is invoked, nil otherwise.
def check_killed
  @logger.debug "+-+-+- pending_requests: #{@pending_requests}, done_requests: #{@done_requests}"

  # The callback fires only when all three hold: killed, nothing pending,
  # and a callback was actually registered.
  if @killed && @pending_requests <= 0 && @after_kill
    @after_kill.call(@done_requests)
  end
end
fight(heads) { |item| ... }
click to toggle source
hydra_run
# File lib/support/hercules.rb, line 18
# Yields every element of +heads+ to the caller's block, then runs the hydra.
#
# heads:: any object responding to #each.
#
# Returns whatever @hydra.run returns.
def fight(heads)
  heads.each { |head| yield head }
  @hydra.run
end
kill(&after_kill)
click to toggle source
# File lib/support/hercules.rb, line 74
# Marks this instance as killed, remembers the optional block as the
# after-kill callback, and immediately runs check_killed.
#
# Returns whatever check_killed returns.
def kill(&after_kill)
  @killed, @after_kill = true, after_kill
  check_killed
end
strike(link, cache_key = nil, yield_exception = false) { |request, response| ... }
click to toggle source
hydra_queue
# File lib/support/hercules.rb, line 26
# Queues a Typhoeus request for +link+ on the hydra and yields the outcome to
# the caller's block as (request, payload).
#
# link::            URL handed to Typhoeus::Request.new.
# cache_key::       when given, @cache is consulted first; on a hit the block
#                   is yielded (request, cached value) immediately and nothing
#                   is queued. Bodies of successful responses are stored under
#                   this key.
# yield_exception:: when truthy, failures are yielded as (request, Exception);
#                   otherwise they are only logged.
#
# A response counts as successful when response.success? is true or its status
# code lies in 200..299. Exceptions raised by the block while handling a
# successful body are logged as warnings and swallowed.
#
# Returns nil on a cache hit; otherwise the result of the final queue-length
# check (nil unless the hydra was run).
def strike(link, cache_key = nil, yield_exception = false)
  request = Typhoeus::Request.new(link, :followlocation => true,
                                        headers: {"User-Agent"=>"Mozilla/5.0"})

  if cache_key
    # look for cached version
    cached = @cache[cache_key]
    unless cached.nil?
      @logger.debug "Cache hit: #{cache_key}"
      yield request, cached
      return
    end
  end

  request.on_complete do |response|
    failure =
      if response.code == 0
        # Could not get an http response, something's wrong.
        "ERROR: Unknown error (#{response}) while requesting #{link}"
      elsif response.timed_out?
        "ERROR: Timed out while requesting #{link}"
      elsif !(response.success? || response.code.between?(200, 299))
        # Received a non-successful http response. The explicit range check
        # replaces the former `code - 200 < 100` test, which also let 1xx
        # status codes through as successes.
        "ERROR: HTTP request failed: #{response.code} while requesting #{link}"
      end

    if failure
      @logger.error failure
      yield request, Exception.new(failure) if yield_exception
    else
      begin
        @cache[cache_key] = response.body if cache_key
        yield request, response.body
      rescue => e
        @logger.warn "WARNING: Exception while processing response for #{link}"
        @logger.warn e
      end
    end

    @done_requests += 1
    @pending_requests -= 1
    check_killed
  end

  @pending_requests += 1
  @hydra.queue(request)
  @logger.debug "++++ Hydra has #{@hydra.queued_requests.length} queued requests"

  # prevent queue from growing too big, thus delaying hydra.run too much
  @hydra.run if @hydra.queued_requests.length > @max_hydra_queue_length
end