class Scruber::FetcherAdapters::AbstractAdapter

Attributes

followlocation[RW]
max_concurrency[RW]
max_retry_times[RW]
options[RW]
request_timeout[RW]
retry_delays[RW]

Public Class Methods

new(options={}) click to toggle source
# File lib/scruber/fetcher_adapters/abstract_adapter.rb, line 13
# Builds the adapter from an options hash. The raw hash is kept in
# @options and every recognised setting is copied into its own
# instance variable, falling back to a built-in default when absent.
#
# @param options [Hash] recognised keys and their defaults:
#   :max_concurrency (1), :max_retry_times (5),
#   :retry_delays ([1,2,2,4,4]), :followlocation (false),
#   :request_timeout (15)
def initialize(options={})
  @options         = options
  @max_concurrency = options.fetch(:max_concurrency, 1)
  @max_retry_times = options.fetch(:max_retry_times, 5)
  @retry_delays    = options.fetch(:retry_delays, [1, 2, 2, 4, 4])
  @followlocation  = options.fetch(:followlocation, false)
  @request_timeout = options.fetch(:request_timeout, 15)
end

Public Instance Methods

after_request_callback(page) click to toggle source
# File lib/scruber/fetcher_adapters/abstract_adapter.rb, line 30
# Post-processes a page once its request has finished.
#
# * Response judged bad by +bad_response?+: schedules the next attempt via
#   +determine_retry_at+, increments the retry counter, fills in the
#   adapter-wide retry limit when the page has none, and pushes +retry_at+
#   one year ahead once the counter reaches the limit.
# * 404 response: pins +max_retry_times+ to the (at least 1) retry count.
# * Any other response: stamps +fetched_at+ with the current time.
#
# Afterwards the response headers and body are passed through
# +convert_to_utf8+.
#
# @param page [Object] page exposing response_code, retry_at, retry_count,
#   max_retry_times, fetched_at, response_headers and response_body
# @return [Object] the same +page+, mutated in place
def after_request_callback(page)
  if bad_response?(page)
    # The 404 branch below already tolerates a missing counter; do the same
    # here so a page without one does not crash on `nil + 1`.
    page.retry_count ||= 0
    # The delay is picked from the count *before* it is incremented.
    page.retry_at = determine_retry_at(page)
    page.retry_count += 1
    page.max_retry_times = @max_retry_times if page.max_retry_times.nil?
    if page.max_retry_times && page.retry_count >= page.max_retry_times.to_i
      page.retry_at = 1.year.from_now.to_i
    end
  elsif page.response_code == 404
    # Monkey patch to prevent redownloading of 404 pages
    # and processing 404 pages by regular parsers
    page.retry_count = 1 if page.retry_count.nil? || page.retry_count.zero?
    page.max_retry_times = page.retry_count
  else
    page.fetched_at = Time.now.to_i
  end

  if page.response_headers
    # Header names have '.' replaced by '_'.
    # NOTE(review): presumably the storage backend rejects dots in keys - confirm.
    # `to_s` lets Symbol header names through instead of raising on #gsub.
    page.response_headers = page.response_headers.each_with_object({}) do |(name, value), normalized|
      normalized[name.to_s.tr('.', '_')] =
        value.is_a?(Array) ? value.map { |item| convert_to_utf8(item) } : convert_to_utf8(value)
    end
  end
  page.response_body = convert_to_utf8(page.response_body)
  page
end
bad_response?(page) click to toggle source
# File lib/scruber/fetcher_adapters/abstract_adapter.rb, line 117
# Decides whether the response stored on +page+ counts as a failure.
#
# * 0..1     -> bad
# * 200..299 -> good
# * 300..399 -> bad only when redirect following is enabled
#   (NOTE(review): presumably because a followed redirect should never
#   surface as the final status - confirm)
# * 404      -> good
# * 407      -> raises
# * anything else, including a nil code -> bad
#
# @param page [#response_code]
# @return [Boolean]
# @raise [RuntimeError] with message "RejectedByProxy" on a 407 response
def bad_response?(page)
  case page.response_code
  when 0..1
    true
  when 200..299
    false
  when 300..399
    # Prefer the writable `followlocation` attribute so changes made after
    # construction are honoured. Fall back to the raw options only when the
    # attribute was never set.
    @followlocation.nil? ? @options.fetch(:followlocation) { false } : @followlocation
  when 404
    false
  when 407
    raise "RejectedByProxy"
  else
    true
  end
end
before_request_callback(page) click to toggle source
# File lib/scruber/fetcher_adapters/abstract_adapter.rb, line 26
# Hook that, in this base implementation, does nothing and hands the page
# back untouched.
# NOTE(review): presumably invoked before a page is requested and meant to
# be overridden by concrete adapters - confirm against callers/subclasses.
#
# @param page [Object] the page object
# @return [Object] the same +page+
def before_request_callback(page)
  page
end
convert_to_utf8(text) click to toggle source
# File lib/scruber/fetcher_adapters/abstract_adapter.rb, line 57
# Best-effort transcoding of +text+ to UTF-8.
#
# Blank input (nil, "" or anything whose #to_s is empty) is returned as is.
# Otherwise the encoding is guessed with CharlockHolmes and, when a guess
# is available, the text is converted from it. Any error raised during
# conversion is swallowed and the original text is returned.
#
# @param text [String, nil]
# @return [String, nil] converted text, or +text+ itself when nothing
#   could be done
def convert_to_utf8(text)
  return text if text.to_s.empty?

  guess = CharlockHolmes::EncodingDetector.detect(text)
  return text unless guess && guess[:encoding].present?

  begin
    CharlockHolmes::Converter.convert(text, guess[:encoding], 'UTF-8')
  rescue StandardError
    # Deliberate best effort: keep the untouched input on failure.
    text
  end
end
determine_retry_at(page) click to toggle source
# File lib/scruber/fetcher_adapters/abstract_adapter.rb, line 112
# Computes the Unix timestamp at which +page+ should be retried.
#
# The delay (in seconds) is looked up in @retry_delays by the page's
# current retry count. Counts past the end of the list reuse the last
# delay. A missing retry count is treated as 0, and a missing or empty
# delay list yields a delay of 0 instead of raising.
#
# @param page [#retry_count]
# @return [Integer] seconds since the epoch
def determine_retry_at(page)
  delays = Array(@retry_delays)
  attempt = page.retry_count.to_i
  delay = delays[attempt] || delays.last || 0
  Time.now.to_i + delay
end
headers_for(page) click to toggle source
# File lib/scruber/fetcher_adapters/abstract_adapter.rb, line 68
# Builds the request headers for +page+.
#
# The base headers come from the page's fetcher agent when it has one,
# otherwise from the page itself. Anything that is not a Hash is replaced
# by an empty hash. "User-Agent" is always set from +user_agent_for+ and
# "Cookie" is set when +cookie_for+ returns a value.
#
# The base hash is copied before being modified. Writing into the agent's
# or page's own hash would let a Cookie set for one page leak to a later
# page that has none.
#
# @param page [Object] page exposing fetcher_agent and headers
# @return [Hash] a fresh headers hash
def headers_for(page)
  base = page.fetcher_agent ? page.fetcher_agent.headers : page.headers
  headers = base.is_a?(Hash) ? base.dup : {}
  headers["User-Agent"] = user_agent_for(page)
  cookie = cookie_for(page)
  headers["Cookie"] = cookie if cookie
  headers
end
proxy_for(page) click to toggle source
# File lib/scruber/fetcher_adapters/abstract_adapter.rb, line 102
# Picks the proxy to use for +page+, in order of precedence:
# the page's own proxy, then its fetcher agent's proxy, then the next
# value from Scruber::Helpers::ProxyRotator.
#
# @param page [Object] page exposing proxy and fetcher_agent
# @return [Object] whichever proxy value was selected
def proxy_for(page)
  return page.proxy if page.proxy

  agent = page.fetcher_agent
  return agent.proxy if agent && agent.proxy

  Scruber::Helpers::ProxyRotator.next
end
run(queue) click to toggle source
# File lib/scruber/fetcher_adapters/abstract_adapter.rb, line 22
# Not implemented at this level: every call raises.
# NOTE(review): presumably concrete adapters override this to process the
# given queue - confirm against subclasses.
#
# @param queue [Object] unused by this implementation
# @raise [NotImplementedError] always
def run(queue)
  raise NotImplementedError
end
user_agent_for(page) click to toggle source
# File lib/scruber/fetcher_adapters/abstract_adapter.rb, line 92
# Picks the User-Agent string for +page+, in order of precedence:
# the page's own user agent, then its fetcher agent's user agent, then
# the next value from Scruber::Helpers::UserAgentRotator.
#
# @param page [Object] page exposing user_agent and fetcher_agent
# @return [Object] whichever user agent value was selected
def user_agent_for(page)
  page.user_agent ||
    (page.fetcher_agent && page.fetcher_agent.user_agent) ||
    Scruber::Helpers::UserAgentRotator.next
end