class Polipus::HTTP

Constants

REDIRECT_LIMIT

Maximum number of redirects to follow on each get_response

RESCUABLE_ERRORS

Public Class Methods

new(opts = {}) click to toggle source
# File lib/polipus/http.rb, line 28
# Build a new HTTP client.
#
# opts - Hash of client options. It is stored as given and read back by the
#        option readers of this class.
def initialize(opts = {})
  # Connection cache and its per-connection hit counters both start empty.
  @connections, @connections_hits = {}, {}
  @opts = opts
end

Public Instance Methods

accept_cookies?() click to toggle source

Does this HTTP client accept cookies from the server?

# File lib/polipus/http.rb, line 143
# Tells whether this client was configured to accept cookies.
#
# Returns the raw value of the :accept_cookies option (nil when it is unset).
def accept_cookies?
  cookies_option = @opts[:accept_cookies]
  cookies_option
end
fetch_page(url, referer = nil, depth = nil) click to toggle source

Fetch a single Page from the response of an HTTP request to url. Just gets the final destination page.

# File lib/polipus/http.rb, line 38
# Fetch url and return only the last page produced by fetch_pages, i.e. the
# final destination once any redirects have been followed.
#
# url     - the address to request
# referer - optional referer, passed through to fetch_pages
# depth   - optional depth, passed through to fetch_pages
def fetch_page(url, referer = nil, depth = nil)
  pages = fetch_pages(url, referer, depth)
  pages.last
end
fetch_pages(url, referer = nil, depth = nil) click to toggle source

Create new Pages from the response of an HTTP request to url, including redirects

# File lib/polipus/http.rb, line 46
# Request url and build one Page per response received, redirects included.
#
# url     - the address to request (converted with URI())
# referer - optional referer recorded on every page
# depth   - optional depth recorded on every page
#
# Returns an Array of Page objects in the order the responses arrived. When
# one of RESCUABLE_ERRORS is raised, returns a single-element Array holding a
# Page that carries the error instead.
def fetch_pages(url, referer = nil, depth = nil)
  url = URI(url)
  collected = []
  get(url, referer) do |resp, status, page_url, target, elapsed_ms|
    # Decompress the body in place before it is copied into the page.
    handle_compression(resp)
    page = Page.new(page_url,
                    body: resp.body,
                    code: status,
                    headers: resp.to_hash,
                    referer: referer,
                    depth: depth,
                    redirect_to: target,
                    response_time: elapsed_ms,
                    fetched_at: Time.now.to_i)
    collected.push(page)
  end

  collected
rescue *RESCUABLE_ERRORS => error
  puts error.inspect, error.backtrace if verbose?

  [Page.new(url, error: error, referer: referer, depth: depth)]
end
open_timeout() click to toggle source

HTTP open timeout in seconds

# File lib/polipus/http.rb, line 137
# Connection-open timeout as configured through the :open_timeout option.
#
# Returns the option value untouched, or nil when it was not set.
def open_timeout
  seconds = @opts[:open_timeout]
  seconds
end
proxy_host() click to toggle source

The proxy address string

# File lib/polipus/http.rb, line 93
# Proxy address taken from the :proxy_host option.
#
# When the option responds to call it is invoked with this client and its
# result is returned; otherwise the option value itself is returned.
def proxy_host
  host = @opts[:proxy_host]
  return host unless host.respond_to?(:call)

  host.call(self)
end
proxy_host_port() click to toggle source

Shorthand to get all proxy info with a single call. It returns an array of ['addr', port, 'user', 'pass'].

# File lib/polipus/http.rb, line 123
# All proxy settings in one value, taken from the :proxy_host_port option.
#
# When the option responds to call it is invoked with this client and its
# result is returned; otherwise the option value itself is returned.
def proxy_host_port
  settings = @opts[:proxy_host_port]
  return settings unless settings.respond_to?(:call)

  settings.call(self)
end
proxy_pass() click to toggle source

The proxy password

# File lib/polipus/http.rb, line 114
# Proxy password taken from the :proxy_pass option.
#
# When the option responds to call it is invoked with this client and its
# result is returned; otherwise the option value itself is returned.
def proxy_pass
  password = @opts[:proxy_pass]
  return password unless password.respond_to?(:call)

  password.call(self)
end
proxy_port() click to toggle source

The proxy port

# File lib/polipus/http.rb, line 100
# Proxy port taken from the :proxy_port option.
#
# When the option responds to call it is invoked with this client and its
# result is returned; otherwise the option value itself is returned.
def proxy_port
  port = @opts[:proxy_port]
  return port unless port.respond_to?(:call)

  port.call(self)
end
proxy_user() click to toggle source

The proxy username

# File lib/polipus/http.rb, line 107
# Proxy username taken from the :proxy_user option.
#
# When the option responds to call it is invoked with this client and its
# result is returned; otherwise the option value itself is returned.
def proxy_user
  username = @opts[:proxy_user]
  return username unless username.respond_to?(:call)

  username.call(self)
end
read_timeout() click to toggle source

HTTP read timeout in seconds

# File lib/polipus/http.rb, line 130
# Read timeout as configured through the :read_timeout option.
#
# Returns the option value untouched, or nil when it was not set.
def read_timeout
  seconds = @opts[:read_timeout]
  seconds
end
redirect_limit() click to toggle source

The maximum number of redirects to follow

# File lib/polipus/http.rb, line 74
# Upper bound on the number of redirects followed for one request.
#
# Returns the :redirect_limit option when it is set to a truthy value and
# falls back to the REDIRECT_LIMIT constant otherwise.
def redirect_limit
  configured = @opts[:redirect_limit]
  return configured if configured

  REDIRECT_LIMIT
end
user_agent() click to toggle source

The user-agent string which will be sent with each request, or nil if no such option is set

# File lib/polipus/http.rb, line 82
# User-agent value taken from the :user_agent option.
#
# When the option responds to sample (e.g. an Array of strings) one element
# is picked through sample; otherwise the option value itself is returned,
# which is nil when the option is unset.
def user_agent
  agent = @opts[:user_agent]
  agent.respond_to?(:sample) ? agent.sample : agent
end

Private Instance Methods

allowed?(to_url, from_url) click to toggle source

Allowed to connect to the requested url?

# File lib/polipus/http.rb, line 268
# Decide whether a request to to_url may be made on behalf of from_url.
#
# A target without a host is always accepted; otherwise the target host
# must be equal to the host of from_url.
def allowed?(to_url, from_url)
  target_host = to_url.host
  return true if target_host.nil?

  target_host == from_url.host
end
connection(url) click to toggle source
# File lib/polipus/http.rb, line 217
# Return the connection to use for url, cached per host and port.
#
# A cached connection is reused and its hit counter incremented. When the
# :connection_max_hits option is set and the counter has reached it, or when
# nothing is cached yet, the result of refresh_connection is returned.
def connection(url)
  host = url.host
  port = url.port
  pool = (@connections[host] ||= {})
  hits = (@connections_hits[host] ||= {})

  cached = pool[port]
  return refresh_connection(url) unless cached

  max_hits = @opts[:connection_max_hits]
  if max_hits && hits[port] >= max_hits
    logger = @opts[:logger]
    logger.debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if logger
    return refresh_connection(url)
  end

  hits[port] += 1
  cached
end
get(url, referer = nil) { |response, code, loc, redirect_to, response_time| ... } click to toggle source

Retrieve HTTP responses for url, including redirects. Yields the response object, response code, URI location, redirect target (if any), and response time for each response.

# File lib/polipus/http.rb, line 159
# Request url and follow redirects, yielding once per response received.
#
# url     - URI to request first
# referer - optional referer, passed on to get_response
#
# Yields the response object, the integer status code, the URI that was
# requested, the normalized redirect target (nil when the response is not a
# redirect or names no target) and the response time reported by
# get_response.
#
# Redirects are followed while the target passes allowed? and fewer than
# redirect_limit responses have been received.
def get(url, referer = nil)
  limit = redirect_limit
  loc = url
  loop do
    # if redirected to a relative url, merge it with the host of the original
    # request url
    loc = url.merge(loc) if loc.relative?

    response, response_time = get_response(loc, referer)
    code = Integer(response.code)

    # Not every Net::HTTPRedirection carries a Location header (304 Not
    # Modified is one of them); URI(nil) would raise, so such responses are
    # reported as having no redirect target.
    location = response['location']
    redirect_to = if response.is_a?(Net::HTTPRedirection) && location
                    URI(location).normalize
                  end

    yield response, code, loc, redirect_to, response_time
    limit -= 1
    break unless redirect_to && allowed?(redirect_to, url) && limit > 0

    loc = redirect_to
  end
end
get_response(url, referer = nil) click to toggle source

Get an HTTPResponse for url, sending the appropriate User-Agent string

# File lib/polipus/http.rb, line 179
# Perform a single GET request for url and return the raw response.
#
# url     - URI to request
# referer - optional value for the Referer header
#
# Sends the User-Agent header when user_agent returns one, the cookies held
# by cookie_jar when accept_cookies? is true, and always asks for gzip or
# deflate encoding. Basic authentication comes from the :http_user and
# :http_password options unless the url carries its own credentials.
#
# Returns a two-element Array: the response and the elapsed time in
# milliseconds (Integer).
# Raises the last of RESCUABLE_ERRORS once three attempts have failed; the
# connection is refreshed after every failed attempt.
def get_response(url, referer = nil)
  # A URL such as "http://example.com" has an empty path, which is not a
  # valid request target (Net::HTTP::Get.new('') raises). Request "/" instead.
  path = url.path.to_s.empty? ? '/' : url.path
  full_path = url.query.nil? ? path : "#{path}?#{url.query}"

  headers = {}
  agent = user_agent
  headers['User-Agent'] = agent if agent
  headers['Referer'] = referer.to_s if referer
  headers['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
  headers['Accept-Encoding'] = 'gzip,deflate'

  retries = 0
  begin
    start = Time.now
    req = Net::HTTP::Get.new(full_path, headers)
    # HTTP Basic authentication: credentials embedded in the url are applied
    # last so they take priority over the configured ones.
    req.basic_auth @opts[:http_user], @opts[:http_password] if @opts[:http_user]
    req.basic_auth url.user, url.password if url.user
    response = connection(url).request(req)
    finish = Time.now
    response_time = ((finish - start) * 1000).round
    cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies? && response['Set-Cookie']
    return response, response_time
  rescue *RESCUABLE_ERRORS => e
    puts e.inspect if verbose?
    refresh_connection(url)
    retries += 1
    retry if retries < 3

    raise
  end
end
handle_compression(response) click to toggle source
# File lib/polipus/http.rb, line 272
# Decompress the body of response in place, according to its
# Content-Encoding header.
#
# "gzip" and "x-gzip" bodies are gunzipped; "deflate" bodies are inflated.
# The header value is compared case-insensitively. Responses without a body,
# with an empty body, or with any other encoding are left untouched.
def handle_compression(response)
  body = response.body
  # Body-less responses may still announce an encoding; there is nothing to
  # decode then.
  return if body.nil? || body.empty?

  case response['content-encoding'].to_s.strip.downcase
  when 'gzip', 'x-gzip'
    # wrap closes the reader once the block has returned.
    inflated = Zlib::GzipReader.wrap(StringIO.new(body)) { |gz| gz.read }
    body.replace(inflated)
  when 'deflate'
    body.replace(inflate_deflate_body(body))
  end
end

# Inflate data sent with "Content-Encoding: deflate": a zlib-wrapped stream
# is tried first, then a raw DEFLATE stream.
def inflate_deflate_body(data)
  Zlib::Inflate.inflate(data)
rescue Zlib::DataError
  # Negative window bits make zlib read a raw stream with no zlib header.
  raw_inflater = Zlib::Inflate.new(-Zlib::MAX_WBITS)
  begin
    raw_inflater.inflate(data)
  ensure
    raw_inflater.close
  end
end
refresh_connection(url) click to toggle source
# File lib/polipus/http.rb, line 233
# Open a fresh connection for url's host and port, store it in the
# connection cache with its hit counter reset to 1, and close the connection
# it replaces.
#
# Proxy settings come from the :proxy_host_port option when it is set and
# from the individual proxy_host / proxy_port / proxy_user / proxy_pass
# readers otherwise. They are resolved once, so the proxy written to the
# debug log is the one actually used.
#
# Returns the started connection.
def refresh_connection(url)
  # The combined option has priority over the individual ones.
  if @opts[:proxy_host_port].nil?
    p_host = proxy_host
    p_port = proxy_port
    p_user = proxy_user
    p_pass = proxy_pass
  else
    p_host, p_port, p_user, p_pass = proxy_host_port
  end

  if @opts[:logger] && p_host && p_port
    @opts[:logger].debug { "Request #{url} using proxy: #{p_host}:#{p_port}" }
  end

  http = Net::HTTP.new(url.host, url.port, p_host, p_port, p_user, p_pass)

  http.read_timeout = read_timeout if read_timeout
  http.open_timeout = open_timeout if open_timeout

  if url.scheme == 'https'
    http.use_ssl = true
    # NOTE(review): certificate verification is switched off for every HTTPS
    # connection; confirm this is intended before relying on peer identity.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end

  # Do not assume the per-host hashes were already created by the caller.
  host_connections = (@connections[url.host] ||= {})
  host_hits = (@connections_hits[url.host] ||= {})

  host_hits[url.port] = 1
  stale = host_connections[url.port]
  # Start the replacement first: if that fails the cached entry is untouched.
  fresh = http.start
  host_connections[url.port] = fresh

  begin
    # Release the socket of the connection being replaced.
    stale.finish if stale && stale.started?
  rescue StandardError
    # Best effort only: a broken old connection must not prevent using the
    # new one.
  end

  fresh
end
verbose?() click to toggle source
# File lib/polipus/http.rb, line 261
# Tells whether verbose output was requested.
#
# Returns the raw value of the :verbose option (nil when it is unset).
def verbose?
  verbose_option = @opts[:verbose]
  verbose_option
end