class Anemone::HTTP

Constants

REDIRECT_LIMIT

Maximum number of redirects to follow on each get_response

Attributes

Public Class Methods

new(opts = {}) click to toggle source
# File lib/anemone/http.rb, line 13
# Build a new HTTP client.
#
# opts - Hash of client options. It is kept as-is in @opts and read lazily by
#        the accessor methods; only opts[:cookies] is consumed here, to seed
#        the cookie store.
def initialize(opts = {})
  @opts = opts
  @cookie_store = CookieStore.new(opts[:cookies])
  # Cache of open connections, keyed by host and then by port.
  @connections = {}
end

Public Instance Methods

accept_cookies?() click to toggle source

Does this HTTP client accept cookies from the server?

# File lib/anemone/http.rb, line 73
# Does this HTTP client accept cookies from the server?
#
# Returns the raw value stored under :accept_cookies in the options hash
# (nil when the option was never set), so treat it as truthy/falsy.
def accept_cookies?
  @opts[:accept_cookies]
end
fetch_page(url, referer = nil, depth = nil) click to toggle source

Fetch a single Page from the response of an HTTP request to url. Just gets the final destination page.

# File lib/anemone/http.rb, line 23
# Fetch a single Page for +url+.
#
# Delegates to fetch_pages with the same arguments and keeps only the last
# element of the returned list, i.e. the final destination after redirects.
def fetch_page(url, referer = nil, depth = nil)
  pages = fetch_pages(url, referer, depth)
  pages.last
end
fetch_pages(url, referer = nil, depth = nil) click to toggle source

Create new Pages from the response of an HTTP request to url, including redirects

# File lib/anemone/http.rb, line 31
# Create new Pages from the responses of an HTTP request to +url+, one Page
# per response yielded by #get (so redirects are included).
#
# url     - a URI, or anything URI() can convert into one.
# referer - passed through to #get and recorded on every Page.
# depth   - recorded on every Page.
#
# Returns an Array of Page objects. If anything raises a StandardError while
# fetching, the result is instead a one-element Array holding a Page for
# +url+ built with :error set to the exception.
#
# Only StandardError is rescued: process-level exceptions such as Interrupt,
# SystemExit and NoMemoryError must propagate instead of being silently
# turned into an error page.
def fetch_pages(url, referer = nil, depth = nil)
  url = URI(url) unless url.is_a?(URI)
  pages = []
  get(url, referer) do |response, code, location, redirect_to, response_time|
    pages << Page.new(location, :body => response.body.dup,
                                :code => code,
                                :headers => response.to_hash,
                                :referer => referer,
                                :depth => depth,
                                :redirect_to => redirect_to,
                                :response_time => response_time)
  end

  pages
rescue StandardError => e
  if verbose?
    puts e.inspect
    puts e.backtrace
  end
  [Page.new(url, :error => e)]
end
proxy_host() click to toggle source

The proxy address string

# File lib/anemone/http.rb, line 80
# The proxy address string.
#
# Returns whatever is stored under :proxy_host in the options hash, or nil
# when no proxy host was configured.
def proxy_host
  @opts[:proxy_host]
end
proxy_port() click to toggle source

The proxy port

# File lib/anemone/http.rb, line 87
# The proxy port.
#
# Returns whatever is stored under :proxy_port in the options hash, or nil
# when no proxy port was configured.
def proxy_port
  @opts[:proxy_port]
end
read_timeout() click to toggle source

HTTP read timeout in seconds

# File lib/anemone/http.rb, line 94
# HTTP read timeout in seconds.
#
# Returns whatever is stored under :read_timeout in the options hash, or nil
# when no timeout was configured.
def read_timeout
  @opts[:read_timeout]
end
redirect_limit() click to toggle source

The maximum number of redirects to follow

# File lib/anemone/http.rb, line 58
# The maximum number of redirects to follow.
#
# Uses the :redirect_limit option when it is set to a truthy value and falls
# back to the REDIRECT_LIMIT constant otherwise.
def redirect_limit
  configured = @opts[:redirect_limit]
  return configured if configured

  REDIRECT_LIMIT
end
user_agent() click to toggle source

The user-agent string which will be sent with each request, or nil if no such option is set

# File lib/anemone/http.rb, line 66
# The user-agent string which will be sent with each request.
#
# Returns whatever is stored under :user_agent in the options hash, or nil if
# no such option is set.
def user_agent
  @opts[:user_agent]
end

Private Instance Methods

allowed?(to_url, from_url) click to toggle source

Allowed to connect to the requested url?

# File lib/anemone/http.rb, line 182
# Allowed to connect to the requested url?
#
# A target without a host (a relative URL) is always allowed; otherwise the
# target must be on exactly the same host as +from_url+.
def allowed?(to_url, from_url)
  target_host = to_url.host
  return true if target_host.nil?

  target_host == from_url.host
end
connection(url) click to toggle source
# File lib/anemone/http.rb, line 152
# Return the connection to use for +url+.
#
# Connections are cached in @connections by host and then by port. A cached
# entry is returned when present; otherwise refresh_connection is asked to
# create one and its result is returned.
def connection(url)
  @connections[url.host] ||= {}
  @connections[url.host][url.port] || refresh_connection(url)
end
get(url, referer = nil) { |response, code, loc, redirect_to, response_time| ... } click to toggle source

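Retrieve HTTP responses for url, including redirects. Yields the response object, the response code, the URI location that was requested, the redirect target (nil when the response is not a redirect), and the response time in milliseconds for each response.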

# File lib/anemone/http.rb, line 105
# Retrieve HTTP responses for +url+, following redirects.
#
# url     - the URI to request first.
# referer - passed to get_response for every request.
#
# Yields, for each response: the response object, the Integer response code,
# the URI that was requested, the normalized redirect target (nil when the
# response is not a redirect or carries no Location header), and the response
# time reported by get_response.
#
# The loop stops when a response has no redirect target, when allowed?
# refuses the target, or when redirect_limit responses have been handled.
def get(url, referer = nil)
  limit = redirect_limit
  loc = url
  loop do
    # if redirected to a relative url, merge it with the host of the original
    # request url
    loc = url.merge(loc) if loc.relative?

    response, response_time = get_response(loc, referer)
    code = Integer(response.code)

    # Not every redirection-class response names a target (304 Not Modified,
    # for instance, has no Location header); URI(nil) would raise, so such a
    # response is reported as a final response instead.
    location = response.is_a?(Net::HTTPRedirection) ? response['location'] : nil
    redirect_to = location && URI(location).normalize

    yield response, code, loc, redirect_to, response_time
    limit -= 1

    loc = redirect_to
    break unless loc && allowed?(redirect_to, url) && limit > 0
  end
end
get_response(url, referer = nil) click to toggle source

Get an HTTPResponse for url, sending the appropriate User-Agent string

# File lib/anemone/http.rb, line 124
# Get an HTTPResponse for +url+, sending the configured User-Agent, Referer
# and Cookie headers where applicable.
#
# url     - the URI to request; its user/password, when present, are sent as
#           HTTP Basic credentials.
# referer - optional value for the Referer header.
#
# Returns a two-element Array: the response object and the elapsed request
# time in milliseconds (rounded to an Integer).
#
# Raises Timeout::Error, Net::HTTPBadResponse or EOFError when the request
# still fails after the retries below. The connection is refreshed after
# every such failure, including the last one.
def get_response(url, referer = nil)
  # An empty path (e.g. "http://example.com") is not a valid request target;
  # request the root instead.
  path = url.path.to_s.empty? ? '/' : url.path
  full_path = url.query.nil? ? path : "#{path}?#{url.query}"

  opts = {}
  opts['User-Agent'] = user_agent if user_agent
  opts['Referer'] = referer.to_s if referer
  # Send cookies when the store has any, unless cookies are neither accepted
  # from servers nor were supplied through the :cookies option.
  opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)

  retries = 0
  begin
    start = Time.now
    # format request
    req = Net::HTTP::Get.new(full_path, opts)
    # HTTP Basic authentication
    req.basic_auth url.user, url.password if url.user
    response = connection(url).request(req)
    finish = Time.now
    response_time = ((finish - start) * 1000).round
    @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
    return response, response_time
  rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
    puts e.inspect if verbose?
    refresh_connection(url)
    retries += 1
    retry unless retries > 3
    # Out of retries: surface the real failure rather than returning nil,
    # which the caller would trip over with an unrelated NoMethodError.
    raise
  end
end
refresh_connection(url) click to toggle source
# File lib/anemone/http.rb, line 162
# Open a fresh connection for +url+ and cache it under its host and port,
# replacing (and closing) any connection previously cached there.
#
# The connection goes through proxy_host/proxy_port when those are set and
# uses read_timeout when one is configured.
#
# Returns the started connection.
def refresh_connection(url)
  pool = (@connections[url.host] ||= {})

  # Close the connection being replaced so its socket is not leaked.
  stale = pool.delete(url.port)
  stale.finish if stale && stale.started?

  http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)

  http.read_timeout = read_timeout if read_timeout

  if url.scheme == 'https'
    http.use_ssl = true
    # NOTE(review): certificate verification is switched off here, so HTTPS
    # peers are not authenticated. Kept as-is because enabling verification
    # would change which sites can be fetched - confirm this is intended.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end

  pool[url.port] = http.start
end
verbose?() click to toggle source
# File lib/anemone/http.rb, line 175
# Is verbose output enabled?
#
# Returns the raw value stored under :verbose in the options hash (nil when
# the option was never set), so treat it as truthy/falsy.
def verbose?
  @opts[:verbose]
end