class Kudzu::Agent::Fetcher

Attributes

pool[R]

Public Class Methods

new(config, robots = nil) click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 6
# Stores the configuration and builds the collaborators the fetcher keeps
# in instance variables: a connection pool, a sleeper, a page filterer and
# a cookie jar.
#
# @param config configuration object; +max_connection+ is read from it here
#   and the object itself is handed to Sleeper and PageFilterer
# @param robots optional value forwarded unchanged to Sleeper.new
def initialize(config, robots = nil)
  @config = config

  # Use 100 when the configuration gives nil/false for max_connection.
  pool_size = @config.max_connection || 100
  @pool = Http::ConnectionPool.new(pool_size)

  @sleeper = Sleeper.new(@config, robots)
  @filterer = PageFilterer.new(@config)
  @jar = HTTP::CookieJar.new
end

Public Instance Methods

fetch(url, request_header: {}, method: :get, redirect: @config.max_redirect, redirect_from: nil) click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 14
# Requests +url+ and follows redirects until a non-redirect response is
# received or the redirect budget is used up.
#
# @param url [String] URL to request
# @param request_header [Hash] extra headers passed to build_request
# @param method [Symbol] HTTP method for this request; it is not forwarded
#   when a redirect is followed, so follow-up requests use the default
# @param redirect [Integer] number of redirects that may still be followed
# @param redirect_from [String, nil] URL the redirect chain started from
# @return the value produced by build_response for the last response
def fetch(url, request_header: {}, method: :get, redirect: @config.max_redirect, redirect_from: nil)
  uri = Addressable::URI.parse(url)
  request = build_request(uri, request_header: request_header, method: method)
  response, response_time = send_request(uri, request)

  follow = redirection?(response.code) && response['location'] && redirect > 0
  return build_response(url, response, response_time, redirect_from) unless follow

  # The Location header is resolved against the URL that was just requested.
  next_url = uri.join(response['location']).to_s
  # Keep the first URL of the chain as the origin of the redirect.
  origin = redirect_from || url
  fetch(next_url, request_header: request_header, redirect: redirect - 1, redirect_from: origin)
end

Private Instance Methods

build_http(uri) click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 58
# Creates a Net::HTTP object for the host and port of +uri+, applies the
# timeouts present in the configuration, and starts it.
#
# @param uri URI-like object; host, port, default_port and scheme are read
# @return the result of Net::HTTP#start called without a block
def build_http(uri)
  port = uri.port || uri.default_port
  connection = Net::HTTP.new(uri.host, port)

  # Each timeout is only overridden when the configuration provides a value.
  open_timeout = @config.open_timeout
  connection.open_timeout = open_timeout if open_timeout
  read_timeout = @config.read_timeout
  connection.read_timeout = read_timeout if read_timeout
  keep_alive = @config.keep_alive
  connection.keep_alive_timeout = keep_alive if keep_alive

  if uri.scheme == 'https'
    connection.use_ssl = true
    # NOTE(review): VERIFY_NONE turns certificate verification off for every
    # HTTPS connection — confirm this is intentional.
    connection.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end

  connection.start
end
build_request(uri, request_header:, method:) click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 70
# Builds the Net::HTTP request object for +uri+.
#
# @param uri URI-like object; request_uri, user and password are read
# @param request_header [Hash] header name/value pairs; applied after the
#   User-Agent, so an entry with that name replaces the configured value
# @param method [Symbol, String] HTTP method name; its capitalized form
#   selects the class under Net::HTTP (e.g. :get -> Net::HTTP::Get)
# @return the populated request object
def build_request(uri, request_header:, method:)
  request_class = Object.const_get("Net::HTTP::#{method.capitalize}")
  req = request_class.new(uri.request_uri)

  # Credentials are only sent when the URI carries both user and password.
  if uri.user && uri.password
    req.basic_auth(uri.user, uri.password)
  end

  req['User-Agent'] = @config.user_agent
  request_header.each { |name, value| req[name] = value }
  req
end
build_response(url, response, response_time, redirect_from) click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 81
# Wraps an HTTP response in a Response object.
#
# The response's @read instance variable decides whether the body is
# included: when it is falsy the body is reported as nil.
#
# @param url [String] URL the response belongs to
# @param response response object; code, body, each and @read are used
# @param response_time value stored as-is in the result
# @param redirect_from value stored as-is in the result
# @return [Response]
def build_response(url, response, response_time, redirect_from)
  body_read = response.instance_variable_get("@read")
  status = response.code.to_i
  body = body_read ? response.body.to_s : nil
  # Collect the header name/value pairs into a Hash before normalizing them.
  headers = force_header_encoding(response.each.to_a.to_h)

  Response.new(
    url: url,
    status: status,
    body: body,
    response_header: headers,
    response_time: response_time,
    redirect_from: redirect_from,
    fetched: body_read
  )
end
force_header_encoding(response_header) click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 92
# Re-tags every header value as UTF-8 and re-encodes it with replacement
# of invalid and undefined sequences.
#
# The hash is updated in place, and force_encoding also changes the
# original value strings.
#
# @param response_header [Hash] header name => value string
# @return [Hash] the same hash object that was passed in
def force_header_encoding(response_header)
  response_header.transform_values! do |raw|
    raw.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace)
  end
end
pool_name(uri) click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 30
# Builds the name passed to the connection pool for +uri+: scheme, host
# and port joined with underscores.
#
# @param uri URI-like object; scheme, host, port and default_port are read
# @return [String] e.g. "http_example.com_80"
def pool_name(uri)
  # Use the scheme's default port when the URI has no explicit one.
  port = uri.port || uri.default_port
  [uri.scheme, uri.host, port].join('_')
end
redirection?(code) click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 98
# Tells whether a status code is in the 3xx range.
#
# @param code status code; converted with to_i before the comparison
# @return [Boolean] true when 300 <= code <= 399
def redirection?(code)
  (300..399).cover?(code.to_i)
end
send_request(uri, request) click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 34
# Sends +request+ through start_http and returns whatever start_http
# returns.
#
# The inner block sees the response before Net::HTTP returns it. When the
# page filterer rejects the response headers, the connection is finished
# and the block is left early with that response.
#
# @param uri URI-like object; its string form is given to the filterer
# @param request request object passed to http.request
# @return the return value of start_http
def send_request(uri, request)
  start_http(uri, request) do |http|
    http.request(request) do |response|
      next if @filterer.allowed_response_header?(uri.to_s, response)

      http.finish
      break response
    end
  end
end
start_http(uri, request) { |http| ... } click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 45
# Checks out a connection for +uri+ from the pool, yields it to the block
# and measures how long the block takes.
#
# Before yielding, cookies are appended to the request (only when
# +handle_cookie+ is set in the configuration) and the sleeper's
# politeness delay is applied. After the block returns, cookies are parsed
# from its result under the same condition.
#
# @param uri URI-like object identifying the target
# @param request request object handed to append_cookie
# @yieldparam http the connection checked out from the pool
# @return [Array] two elements: the block's return value and the elapsed
#   time in seconds as a Float
def start_http(uri, request)
  # The pool's block builds a connection via build_http when it is called.
  http = @pool.checkout(pool_name(uri)) { build_http(uri) }
  append_cookie(uri, request) if @config.handle_cookie
  @sleeper.politeness_delay(uri)

  # Measure with the monotonic clock: wall-clock time (Time.now) can jump
  # when the system clock is adjusted, which would skew the duration or even
  # make it negative.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  response = yield http
  response_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

  parse_cookie(uri, response) if @config.handle_cookie
  [response, response_time]
end