class Kudzu::Agent::Fetcher
Attributes
pool[R]
Public Class Methods
new(config, robots = nil)
click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 6
# Sets up the collaborators a fetcher works with: a connection pool, a
# sleeper, a page filterer and a cookie jar.
#
# @param config [Object] configuration object; +max_connection+ is read here
#   and the object is handed on to the sleeper and the page filterer
# @param robots [Object, nil] passed through unchanged to Sleeper.new
def initialize(config, robots = nil)
  @config = config

  # Fall back to 100 when the configuration does not set a limit.
  pool_size = @config.max_connection || 100
  @pool = Http::ConnectionPool.new(pool_size)

  @sleeper = Sleeper.new(@config, robots)
  @filterer = PageFilterer.new(@config)
  @jar = HTTP::CookieJar.new
end
Public Instance Methods
fetch(url, request_header: {}, method: :get, redirect: @config.max_redirect, redirect_from: nil)
click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 14
# Requests +url+ and keeps following redirects until a non-redirect response
# arrives, the response has no Location header, or the redirect budget runs out.
#
# @param url [String] address to request
# @param request_header [Hash] extra request headers, forwarded to build_request
# @param method [Symbol] HTTP verb for this request; the follow-up request
#   issued for a redirect does not forward it, so it uses the default (:get)
# @param redirect [Integer] number of redirects that may still be followed
# @param redirect_from [String, nil] URL at which the redirect chain began
# @return [Object] whatever build_response returns for the final response
def fetch(url, request_header: {}, method: :get, redirect: @config.max_redirect, redirect_from: nil)
  uri = Addressable::URI.parse(url)
  request = build_request(uri, request_header: request_header, method: method)
  response, response_time = send_request(uri, request)

  follow = redirection?(response.code) && response['location'] && redirect > 0
  return build_response(url, response, response_time, redirect_from) unless follow

  # Location may be relative, so resolve it against the URL just requested.
  next_url = uri.join(response['location']).to_s
  fetch(next_url,
        request_header: request_header,
        redirect: redirect - 1,
        redirect_from: redirect_from || url)
end
Private Instance Methods
build_http(uri)
click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 58
# Creates a Net::HTTP object for the URI's host and port, applies the
# configured timeouts, and starts it.
#
# @param uri [#host, #port, #default_port, #scheme] target address
# @return [Object] the result of Net::HTTP#start
def build_http(uri)
  connection = Net::HTTP.new(uri.host, uri.port || uri.default_port)

  # Each timeout is only overridden when the configuration provides a value.
  open_timeout = @config.open_timeout
  connection.open_timeout = open_timeout if open_timeout

  read_timeout = @config.read_timeout
  connection.read_timeout = read_timeout if read_timeout

  keep_alive = @config.keep_alive
  connection.keep_alive_timeout = keep_alive if keep_alive

  if uri.scheme == 'https'
    connection.use_ssl = true
    # NOTE(review): certificate verification is switched off for every HTTPS
    # host; confirm this is intentional.
    connection.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end

  connection.start
end
build_request(uri, request_header:, method:)
click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 70
# Builds the Net::HTTP request object for +uri+.
#
# @param uri [#request_uri, #user, #password] target address
# @param request_header [Hash] headers to set; applied after User-Agent, so
#   an entry for User-Agent replaces the configured one
# @param method [Symbol, String] HTTP verb; capitalized to pick the request
#   class under Net::HTTP (e.g. :get -> Net::HTTP::Get)
# @return [Net::HTTPRequest] the prepared request
def build_request(uri, request_header:, method:)
  request_class = Object.const_get("Net::HTTP::#{method.capitalize}")
  request = request_class.new(uri.request_uri)

  # Basic auth is only attached when the URI carries both credentials.
  if uri.user && uri.password
    request.basic_auth(uri.user, uri.password)
  end

  request['User-Agent'] = @config.user_agent
  request_header.each { |name, content| request[name] = content }
  request
end
build_response(url, response, response_time, redirect_from)
click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 81
# Wraps an HTTP response in a Response object.
#
# @param url [String] URL that was requested
# @param response [Object] HTTP response; must answer +code+, +body+ and +each+
# @param response_time [Numeric] elapsed time measured for the request
# @param redirect_from [String, nil] URL at which the redirect chain began
# @return [Response]
def build_response(url, response, response_time, redirect_from)
  # The response's @read instance variable decides whether a body is exposed.
  # NOTE(review): presumably Net::HTTPResponse's "body has been read" flag - confirm.
  body_read = response.instance_variable_get("@read")

  status = response.code.to_i
  body = (response.body.to_s if body_read)
  headers = force_header_encoding(response.each.to_h)

  Response.new(url: url,
               status: status,
               body: body,
               response_header: headers,
               response_time: response_time,
               redirect_from: redirect_from,
               fetched: body_read)
end
force_header_encoding(response_header)
click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 92
# Re-tags every header value as UTF-8 and replaces byte sequences that are
# not valid UTF-8 with the replacement character.
#
# The hash is updated in place and returned. The original String values are
# left untouched: each one is copied before its encoding is changed, so frozen
# values are accepted and strings shared with the caller are not mutated.
#
# @param response_header [Hash{String => String}] header name => value
# @return [Hash{String => String}] the same hash, with cleaned values
def force_header_encoding(response_header)
  # Only values of existing keys are reassigned, which is allowed while iterating.
  response_header.each do |key, value|
    response_header[key] = value.dup.force_encoding('utf-8').scrub
  end
end
pool_name(uri)
click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 30
# Builds the connection-pool key for a URI: scheme, host and port joined
# with underscores. The URI's default port is used when none is set.
#
# @param uri [#scheme, #host, #port, #default_port] target address
# @return [String] e.g. "https_example.com_443"
def pool_name(uri)
  format('%s_%s_%s', uri.scheme, uri.host, uri.port || uri.default_port)
end
redirection?(code)
click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 98
# Tells whether a status code is in the 3xx range.
#
# @param code [#to_i] status code, e.g. "301" or 301
# @return [Boolean] true when the integer value is between 300 and 399 inclusive
def redirection?(code)
  (300..399).cover?(code.to_i)
end
send_request(uri, request)
click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 34
# Sends +request+ over a connection supplied by start_http and returns
# whatever start_http returns.
#
# When the page filterer rejects the response headers, the connection is
# finished and the response is handed back straight away from inside the
# request block.
#
# @param uri [#to_s] target address
# @param request [Object] request to send
# @return [Object] the return value of start_http
def send_request(uri, request)
  start_http(uri, request) do |connection|
    connection.request(request) do |response|
      next if @filterer.allowed_response_header?(uri.to_s, response)

      connection.finish
      # `break` makes connection.request evaluate to this response.
      break response
    end
  end
end
start_http(uri, request) { |http| ... }
click to toggle source
# File lib/kudzu/agent/fetcher.rb, line 45
# Checks a connection out of the pool (building one when the pool asks for
# it), yields it to the block, and times the block.
#
# Before yielding, cookies are appended to the request when cookie handling
# is enabled, and the sleeper's politeness delay is applied. After the block
# returns, cookies are parsed from its result when cookie handling is enabled.
#
# @param uri [Object] target address
# @param request [Object] request that is about to be sent
# @yieldparam http [Object] the checked-out connection
# @yieldreturn [Object] the response
# @return [Array(Object, Float)] the block's result and the elapsed seconds
def start_http(uri, request)
  http = @pool.checkout(pool_name(uri)) { build_http(uri) }
  append_cookie(uri, request) if @config.handle_cookie
  @sleeper.politeness_delay(uri)

  # Use the monotonic clock: wall-clock time can be adjusted while the request
  # is in flight, which would give a wrong or even negative duration.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  response = yield http
  response_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at

  parse_cookie(uri, response) if @config.handle_cookie
  [response, response_time]
end