class Kudzu::Agent::Robots

Public Class Methods

new(config) click to toggle source
# File lib/kudzu/agent/robots.rb, line 4
# Builds a robots.txt helper bound to +config+.
#
# config - settings object; other methods of this class read
#          user_agent, open_timeout and read_timeout from it.
def initialize(config)
  # Parsed robots.txt documents, filled in lazily by find_txt.
  @txt = {}
  # Lock that find_txt holds while it reads or fills @txt.
  @monitor = Monitor.new
  @config = config
end

Public Instance Methods

allowed?(uri) click to toggle source
# File lib/kudzu/agent/robots.rb, line 10
# Tells whether +uri+ may be crawled according to the robots.txt rule set
# that matches the configured user agent.
#
# uri - a String (parsed with Addressable::URI) or an already parsed URI.
#
# Returns true when no rule set applies; otherwise returns whatever the
# rule set's allowed_path? reports for the URI.
def allowed?(uri)
  target = uri.is_a?(String) ? Addressable::URI.parse(uri) : uri
  rules = find_set(target)
  rules ? rules.allowed_path?(target) : true
end
crawl_delay(uri) click to toggle source
# File lib/kudzu/agent/robots.rb, line 17
# Looks up the crawl delay declared by the robots.txt rule set that matches
# the configured user agent.
#
# uri - a String (parsed with Addressable::URI) or an already parsed URI.
#
# Returns the rule set's crawl_delay, or nil when no rule set applies.
def crawl_delay(uri)
  target = uri.is_a?(String) ? Addressable::URI.parse(uri) : uri
  rules = find_set(target)
  rules.crawl_delay if rules
end
sitemaps(uri) click to toggle source
# File lib/kudzu/agent/robots.rb, line 24
# Lists the sitemaps announced by the robots.txt that covers +uri+.
#
# uri - a String (parsed with Addressable::URI) or an already parsed URI.
#
# Returns the parsed document's sitemaps, or an empty Array when find_txt
# yields nothing.
def sitemaps(uri)
  target = uri.is_a?(String) ? Addressable::URI.parse(uri) : uri
  document = find_txt(target)
  document ? document.sitemaps : []
end

Private Instance Methods

fetch(base_uri) click to toggle source
# File lib/kudzu/agent/robots.rb, line 59
# Downloads the robots.txt that belongs to +base_uri+.
#
# The request goes to "/robots.txt" on the same scheme, host and port as
# +base_uri+; query string and fragment are dropped. +base_uri+ itself is
# not modified, a copy is.
#
# base_uri - an already parsed URI object.
#
# Returns the response of Net::HTTP#get, or nil when the request raised a
# StandardError (the error is reported through Kudzu.log).
def fetch(base_uri)
  uri = base_uri.dup
  # Use an absolute path. A relative 'robots.txt' only works when the URI
  # class silently normalises it; Ruby's stdlib URI raises
  # InvalidComponentError instead, and that would escape the rescue below.
  uri.path = '/robots.txt'
  uri.fragment = uri.query = nil

  http = Net::HTTP.new(uri.host, uri.port || uri.default_port)
  # Timeouts are only overridden when the config provides a value.
  http.open_timeout = @config.open_timeout if @config.open_timeout
  http.read_timeout = @config.read_timeout if @config.read_timeout
  if uri.scheme == 'https'
    http.use_ssl = true
    # NOTE(review): certificate verification is switched off, so the
    # robots.txt content can be spoofed by a man-in-the-middle. Kept as-is
    # to preserve behaviour; confirm that this leniency is intended.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end

  begin
    http.get(uri.request_uri)
  rescue => e
    Kudzu.log :error, "failed to fetch robots.txt: #{uri}", error: e
    nil
  end
end
fetch_and_parse(uri) click to toggle source
# File lib/kudzu/agent/robots.rb, line 49
# Fetches the robots.txt for +uri+ and hands its text to Parser.parse.
#
# Anything other than a 200 response (including a failed fetch, which
# yields nil) is treated as an empty robots.txt. The body of a 200 response
# is tagged as UTF-8 in place and then encoded with the invalid/undef
# replace options before parsing.
#
# Returns whatever Parser.parse returns.
def fetch_and_parse(uri)
  response = fetch(uri)
  return Parser.parse('') unless response && response.code.to_i == 200

  text = response.body.force_encoding('utf-8')
  Parser.parse(text.encode('utf-8', undef: :replace, invalid: :replace))
end
find_set(uri) click to toggle source
# File lib/kudzu/agent/robots.rb, line 39
# Picks the first rule set of the robots.txt covering +uri+ whose
# user_agent pattern matches the configured user agent.
#
# Returns the matching rule set, or nil when there is no robots.txt
# document or none of its rule sets match.
def find_set(uri)
  document = find_txt(uri)
  return unless document

  # Rule sets are checked in document order; the first match wins.
  document.sets.find { |candidate| @config.user_agent =~ candidate.user_agent }
end
find_txt(uri) click to toggle source
# File lib/kudzu/agent/robots.rb, line 33
# Returns the parsed robots.txt that covers +uri+, fetching and parsing it
# on first use and serving it from @txt afterwards.
#
# A robots.txt applies to one scheme + host + port combination, so the
# cache is keyed by all three rather than by host alone. Otherwise
# http://example.com, https://example.com and http://example.com:8080
# would share whichever document happened to be fetched first. Scheme and
# host are lower-cased because both are case-insensitive in URIs.
#
# The lookup and the fetch both run while @monitor is held, so a fetch in
# progress blocks other callers of this method.
def find_txt(uri)
  key = [uri.scheme.to_s.downcase, uri.host.to_s.downcase, uri.port || uri.default_port]
  @monitor.synchronize do
    @txt[key] ||= fetch_and_parse(uri)
  end
end