class Polipus::Robotex::ParsedRobots

Public Class Methods

new(uri, user_agent) click to toggle source
# File lib/polipus/robotex.rb, line 16
# Fetches the robots.txt for +uri+ and parses it into per-agent rule tables.
#
# The fetched document is replaced by a permissive "allow everything" one
# when nothing was fetched, when its content type is not text/plain, or
# when its status is not ["200", "OK"].
#
# After parsing:
# * @allows / @disallows map an agent regexp to an array of path regexps
# * @delays maps an agent regexp to an integer crawl delay
#
# Both arguments are handed unchanged to Robotex.get_robots_txt.
def initialize(uri, user_agent)
  io = Robotex.get_robots_txt(uri, user_agent)
  if !io || io.content_type != 'text/plain' || io.status != %w(200 OK)
    io = StringIO.new("User-agent: *\nAllow: /\n")
  end

  @disallows = {}
  @allows = {}
  @delays = {}
  # Rules that appear before any User-agent line apply to every agent.
  agent = /.*/
  io.each do |line|
    # "#" starts a comment anywhere on the line; dropping it here keeps a
    # trailing comment from becoming part of the rule value.
    content = line.sub(/#.*/, '').strip
    next if content.empty?

    # Split on the first colon only, so values that contain ":" stay intact.
    key, value = content.split(':', 2)
    value = value.to_s.strip

    # Strip the key so "Disallow : /x" and indented directives are recognised.
    case key.strip.downcase
    when 'user-agent'
      agent = to_regex(value)
    when 'allow'
      (@allows[agent] ||= []) << to_regex(value) unless value.empty?
    when 'disallow'
      # An empty Disallow imposes no restriction, so nothing is recorded.
      (@disallows[agent] ||= []) << to_regex(value) unless value.empty?
    when 'crawl-delay'
      @delays[agent] = value.to_i
    end
  end
  @parsed = true
end

Public Instance Methods

allowed?(uri, user_agent) click to toggle source
# File lib/polipus/robotex.rb, line 52
# Tells whether +user_agent+ may fetch +uri+ according to the parsed rules.
#
# Among all Allow and Disallow rules whose agent pattern matches
# +user_agent+ and whose path pattern matches the request URI, the most
# specific rule wins; an Allow rule wins a tie. Specificity is approximated
# by the length of the rule's regexp source. When no Disallow rule matches,
# the URI is allowed.
#
# uri        - a URI, or anything whose #to_s can be parsed by URI.parse
# user_agent - the agent string matched against the agent patterns
#
# Returns true or false. Always true if the rules were never parsed.
def allowed?(uri, user_agent)
  return true unless @parsed

  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
  path = uri.request_uri

  # Source length of the longest rule in +rules_by_agent+ that applies to
  # this agent and path, or nil when no rule applies.
  longest_match = lambda do |rules_by_agent|
    lengths = []
    rules_by_agent.each do |agent, rules|
      next unless user_agent =~ agent

      rules.each { |rule| lengths << rule.source.length if path =~ rule }
    end
    lengths.max
  end

  disallow_length = longest_match.call(@disallows)
  return true unless disallow_length

  allow_length = longest_match.call(@allows)
  !allow_length.nil? && allow_length >= disallow_length
end
delay(user_agent) click to toggle source
# File lib/polipus/robotex.rb, line 79
# Looks up the crawl delay recorded for +user_agent+.
#
# The entries of @delays are scanned in insertion order and the value of
# the first one whose agent pattern matches +user_agent+ is returned.
#
# Returns the stored delay, or nil when no agent pattern matches.
def delay(user_agent)
  _pattern, seconds = @delays.find { |pattern, _seconds| pattern =~ user_agent }
  seconds
end

Protected Instance Methods

to_regex(pattern) click to toggle source
# File lib/polipus/robotex.rb, line 88
# Compiles a robots.txt pattern into a Regexp anchored at the start of the
# string.
#
# Every character is matched literally except:
# * "*" matches any run of characters
# * a trailing "$" anchors the pattern at the end of the string instead of
#   matching a literal dollar sign
#
# pattern - the raw pattern String from a robots.txt directive
#
# Returns a Regexp.
def to_regex(pattern)
  end_anchored = pattern.end_with?('$')
  literal = end_anchored ? pattern.chomp('$') : pattern

  # Escape everything first, then turn the escaped "*" back into a wildcard.
  body = Regexp.escape(literal).gsub(Regexp.escape('*'), '.*')
  suffix = end_anchored ? '\z' : ''

  # \A rather than ^ so the pattern can only match from the very start of
  # the string, never after an embedded newline.
  Regexp.compile("\\A#{body}#{suffix}")
end