class Spider::ExclusionParser

Constants

ALLOW
DELAY
DISALLOW
MAX_DIRECTIVES
NULL_MATCH

Attributes

wait_time[RW]

Public Class Methods

new(text, agent = nil, status = 200) click to toggle source
# File lib/exclusion.rb, line 27
# Builds the skip list for one exclusion (robots.txt style) file.
#
# text   - String body of the fetched file; may be nil or empty.
# agent  - user-agent name whose section should be applied (nil falls
#          back to the '*' section in grab_list).
# status - HTTP status code the file was served with.
def initialize(text, agent = nil, status = 200)
  @skip_list = []
  @agent_key = agent
  @wait_time = nil

  # Evaluate the status before looking at the body: an access-denied
  # response usually carries an empty body, and must still register the
  # match-everything exclusion entry.
  if [401, 403].include?(status)
    @skip_list << [NULL_MATCH, true]
    return
  end

  return if text.nil? || text.empty?

  begin
    grab_list(parse_text(text))
  rescue StandardError
    # Deliberately best-effort: a malformed file must not abort the
    # caller. Any rules collected before the failure stay in effect.
  end
end

Public Instance Methods

allowed?(url) click to toggle source
# File lib/exclusion.rb, line 58
# Logical inverse of excluded?: true when no matching rule excludes the url.
def allowed?(url)
  blocked = excluded?(url)
  !blocked
end
excluded?(url) click to toggle source

Checks whether the given url is matched by any rule in the file and returns that rule's associated status.

# File lib/exclusion.rb, line 48
# Returns the flag stored with the first skip-list entry that applies to
# the url, or false when no entry applies.
#
# An entry applies when its pattern occurs anywhere in the unescaped url,
# or when its pattern is NULL_MATCH.
def excluded?(url)
  decoded = safe_unescape(url)

  hit = @skip_list.find do |entry|
    decoded.include?(entry.first) || entry.first == NULL_MATCH
  end

  hit ? hit.last : false
end

Private Instance Methods

allow(value) click to toggle source
# File lib/exclusion.rb, line 145
# Appends an "allowed" (false) entry to the skip list for the directive value.
# A bare '/' is stored as NULL_MATCH; otherwise one trailing '*' is removed.
def allow(value)
  token = if value == '/'
            NULL_MATCH
          else
            value.chomp('*')
          end

  @skip_list.push([safe_unescape(token), false])
end
delay(value) click to toggle source
# File lib/exclusion.rb, line 150
# Records the delay directive as a WaitTime built from the value's integer
# conversion (String#to_i yields 0 for non-numeric text).
def delay(value)
  seconds = value.to_i
  @wait_time = WaitTime.new(seconds)
end
disallow(value) click to toggle source
# File lib/exclusion.rb, line 140
# Appends an "excluded" (true) entry to the skip list for the directive value.
# A bare '/' is stored as NULL_MATCH; otherwise one trailing '*' is removed.
def disallow(value)
  token = NULL_MATCH
  token = value.chomp('*') unless value == '/'

  @skip_list.push([safe_unescape(token), true])
end
grab_list(config) click to toggle source

Processes the list of directives for a given user agent. Picks the section that applies to us, then processes its directives into the skip list by splitting each line and taking the appropriate action. Stops after a set number of directives to guard against malformed files and denial-of-service attacks.

# File lib/exclusion.rb, line 70
# Applies the directives that belong to our user agent.
#
# config - Hash mapping user-agent names to arrays of raw "key: value"
#          directive lines, as produced by parse_text.
#
# Uses the section for @agent_key when present, otherwise the '*' section;
# does nothing when neither exists. At most MAX_DIRECTIVES lines are
# processed, so an oversized file cannot cause unbounded work.
def grab_list(config)
  # parse_text stores agent names lower-cased, so look them up the same way.
  agent = @agent_key.nil? ? nil : @agent_key.to_s.downcase
  section = config[agent] || config['*']
  return if section.nil?

  # first() copies, leaving the parsed config untouched.
  section.first(MAX_DIRECTIVES).each do |pair|
    # Split on the first colon only: values may contain colons themselves.
    key, value = pair.split(':', 2).map(&:strip)

    # Strip before testing emptiness; a whitespace-only value would
    # otherwise become the pattern "" which is a substring of every url.
    next if key.nil? || value.nil? || key.empty? || value.empty?

    key = key.downcase

    disallow(value) if key == DISALLOW
    delay(value) if key == DELAY
    allow(value) if key == ALLOW
  end
end
parse_text(text) click to toggle source

Top level file parsing method - makes sure carriage returns work, strips out any BOM, then loops through each line and opens up a new array of directives in the hash if a user-agent directive is found.

# File lib/exclusion.rb, line 104
# Splits the raw file into per-agent directive lists.
#
# text - String contents of the exclusion file. It is not modified.
#
# Returns a Hash mapping each lower-cased user-agent name to an Array of
# directive lines (comments and surrounding whitespace removed).
# User-agent lines that follow one another with no directives in between
# share one and the same Array.
def parse_text(text)
  current_key = ''
  config = {}

  # Work on a private UTF-8 copy so the caller's string (which may be
  # frozen) is left alone; normalise carriage returns and drop any BOM.
  body = text.dup.force_encoding('UTF-8')
  body = body.gsub("\r", "\n").gsub("\uFEFF", '')

  body.each_line do |raw|
    # Remove the comment first, then trim, so that text preceding a
    # trailing comment does not keep stray whitespace.
    line = raw.sub(/#.*/, '').strip
    next if line.empty?

    agent = line[/\Auser-agent\s*:\s*(.+)/i, 1]

    if agent
      previous_key = current_key
      current_key = agent.strip.downcase
      config[current_key] ||= []

      # If the previous agent collected nothing, this is a cascading
      # user-agent line. Re-point every agent that still shares the empty
      # list so that runs of three or more agents all end up on one list.
      stale = config[previous_key]
      if stale && stale.empty? && !stale.equal?(config[current_key])
        config.keys.each do |name|
          config[name] = config[current_key] if config[name].equal?(stale)
        end
      end
    elsif config.key?(current_key)
      config[current_key] << line
    end
    # Directives that appear before any user-agent line belong to no
    # section and are skipped.
  end

  config
end
safe_unescape(target) click to toggle source
# File lib/exclusion.rb, line 154
# URL-decodes target while keeping encoded slashes encoded.
#
# Every "%2f" / "%2F" sequence is preserved (normalised to "%2f") so that an
# escaped slash is never confused with a real path separator; all other
# escapes are decoded with CGI.unescape.
#
# Returns a new String.
def safe_unescape(target)
  # Decode the pieces between encoded slashes rather than going through a
  # placeholder token, which could collide with text already in the input.
  # The -1 limit keeps trailing empty pieces, so a trailing %2f survives.
  target.split(/%2f/i, -1).map { |piece| CGI.unescape(piece) }.join('%2f')
end