class Spider::ExclusionParser
Constants
- ALLOW
- DELAY
- DISALLOW
- MAX_DIRECTIVES
- NULL_MATCH
Attributes
wait_time[RW]
Public Class Methods
new(text, agent = nil, status = 200)
click to toggle source
# File lib/exclusion.rb, line 27
# Builds the skip list from the body of an exclusion (robots.txt) file.
#
# text   - raw file body; may be nil or empty.
# agent  - user-agent key whose section should be applied (nil means the
#          '*' section is used).
# status - HTTP status the file was served with. A 401 or 403 marks the
#          whole site as excluded, whether or not a body was returned.
#
# Parsing is best-effort: a StandardError raised while parsing or loading
# directives is swallowed, leaving whatever rules were collected so far.
def initialize(text, agent = nil, status = 200)
  @skip_list = []
  @agent_key = agent

  # Check the status before the body: unauthorized/forbidden responses
  # usually carry no body, and the early return for an empty body would
  # otherwise make this branch unreachable in exactly that case.
  if [401, 403].include?(status)
    @skip_list << [NULL_MATCH, true]
    return
  end

  return if text.nil? || text.empty?

  begin
    grab_list(parse_text(text))
  rescue StandardError
    # Deliberately ignored: a malformed file must not abort the caller.
  end
end
Public Instance Methods
allowed?(url)
click to toggle source
# File lib/exclusion.rb, line 58
# Inverse of #excluded?: returns the logical negation of its result for
# the given url.
def allowed?(url)
  blocked = excluded?(url)
  !blocked
end
excluded?(url)
click to toggle source
Check whether the given URL is matched by any rule in the file, and return its associated status.
# File lib/exclusion.rb, line 48
# Looks up the first skip-list entry that applies to the url and returns
# the flag stored with it (true for disallow entries, false for allow
# entries). Returns false when no entry applies.
#
# An entry applies when its pattern occurs anywhere in the unescaped url,
# or when its pattern is NULL_MATCH.
def excluded?(url)
  decoded = safe_unescape(url)

  hit = @skip_list.find do |pattern, _flag|
    decoded.include?(pattern) || pattern == NULL_MATCH
  end

  hit ? hit.last : false
end
Private Instance Methods
allow(value)
click to toggle source
# File lib/exclusion.rb, line 145
# Appends an allow entry (flag false) to the skip list.
#
# A value of exactly '/' is stored as NULL_MATCH; any other value has one
# trailing '*' removed. The resulting pattern is unescaped before storage.
def allow(value)
  pattern =
    if value == '/'
      NULL_MATCH
    else
      value.chomp('*')
    end

  @skip_list.push([safe_unescape(pattern), false])
end
delay(value)
click to toggle source
# File lib/exclusion.rb, line 150
# Stores the delay directive as a WaitTime built from the integer value
# of the directive text (String#to_i, so non-numeric text yields 0).
def delay(value)
  seconds = value.to_i
  @wait_time = WaitTime.new(seconds)
end
disallow(value)
click to toggle source
# File lib/exclusion.rb, line 140
# Appends a disallow entry (flag true) to the skip list.
#
# A value of exactly '/' is stored as NULL_MATCH; any other value has one
# trailing '*' removed. The resulting pattern is unescaped before storage.
def disallow(value)
  pattern =
    if value == '/'
      NULL_MATCH
    else
      value.chomp('*')
    end

  @skip_list.push([safe_unescape(pattern), true])
end
grab_list(config)
click to toggle source
Method to process the list of directives for a given user agent. Picks the section that applies to us, then processes its directives into the skip list by splitting the strings and taking the appropriate action. Stops after a set number of directives to guard against malformed files or denial-of-service attacks.
# File lib/exclusion.rb, line 70
# Applies the directives of the section that matches our user agent.
#
# config - Hash mapping a user-agent key to an Array of raw
#          "name: value" directive strings.
#
# The section for @agent_key is used when present (compared in lower
# case), otherwise the '*' section. If neither exists nothing is loaded.
# At most MAX_DIRECTIVES directives are processed, so a malformed or
# hostile file cannot make us do unbounded work.
def grab_list(config)
  agent = @agent_key.nil? ? nil : @agent_key.to_s.downcase
  section = config.key?(agent) ? config[agent] : config['*']
  return if section.nil?

  section.first(MAX_DIRECTIVES).each do |pair|
    # Split on the first colon only, so values that themselves contain a
    # colon (e.g. "/a:b") are kept whole.
    key, value = pair.split(':', 2)
    next if key.nil? || value.nil?

    # Strip before testing for emptiness: a whitespace-only value must be
    # skipped, otherwise it becomes an empty pattern that matches every url.
    key = key.strip.downcase
    value = value.strip
    next if key.empty? || value.empty?

    disallow(value) if key == DISALLOW
    delay(value) if key == DELAY
    allow(value) if key == ALLOW
  end
end
parse_text(text)
click to toggle source
Top level file parsing method - makes sure carriage returns work, strips out any BOM, then loops through each line and opens up a new array of directives in the hash if a user-agent directive is found.
# File lib/exclusion.rb, line 104
# Splits the raw file body into per-user-agent directive lists.
#
# text - the file body. It is not modified; a UTF-8 copy is parsed, with
#        invalid bytes replaced, carriage returns treated as line breaks
#        and any byte-order mark removed.
#
# Returns a Hash mapping each lower-cased user-agent name to an Array of
# its directive lines (comments and surrounding whitespace removed).
# Consecutive User-agent lines with no directives between them share one
# Array, so the directives that follow apply to every agent in the group.
# Directive lines that appear before the first User-agent line are ignored.
def parse_text(text)
  current_key = ''
  config = {}

  # Work on a copy so the caller's string (possibly frozen) is untouched.
  content = text.dup.force_encoding('UTF-8').scrub
  content = content.gsub("\r", "\n").gsub("\uFEFF", '')

  content.each_line do |raw|
    # Remove the comment first, then strip, so whitespace that preceded a
    # trailing comment does not end up in agent names or directive values.
    line = raw.sub(/#.*/, '').strip
    next if line.empty?

    if line =~ /\AUser-agent\s*:\s*(.+)/i
      previous_key = current_key
      current_key = Regexp.last_match(1).downcase
      config[current_key] ||= []

      # Cascading user-agent lines: when the previous agent has collected
      # no directives yet, point every agent that shares its (empty) list
      # at the new agent's list so the whole group stays in sync.
      if config.key?(previous_key) && config[previous_key].empty?
        stale = config[previous_key]
        config.keys.each do |agent|
          config[agent] = config[current_key] if config[agent].equal?(stale)
        end
      end
    elsif config.key?(current_key)
      config[current_key] << line
    end
  end

  config
end
safe_unescape(target)
click to toggle source
# File lib/exclusion.rb, line 154
# URL-decodes target while keeping encoded slashes encoded.
#
# Every "%2f" sequence, in either letter case, is first rewritten to
# "%252f"; CGI.unescape then turns the "%25" back into "%", leaving a
# literal "%2f" in the result. Handling both cases means rules and urls
# that differ only in the case of the hex digits decode to the same text,
# and no placeholder text is needed that could collide with the input.
#
# Returns the decoded String.
def safe_unescape(target)
  CGI.unescape(target.gsub(/%2f/i, '%252f'))
end