class Kudzu::Agent::Robots::Parser
Constants
- UNMATCH_REGEXP
Public Class Methods
parse(body)
click to toggle source
# File lib/kudzu/agent/robots/parser.rb, line 8
# Builds a Txt from the text of a robots.txt file.
#
# Directives are read in order from parse_body. A run of consecutive
# `user-agent` lines forms one group: every rule and crawl-delay that follows
# is applied to all rule sets in that group. Directives seen before the first
# `user-agent` line apply to no rule set. `sitemap` lines are collected on the
# Txt itself.
#
# @param body [#to_s] raw robots.txt content
# @return the populated Txt, after being passed through sort
def parse(body)
  txt = Txt.new
  current_sets = []
  last_key = nil

  parse_body(body).each do |key, value|
    case key
    when 'user-agent'
      group = RuleSet.new(user_agent: ua_regexp(value))
      txt.sets << group
      # A user-agent line directly after another one joins the same group;
      # otherwise it starts a new group.
      if last_key == 'user-agent'
        current_sets << group
      else
        current_sets = [group]
      end
    when 'allow', 'disallow'
      pattern = path_regexp(value)
      allowed = key == 'allow'
      current_sets.each { |set| set.rules << Rule.new(path: pattern, allow: allowed) }
    when 'crawl-delay'
      delay = value.to_i
      current_sets.each { |set| set.crawl_delay = delay }
    when 'sitemap'
      txt.sitemaps << value
    end
    last_key = key
  end

  sort(txt)
end
Private Class Methods
parse_body(body)
click to toggle source
# File lib/kudzu/agent/robots/parser.rb, line 43
# Splits the body into lines and parses each one with parse_line.
#
# @param body [#to_s] raw robots.txt content (nil is treated as empty)
# @return [Array] the non-nil results of parse_line, in input order
def parse_body(body)
  # "\r\n" must be tried before the bare "\r" alternative; otherwise every
  # CRLF is split twice and yields a spurious empty line.
  lines = body.to_s.split(/\r\n|\r|\n/)
  lines.map { |line| parse_line(line) }.compact
end
parse_line(line)
click to toggle source
# File lib/kudzu/agent/robots/parser.rb, line 48
# Parses a single line of a robots.txt file.
#
# Surrounding whitespace is ignored. Blank lines and lines that start with
# `#` produce nil; every other line is handed to split_line.
#
# @param line [String] one raw line; it is not modified
# @return [nil] for blank or comment lines, otherwise whatever split_line returns
def parse_line(line)
  # Work on a stripped copy so the caller's string is left untouched and
  # frozen strings are accepted.
  stripped = line.strip
  return nil if stripped.empty? || stripped.start_with?('#')

  split_line(stripped)
end
path_regexp(value)
click to toggle source
# File lib/kudzu/agent/robots/parser.rb, line 74
# Converts a robots.txt path pattern into a Regexp.
#
# The value is escaped so it matches literally, then each `*` becomes `.*`
# and each `$` becomes an end-of-string anchor. The result is anchored at the
# start of the string.
#
# @param value [String] path pattern taken from an allow/disallow line
# @return [Regexp] the compiled pattern, or UNMATCH_REGEXP when compilation
#   raises RegexpError
def path_regexp(value)
  escaped = Regexp.escape(value)
  wildcarded = escaped.gsub('\*', '.*')
  # Block form so the replacement is inserted verbatim, with no backslash
  # processing by gsub.
  anchored = wildcarded.gsub('\$') { '\z' }
  # \A and \z anchor to the whole string; ^ and $ would also match at the
  # boundaries of any embedded line.
  Regexp.new("\\A#{anchored}")
rescue RegexpError
  UNMATCH_REGEXP
end
sort(txt)
click to toggle source
# File lib/kudzu/agent/robots/parser.rb, line 80
# Orders the rule sets and their rules in place.
#
# Rule sets end up ordered by the string form of their user-agent pattern:
# fewest `*` characters first, and among equals the longest string first.
# Within each set, rules are ordered by the string form of their path
# pattern, longest first.
#
# @param txt object exposing `sets`; each set exposes `user_agent` and
#   `rules`, and each rule exposes `path`
# @return the same txt object
def sort(txt)
  ordered_sets = txt.sets.sort_by! do |set|
    agent_pattern = set.user_agent.to_s
    [-agent_pattern.count('*'), agent_pattern.length]
  end
  ordered_sets.reverse!

  txt.sets.each do |set|
    rules = set.rules
    rules.sort_by! { |rule| rule.path.to_s.length }
    rules.reverse!
  end

  txt
end
split_line(line)
click to toggle source
# File lib/kudzu/agent/robots/parser.rb, line 57
# Splits a "key: value" directive line at its first colon.
#
# The key is stripped and lower-cased. The value has anything from the first
# `#` onward removed and is then stripped.
#
# @param line [String] a single directive line
# @return [Array(String, String), nil] `[key, value]`, or nil when either
#   part is empty
def split_line(line)
  raw_key, _separator, raw_value = line.partition(':')

  key = raw_key.strip.downcase
  return nil if key.empty?

  value = raw_value.sub(/#.*$/, '').strip
  return nil if value.empty?

  [key, value]
end
ua_regexp(value)
click to toggle source
# File lib/kudzu/agent/robots/parser.rb, line 68
# Converts a robots.txt user-agent value into a Regexp.
#
# The value is escaped so it matches literally, then each `*` becomes `.*`.
# The pattern is unanchored and case-insensitive.
#
# @param value [String] value taken from a user-agent line
# @return [Regexp] the compiled pattern, or UNMATCH_REGEXP when compilation
#   raises RegexpError
def ua_regexp(value)
  pattern = Regexp.escape(value).gsub('\*', '.*')
  # The Robots Exclusion Protocol (RFC 9309) requires user-agent product
  # tokens to be matched case-insensitively.
  Regexp.new(pattern, Regexp::IGNORECASE)
rescue RegexpError
  UNMATCH_REGEXP
end