class Kudzu::Agent::UrlFilterer
Public Class Methods
new(config, robots = nil)
click to toggle source
# File lib/kudzu/agent/url_filterer.rb, line 4
#
# Sets up a filterer for one crawl configuration.
#
# @param config the crawl configuration; kept in @config and later asked for
#   the matching filter and for +respect_robots_txt+
# @param robots optional robots.txt checker kept in @robots; when left as
#   nil the robots check always passes
def initialize(config, robots = nil)
  @config = config
  @robots = robots
end
Public Instance Methods
allowed?(uri, base_uri, filter: nil)
click to toggle source
# File lib/kudzu/agent/url_filterer.rb, line 24
#
# Decides whether +uri+, discovered while crawling +base_uri+, may be followed.
#
# @param uri a URL string (parsed with Addressable::URI.parse) or an
#   already-parsed URI object, which is used as is
# @param base_uri the page the link came from; strings are parsed the same way
# @param filter optional pre-resolved filter; when nil it is looked up with
#   @config.find_filter(base_uri)
# @return true when no filter applies; otherwise the result of the chained
#   checks (the first falsy check result, or the robots check result)
def allowed?(uri, base_uri, filter: nil)
  target = uri.is_a?(String) ? Addressable::URI.parse(uri) : uri
  origin = base_uri.is_a?(String) ? Addressable::URI.parse(base_uri) : base_uri

  rules = filter || @config.find_filter(origin)
  # With no filter at all the URL is accepted outright; note that the
  # robots check below is not reached in that case.
  return true unless rules

  # Short-circuits on the first failing check, so later checks (including
  # the robots lookup) only run for URLs that passed everything before.
  focused_host?(target, origin, rules) &&
    focused_descendants?(target, origin, rules) &&
    allowed_url?(target, rules) &&
    allowed_host?(target, rules) &&
    allowed_path?(target, rules) &&
    allowed_ext?(target, rules) &&
    allowed_by_robots?(target)
end
filter(refs, base_url)
click to toggle source
# File lib/kudzu/agent/url_filterer.rb, line 9
#
# Keeps only the references that may be followed from +base_url+, logging a
# debug line for every reference that is kept or dropped.
#
# @param refs an enumerable of reference objects responding to +uri+ (used
#   for the decision) and +url+ (used in the log message)
# @param base_url the URL of the page the references were found on
# @return the subset of +refs+ for which allowed? is truthy
def filter(refs, base_url)
  base_uri = Addressable::URI.parse(base_url)
  # Resolve the filter once so it is not looked up again for every reference.
  matched_filter = @config.find_filter(base_uri)

  refs.select do |ref|
    passed = allowed?(ref.uri, base_uri, filter: matched_filter)
    message = passed ? "passed url: #{ref.url}" : "dropped url: #{ref.url}"
    Kudzu.log :debug, message
    passed
  end
end
Private Instance Methods
allowed_by_robots?(uri)
click to toggle source
# File lib/kudzu/agent/url_filterer.rb, line 71
#
# Asks the robots checker about +uri+, but only when one was supplied and the
# configuration's +respect_robots_txt+ is truthy; otherwise the URL passes.
#
# @param uri the URI handed to @robots.allowed?
# @return true when the robots check is skipped, else whatever
#   @robots.allowed?(uri) returns
def allowed_by_robots?(uri)
  # @robots is tested first, so @config is never consulted without a checker.
  if @robots && @config.respect_robots_txt
    @robots.allowed?(uri)
  else
    true
  end
end
allowed_ext?(uri, filter)
click to toggle source
# File lib/kudzu/agent/url_filterer.rb, line 65
#
# Checks the URI's file extension against the filter's extension rules.
#
# @param uri an object responding to +extname+
# @param filter an object responding to +allow_ext+ and +deny_ext+
# @return true when the URI has no extension; otherwise the result of
#   Util::Matcher.match? for the extension without its leading dot
def allowed_ext?(uri, filter)
  # to_s covers a nil extname; the leading dot is dropped before matching.
  extension = uri.extname.to_s.sub(/^\./, '')

  if extension.empty?
    true
  else
    Util::Matcher.match?(extension, allows: filter.allow_ext, denies: filter.deny_ext)
  end
end
allowed_host?(uri, filter)
click to toggle source
# File lib/kudzu/agent/url_filterer.rb, line 57
#
# Checks the URI's host against the filter's host rules.
#
# @param uri an object responding to +host+
# @param filter an object responding to +allow_host+ and +deny_host+
# @return the result of Util::Matcher.match? for the host
def allowed_host?(uri, filter)
  Util::Matcher.match?(
    uri.host,
    allows: filter.allow_host,
    denies: filter.deny_host
  )
end
allowed_path?(uri, filter)
click to toggle source
# File lib/kudzu/agent/url_filterer.rb, line 61
#
# Checks the URI's path against the filter's path rules.
#
# @param uri an object responding to +path+
# @param filter an object responding to +allow_path+ and +deny_path+
# @return the result of Util::Matcher.match? for the path
def allowed_path?(uri, filter)
  Util::Matcher.match?(
    uri.path,
    allows: filter.allow_path,
    denies: filter.deny_path
  )
end
allowed_url?(uri, filter)
click to toggle source
# File lib/kudzu/agent/url_filterer.rb, line 53
#
# Checks the full URL string against the filter's URL rules.
#
# @param uri the URI; its +to_s+ form is what gets matched
# @param filter an object responding to +allow_url+ and +deny_url+
# @return the result of Util::Matcher.match? for the URL string
def allowed_url?(uri, filter)
  Util::Matcher.match?(
    uri.to_s,
    allows: filter.allow_url,
    denies: filter.deny_url
  )
end
focused_descendants?(uri, base_uri, filter)
click to toggle source
# File lib/kudzu/agent/url_filterer.rb, line 46
#
# When the filter's +focus_descendants+ is set, accepts +uri+ only if it is on
# the same host as +base_uri+ and its directory starts with the base
# directory (compared case-insensitively).
#
# @param uri an object responding to +host+ and +path+
# @param base_uri the page the link was found on; same interface as +uri+
# @param filter an object responding to +focus_descendants+
# @return [Boolean] true when the restriction is off or the URI is at or
#   below the base directory on the same host; false otherwise
def focused_descendants?(uri, base_uri, filter)
  return true unless filter.focus_descendants
  return false unless uri.host == base_uri.host

  dir = Kudzu::Common.path_to_dir(uri.path)
  base_dir = Kudzu::Common.path_to_dir(base_uri.path)

  # \A anchors at the very start of the string. The previous ^ anchor also
  # matched right after any newline, so a path such as "/other/\n/docs/x"
  # slipped through a "/docs/" restriction. Calling match? on the regexp
  # (rather than on dir) yields a strict boolean and is false for a nil dir.
  /\A#{Regexp.escape(base_dir)}/i.match?(dir)
end
focused_host?(uri, base_uri, filter)
click to toggle source
# File lib/kudzu/agent/url_filterer.rb, line 41
#
# When the filter's +focus_host+ is set, accepts +uri+ only if its host equals
# the host of +base_uri+.
#
# @param uri an object responding to +host+
# @param base_uri the page the link was found on; also responds to +host+
# @param filter an object responding to +focus_host+
# @return true when the restriction is off, else the result of comparing
#   the two hosts with ==
def focused_host?(uri, base_uri, filter)
  if filter.focus_host
    uri.host == base_uri.host
  else
    true
  end
end