class Kudzu::Agent::UrlExtractor
Public Class Methods
new(config)
click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 4
# Creates an extractor that remembers the supplied configuration.
#
# @param config [Object] configuration object; stored as-is in @config and
#   passed to the ForHTML / ForXML extractors built by #extract
def initialize(config)
  @config = config
end
Public Instance Methods
extract(response)
click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 8
# Collects the link references found in a response and cleans up their URLs.
#
# HTML responses are delegated to ForHTML, XML responses to ForXML; any other
# response yields no references. Every reference's url is passed through
# #sanitize and then #normalize (resolved against response.url). References
# whose url ends up nil are dropped and duplicates are removed.
#
# @param response [#html?, #xml?, #url] the fetched response
# @return [Array] unique references with a non-nil url
def extract(response)
  extractor =
    if response.html?
      ForHTML.new(@config)
    elsif response.xml?
      ForXML.new(@config)
    end
  links = extractor ? extractor.extract(response) : []

  links.each do |link|
    # Two separate assignments on purpose: the sanitized value is written
    # back before normalization runs, exactly as the extractor always did.
    link.url = sanitize(link.url)
    link.url = normalize(link.url, response.url)
  end

  links.reject { |link| link.url.nil? }.uniq
end
Private Instance Methods
normalize(url, base_url)
click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 30
# Resolves +url+ against +base_url+ and returns it in a canonical string form.
#
# The joined URI is unencoded and re-encoded, its host is replaced by
# Addressable's normalized host, an empty path becomes "/", runs of slashes in
# the path collapse to a single slash, and the fragment is removed.
#
# @param url [String] the (possibly relative) URL taken from a document
# @param base_url [String] the URL of the document the link was found in
# @return [String, nil] the normalized URL; nil when the scheme is not
#   http/https, or when any step raises (the failure is logged as a warning)
def normalize(url, base_url)
  uri = Addressable::URI.parse(base_url).join(url)
  # Round-trip through unencode/encode and re-parse the result.
  uri = Addressable::URI.parse(Addressable::URI.encode(Addressable::URI.unencode(uri)))
  uri.host = uri.normalized_host
  # Addressable::URI#path yields "" rather than nil when no path is present,
  # so a nil check never fires; test for emptiness to default to the root path.
  uri.path = '/' if uri.path.to_s.empty?
  uri.path = uri.path.gsub(%r|/{2,}|, '/')
  uri.fragment = nil

  return nil unless %w[http https].include?(uri.scheme)
  # Re-parse the final string; an invalid result raises and is handled below.
  return nil unless Addressable::URI.parse(uri.to_s)

  uri.to_s
rescue => e
  Kudzu.log :warn, "failed to normalize url: #{url}", error: e
  nil
end
sanitize(url)
click to toggle source
# File lib/kudzu/agent/url_extractor.rb, line 26
# Trims surrounding whitespace from a raw URL and removes leading junk
# (whitespace or literal "%20" sequences) left in front of it.
#
# @param url [String, nil] raw URL text taken from a document
# @return [String, nil] the cleaned URL; nil when +url+ is nil, so a missing
#   URL does not raise here
def sanitize(url)
  return if url.nil?

  # String#strip only removes ASCII whitespace and NUL, so the regex handles
  # what it leaves behind. [[:space:]] covers Unicode whitespace (e.g. NBSP,
  # ideographic space) without an invisible literal character in the pattern.
  # `^` matches at every line start, so leading whitespace after an embedded
  # newline is removed as well.
  url.strip.gsub(/^([[:space:]]|%20)+/, '')
end