class Kudzu::Agent::UrlExtractor

Public Class Methods

new(config)
# File lib/kudzu/agent/url_extractor.rb, line 4
# Builds an extractor bound to +config+.
#
# The object is only stored here; #extract later hands it unchanged to the
# ForHTML / ForXML extractor it instantiates.
def initialize(config)
  @config = config
end

Public Instance Methods

extract(response)
# File lib/kudzu/agent/url_extractor.rb, line 8
# Collects the link references found in +response+.
#
# HTML responses are handed to ForHTML and XML responses to ForXML; any other
# response yields no references. Each reference has its +url+ rewritten in
# place: first through #sanitize, then through #normalize against
# +response.url+. References whose url ends up nil are dropped and duplicate
# references are removed from the returned array.
def extract(response)
  extractor_class =
    if response.html?
      ForHTML
    elsif response.xml?
      ForXML
    end

  found = extractor_class ? extractor_class.new(@config).extract(response) : []

  # Two separate assignments on purpose: the sanitized value is written back
  # to the reference before being read again for normalization.
  found.each do |link|
    link.url = sanitize(link.url)
    link.url = normalize(link.url, response.url)
  end

  usable = found.reject { |link| link.url.nil? }
  usable.uniq
end

Private Instance Methods

normalize(url, base_url)
# File lib/kudzu/agent/url_extractor.rb, line 30
# Resolves +url+ against +base_url+ and returns a canonical absolute URL
# string, or nil when the result is not an http/https URL or cannot be
# processed.
#
# Canonicalisation steps, in order:
# * join +url+ onto +base_url+
# * unencode and then re-encode the whole URI
# * replace the host with Addressable's normalized host
# * turn an empty path into "/"
# * collapse runs of "/" in the path into a single "/"
# * drop the fragment
#
# Any StandardError raised along the way is reported through Kudzu.log at
# :warn level and converted into a nil return value.
def normalize(url, base_url)
  uri = Addressable::URI.parse(base_url).join(url)
  uri = Addressable::URI.parse(Addressable::URI.encode(Addressable::URI.unencode(uri)))
  uri.host = uri.normalized_host
  # Addressable::URI#path yields "" (not nil) when the URI has no path, so the
  # check must be for emptiness; a nil test would never add the root slash.
  uri.path = '/' if uri.path.to_s.empty?
  uri.path = uri.path.gsub(%r{/{2,}}, '/')
  uri.fragment = nil

  # Array#include? keeps this independent of ActiveSupport's Object#in?.
  # The final string is parsed once more; if that raises, the rescue below
  # logs it and the URL is discarded.
  if %w[http https].include?(uri.scheme) && Addressable::URI.parse(uri.to_s)
    uri.to_s
  else
    nil
  end
rescue => e
  Kudzu.log :warn, "failed to normalize url: #{url}", error: e
  nil
end
sanitize(url)
# File lib/kudzu/agent/url_extractor.rb, line 26
# Cleans up a raw URL string before it is normalized.
#
# Surrounding ASCII whitespace is removed with String#strip. After that, any
# leading run of whitespace characters (including non-ASCII ones such as the
# no-break space and the ideographic space, which #strip leaves alone) and
# literal "%20" sequences is removed.
#
# The pattern is anchored with \A so only the very start of the string is
# affected; text following an embedded newline is left untouched.
#
# Returns nil when +url+ is nil instead of raising.
def sanitize(url)
  return nil if url.nil?

  url.strip.sub(/\A(?:[[:space:]]|%20)+/, '')
end