module Google::UrlCanonicalizer

Constants

ANCHOR_REGEX
URL_REGEX

Public Instance Methods

apply(input_url)
# File lib/google/url_canonicalizer.rb, line 9
# Canonicalizes a URL so that equivalent spellings produce the same string.
#
# Steps, in order: transcode to ASCII-8BIT (unconvertible characters become
# '?'), delete all whitespace, delete whatever ANCHOR_REGEX matches,
# normalise percent-escapes through #unescape, and finally -- when the
# lowercased result matches URL_REGEX -- rebuild the URL from its protocol,
# host, port, dir and query captures.
#
# Note that the match runs against the lowercased string, so every rebuilt
# component (including dir and query) is lowercase.
#
# @param input_url [#to_s] the URL to canonicalize
# @return [String] the canonical URL; if URL_REGEX does not match, the
#   cleaned-up (not lowercased) string is returned unchanged
def apply(input_url)
  url = input_url.to_s.encode('ASCII-8BIT', :invalid => :replace, :undef => :replace, :replace => '?')
  url = url.gsub(/\s/, '')
  url = url.gsub(ANCHOR_REGEX, '')
  url = unescape(url)

  m = URL_REGEX.match(url.downcase)
  return url unless m

  protocol, host, port, dir, query = m[:protocol], m[:host], m[:port], m[:dir], m[:query]

  # A missing protocol (or a bare '/') defaults to plain HTTP.
  protocol = 'http://' if protocol.nil? || protocol == '/'

  # Drop any leading dots and a single trailing dot from the host.
  host = host.sub(/\A\.*/, '').sub(/\.\z/, '') if host

  if dir
    dir = dir.sub(%r{\A/*}, '') # no leading slashes; one is re-added below
    dir = dir.gsub(%r{/+}, '/') # collapse runs of slashes
    # Collapse "/./" segments. The repeated group is required: a plain
    # gsub of "/./" is non-overlapping and would leave "a/././b" as "a/./b".
    dir = dir.gsub(%r{/(?:\./)+}, '/')
  end

  # Build with + rather than << so that neither the 'http://' literal nor a
  # capture is mutated (<< on the literal fails under frozen string literals).
  protocol + host.to_s + port.to_s + '/' + dir.to_s + query.to_s
end
unescape(url)
# File lib/google/url_canonicalizer.rb, line 26
# Fully percent-decodes +url+ and then percent-encodes it exactly once.
#
# Decoding is repeated until the string stops changing, so multiply-escaped
# input such as "%2541" ("%25" -> "%", then "%41" -> "A") is reduced all
# the way down before the single re-escape.
#
# @param url [String] the possibly (multiply) percent-escaped URL
# @return [String] the URL with one layer of RFC 2396 escaping applied
def unescape(url)
  # URI.escape / URI.unescape were removed in Ruby 3.0. The RFC 2396 parser
  # instance implements the same behaviour; newer uri versions expose it as
  # URI::RFC2396_PARSER, older ones only as URI::DEFAULT_PARSER.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  decoded = url
  loop do
    pass = parser.unescape(decoded)
    break if pass == decoded

    decoded = pass
  end

  parser.escape(decoded)
end