module Google::UrlCanonicalizer
Constants
- ANCHOR_REGEX
- URL_REGEX
Public Instance Methods
apply(input_url)
click to toggle source
# File lib/google/url_canonicalizer.rb, line 9
# Reduces +input_url+ to a canonical string form.
#
# Steps, in order:
# 1. Coerce to a String and transcode to ASCII-8BIT, replacing any character
#    that cannot be represented with '?'.
# 2. Remove all whitespace and whatever ANCHOR_REGEX matches.
# 3. Normalise percent-escapes via #unescape.
# 4. Match the lowercased URL against URL_REGEX and, on success, rebuild it
#    from its protocol, host, port, dir and query captures.
#
# NOTE(review): the whole URL is lowercased before matching, so path and
# query are lowercased too, not only the host. Kept as-is because callers may
# depend on the resulting strings; confirm this is intended.
#
# @param input_url [#to_s] the URL to canonicalize
# @return [String] the rebuilt URL, or the cleaned-up (not lowercased) input
#   when URL_REGEX does not match
def apply(input_url)
  url = input_url.to_s.encode("ASCII-8BIT", :invalid => :replace, :undef => :replace, :replace => '?')
  url = url.gsub(/\s/, '')
  url = url.gsub(ANCHOR_REGEX, '')
  url = unescape(url)

  m = URL_REGEX.match(url.downcase)
  return url unless m

  protocol = m[:protocol]
  host = m[:host]
  port = m[:port]
  dir = m[:dir]
  query = m[:query]

  # A missing protocol (or a bare '/') falls back to plain HTTP. The literal
  # is dup'ed so the in-place appends below never hit a frozen string.
  protocol = 'http://'.dup if protocol.nil? || protocol == '/'

  # Strip every leading and every trailing dot from the host; a single
  # trailing-dot removal would leave "example.com.." as "example.com.".
  host = host.sub(/\A\.+/, '').sub(/\.+\z/, '') if host

  if dir
    dir = dir.sub(%r{\A/+}, '')         # the separator is re-added when rebuilding
    dir = dir.gsub(%r{/+}, '/')         # collapse runs of slashes
    dir = dir.gsub(%r{/(?:\./)+}, '/')  # drop "/./" segments, including consecutive ones
  end

  protocol << host.to_s << port.to_s << '/' << dir.to_s << query.to_s
end
unescape(url)
click to toggle source
# File lib/google/url_canonicalizer.rb, line 26
# Percent-decodes +url+ repeatedly until decoding no longer changes it, then
# percent-encodes the result exactly once. Multiply-escaped input such as
# "%2541" therefore ends up in the same form as its singly-escaped
# equivalent.
#
# URI.unescape and URI.escape were removed in Ruby 3.0, so the RFC 2396
# parser object is called directly; it implements the same escaping rules.
# URI::RFC2396_PARSER is preferred where it exists because newer Rubies point
# URI::DEFAULT_PARSER at the RFC 3986 parser.
#
# @param url [String] the URL text to normalise
# @return [String] the fully decoded text, escaped once
def unescape(url)
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  decoded = parser.unescape(url)
  # Each pass can only shorten the string, so this loop always terminates.
  until decoded == url
    url = decoded
    decoded = parser.unescape(url)
  end

  parser.escape(decoded)
end