module Google::UrlScramble

Public Instance Methods

gen(url) click to toggle source

+url+ is an already canonicalized URL.

# File lib/google/url_scramble.rb, line 6
# Builds the host and host+path lookup combinations for a URL.
#
# +url+ is expected to be an already canonicalized URL. It is matched against
# Google::UrlCanonicalizer::URL_REGEX; only the :host, :dir and :query named
# captures are used.
#
# Returns a two-element array <tt>[host_keys, urls]</tt>:
#
# * +host_keys+ - candidate hosts with a trailing "/". For a host containing
#   any character other than digits and dots, only candidates with one or two
#   dots are kept; a host made solely of digits and dots is kept as is.
# * +urls+ - every candidate host concatenated with every candidate path
#   (hosts vary slowest, paths fastest).
#
# Returns <tt>[[], []]</tt> when the URL does not match the regex or when the
# match has no host.
#
# NOTE(review): this resembles the host-suffix / path-prefix expressions used
# for Safe Browsing lookups, but the path prefixes built here carry no
# trailing slash and cover up to four leading segments - confirm that is
# intended before changing it; callers may depend on the current output.
def gen(url)
  m = Google::UrlCanonicalizer::URL_REGEX.match(url)
  return [[], []] if m.nil?

  host = m[:host]
  dir = m[:dir]
  query = m[:query]
  return [[], []] if host.nil?

  # Host candidates: the exact host first, then (for non-numeric hosts only)
  # progressively shorter suffixes.
  hosts = [host]
  if host =~ /[^\d.]/
    # Only the last six labels are considered; the leading one of those is
    # dropped first, and a bare single label is never emitted.
    labels = host.split('.').last(6)
    (1..labels.size - 2).each { |i| hosts << labels[i..-1].join('.') }
    host_keys = hosts.select { |candidate| candidate.count('.').between?(1, 2) }
  else
    # Digits and dots only (e.g. an IPv4 address): no suffix expansion.
    host_keys = hosts
  end
  host_keys = host_keys.map { |candidate| "#{candidate}/" }

  # Path candidates, in order: full path, full path + query, root, then the
  # cumulative prefixes of the first four path segments.
  relative = dir.to_s.sub(%r{\A/}, '') # remove the leading slash
  paths = ["/#{relative}"]
  # Skip the query variant when the query is nil, empty or whitespace-only.
  has_query = query.to_s =~ /[^[:space:]]/
  paths << "/#{relative}#{query}" if has_query
  paths << '/'

  prefix = ''
  relative.split('/').first(4).each do |segment|
    # `+=` allocates a new string each time, so prefixes already pushed onto
    # +paths+ are not modified by later iterations.
    prefix += "/#{segment}"
    paths << prefix
  end
  paths.uniq!

  urls = hosts.product(paths).map { |candidate, path| candidate + path }

  [host_keys, urls]
end