class Twingly::URL

Constants

ACCEPTED_SCHEMES
CARRIAGE_RETURN
CUSTOM_PSL
ENDS_WITH_SLASH
ERRORS_TO_EXTEND
LEADING_AND_TRAILING_WHITESPACE
LINE_FEED
NBSP
SPACE
STARTS_WITH_WWW
VERSION
WHITESPACE_CHARS

Attributes

addressable_uri[R]
public_suffix_domain[R]

Public Class Methods

new(addressable_uri, public_suffix_domain) click to toggle source
# File lib/twingly/url.rb, line 113
# Stores the two already-parsed collaborators this URL is built from.
#
# @param addressable_uri the parsed URI, later read through +addressable_uri+
# @param public_suffix_domain the parsed domain, later read through
#   +public_suffix_domain+
def initialize(addressable_uri, public_suffix_domain)
  @public_suffix_domain = public_suffix_domain
  @addressable_uri = addressable_uri
end
parse(potential_url) click to toggle source
# File lib/twingly/url.rb, line 49
# Parses +potential_url+ and never lets a parse failure escape as an error.
#
# Delegates to +internal_parse+. Any exception matching Twingly::URL::Error
# (or ParseError) is swallowed and replaced by a fresh NullURL. Every other
# exception is tagged with the Twingly::URL::Error module and re-raised.
#
# @param potential_url the value to parse
# @return the result of +internal_parse+, or a NullURL instance
def parse(potential_url)
  internal_parse(potential_url)
rescue Twingly::URL::Error, Twingly::URL::Error::ParseError
  NullURL.new
rescue Exception => unexpected
  # Tag the exception so callers can rescue Twingly::URL::Error, then re-raise it.
  unexpected.extend(Twingly::URL::Error)
  raise
end

Private Class Methods

clean_input(input) click to toggle source
# File lib/twingly/url.rb, line 82
# Coerces +input+ to a String, replaces invalid byte sequences via
# String#scrub, and hands the result to +strip_whitespace+.
#
# @param input the raw value to clean
# @return whatever +strip_whitespace+ returns for the scrubbed string
def clean_input(input)
  scrubbed = String(input).scrub
  strip_whitespace(scrubbed)
end
internal_parse(input) click to toggle source
# File lib/twingly/url.rb, line 58
# Builds a URL from +input+ or raises.
#
# Steps: clean the input, parse it with Addressable's heuristic parser,
# require the scheme to match ACCEPTED_SCHEMES, check the URI can be
# normalized, and parse the host against CUSTOM_PSL.
#
# @param input the raw value to parse
# @return a new instance wrapping the parsed URI and public suffix domain
# @raise [Twingly::URL::Error::ParseError] when the parser returns nil, the
#   scheme is not accepted, or the domain (or its +sld+) is nil
# @raise any error listed in ERRORS_TO_EXTEND, tagged with Twingly::URL::Error
def internal_parse(input)
  cleaned         = clean_input(input)
  addressable_uri = Addressable::URI.heuristic_parse(cleaned)
  raise Twingly::URL::Error::ParseError if addressable_uri.nil?
  raise Twingly::URL::Error::ParseError unless addressable_uri.scheme =~ ACCEPTED_SCHEMES

  # URLs that can't be normalized should not be valid
  try_addressable_normalize(addressable_uri)

  public_suffix_domain = PublicSuffix.parse(
    addressable_uri.host,
    list: CUSTOM_PSL,
    default_rule: nil
  )
  if public_suffix_domain.nil? || public_suffix_domain.sld.nil?
    raise Twingly::URL::Error::ParseError
  end

  new(addressable_uri, public_suffix_domain)
rescue *ERRORS_TO_EXTEND => known_error
  # Tag third-party errors so callers can rescue Twingly::URL::Error.
  known_error.extend(Twingly::URL::Error)
  raise
end
strip_whitespace(input) click to toggle source
# File lib/twingly/url.rb, line 88
# Removes every match of LEADING_AND_TRAILING_WHITESPACE from +input+.
#
# Only UTF-8 encoded strings are processed; a string in any other encoding
# is returned as the same object, untouched.
#
# @param input [String]
# @return [String]
def strip_whitespace(input)
  if input.encoding == Encoding::UTF_8
    input.gsub(LEADING_AND_TRAILING_WHITESPACE, "")
  else
    input
  end
end
try_addressable_normalize(addressable_uri) click to toggle source

Workaround for the following bug in addressable: github.com/sporkmonger/addressable/issues/224

# File lib/twingly/url.rb, line 96
# Calls +normalize+ on +addressable_uri+ and returns its result.
#
# An ArgumentError whose message contains "invalid byte sequence in UTF-8"
# is converted into a ParseError. Any other ArgumentError is re-raised as is.
#
# @param addressable_uri the URI object to normalize
# @raise [Twingly::URL::Error::ParseError] for the invalid-UTF-8 case
# @raise [ArgumentError] for every other ArgumentError from +normalize+
def try_addressable_normalize(addressable_uri)
  addressable_uri.normalize
rescue ArgumentError => failure
  raise unless failure.message.include?("invalid byte sequence in UTF-8")

  raise Twingly::URL::Error::ParseError
end

Public Instance Methods

<=>(other) click to toggle source
# File lib/twingly/url.rb, line 211
# Orders this object against +other+ by comparing their string forms.
#
# @param other any object; its +to_s+ is used
# @return the result of String#<=> on the two strings
def <=>(other)
  to_s <=> other.to_s
end
domain() click to toggle source
# File lib/twingly/url.rb, line 142
# Delegates to +public_suffix_domain.domain+.
#
# @return the +domain+ value reported by the public suffix domain object
def domain
  public_suffix_domain.domain
end
eql?(other) click to toggle source
# File lib/twingly/url.rb, line 215
# Strict equality used by Hash and similar containers.
#
# Two objects are +eql?+ when +other+ is an instance of this object's class
# (or a subclass) and both have the same string form. The strings are
# compared directly rather than through their hash codes, because distinct
# strings can share a hash code. This stays consistent with +hash+, which
# is derived from +to_s+.
#
# @param other any object
# @return [Boolean]
def eql?(other)
  return false unless other.is_a?(self.class)

  to_s == other.to_s
end
hash() click to toggle source
# File lib/twingly/url.rb, line 221
# Hash code derived from the string form, so objects with equal +to_s+
# produce equal hash codes.
#
# @return [Integer]
def hash
  to_s.hash
end
host() click to toggle source
# File lib/twingly/url.rb, line 146
# Delegates to +addressable_uri.host+.
#
# @return the host reported by the underlying URI object
def host
  addressable_uri.host
end
inspect() click to toggle source
# File lib/twingly/url.rb, line 229
# Developer-facing representation: class name, object id in hexadecimal,
# and the string form, e.g. "#<ClassName:0x3c http://example.com/>".
#
# @return [String]
def inspect
  format("#<%s:0x%x %s>", self.class.name, __id__, to_s)
end
normalized() click to toggle source
# File lib/twingly/url.rb, line 162
# Returns a normalized version of this URL.
#
# Works on a copy of the underlying URI: scheme, host and path are replaced
# by +normalized_scheme+, +normalized_host+ and +normalized_path+, and the
# copy is then fed back through the class-level +parse+.
#
# @return whatever +self.class.parse+ returns for the rewritten URI
def normalized
  rewritten = addressable_uri.dup.tap do |uri|
    uri.scheme = normalized_scheme
    uri.host   = normalized_host
    uri.path   = normalized_path
  end

  self.class.parse(rewritten)
end
normalized_host() click to toggle source
# File lib/twingly/url.rb, line 176
# Computes the host used by +normalized+.
#
# Starts from the URI's own normalized host, prefixes "www." when the
# public suffix domain reports no subdomain, and finally passes the result
# through +normalize_blogspot+.
#
# @return the value returned by +normalize_blogspot+
def normalized_host
  hostname      = addressable_uri.normalized_host
  suffix_domain = public_suffix_domain

  hostname = "www.#{hostname}" unless suffix_domain.subdomain?

  normalize_blogspot(hostname, suffix_domain)
end
normalized_path() click to toggle source
# File lib/twingly/url.rb, line 189
# Computes the path used by +normalized+: the URI path run through
# +strip_trailing_slashes+, falling back to "/" when nothing is left.
#
# @return [String]
def normalized_path
  stripped = strip_trailing_slashes(addressable_uri.path)
  return "/" if stripped.empty?

  stripped
end
normalized_scheme() click to toggle source
# File lib/twingly/url.rb, line 172
# Computes the scheme used by +normalized+: the current scheme, downcased.
#
# @return the downcased result of +scheme+
def normalized_scheme
  scheme.downcase
end
origin() click to toggle source
# File lib/twingly/url.rb, line 150
# Delegates to +addressable_uri.origin+.
#
# @return the origin reported by the underlying URI object
def origin
  addressable_uri.origin
end
password() click to toggle source
# File lib/twingly/url.rb, line 203
# Password component of the underlying URI, converted with +to_s+
# (so a nil password becomes "").
#
# @return [String]
def password
  addressable_uri.password.to_s
end
path() click to toggle source
# File lib/twingly/url.rb, line 154
# Delegates to +addressable_uri.path+.
#
# @return the path reported by the underlying URI object
def path
  addressable_uri.path
end
scheme() click to toggle source
# File lib/twingly/url.rb, line 118
# Delegates to +addressable_uri.scheme+.
#
# @return the scheme reported by the underlying URI object
def scheme
  addressable_uri.scheme
end
sld() click to toggle source
# File lib/twingly/url.rb, line 126
# Delegates to +public_suffix_domain.sld+.
#
# @return the +sld+ value reported by the public suffix domain object
def sld
  public_suffix_domain.sld
end
tld() click to toggle source
# File lib/twingly/url.rb, line 130
# Delegates to +public_suffix_domain.tld+.
#
# @return the +tld+ value reported by the public suffix domain object
def tld
  public_suffix_domain.tld
end
to_s() click to toggle source
# File lib/twingly/url.rb, line 225
# String form of this URL, taken from the underlying URI object.
#
# @return the result of +addressable_uri.to_s+
def to_s
  addressable_uri.to_s
end
trd() click to toggle source
# File lib/twingly/url.rb, line 122
# The +trd+ value of the public suffix domain, converted with +to_s+
# (so a nil value becomes "").
#
# @return [String]
def trd
  public_suffix_domain.trd.to_s
end
ttld() click to toggle source

Many ccTLDs have a second level underneath their ccTLD [1]; use this when you don't care about the second level.

[1]: en.wikipedia.org/wiki/Second-level_domain

# File lib/twingly/url.rb, line 138
# Last dot-separated label of +tld+, e.g. "uk" for a +tld+ of "co.uk".
#
# @return [String, nil] nil when +tld+ splits into no labels
def ttld
  labels = tld.split(".")
  labels.last
end
user() click to toggle source
# File lib/twingly/url.rb, line 199
# User component of the underlying URI, converted with +to_s+
# (so a nil user becomes "").
#
# @return [String]
def user
  addressable_uri.user.to_s
end
userinfo() click to toggle source
# File lib/twingly/url.rb, line 195
# Userinfo component of the underlying URI, converted with +to_s+
# (so a nil userinfo becomes "").
#
# @return [String]
def userinfo
  addressable_uri.userinfo.to_s
end
valid?() click to toggle source
# File lib/twingly/url.rb, line 207
# Always true for this class.
#
# @return [true]
def valid?
  true
end
without_scheme() click to toggle source
# File lib/twingly/url.rb, line 158
# Returns the string form with a leading "<scheme>:" removed, so
# "http://example.com/" becomes "//example.com/".
#
# The scheme is escaped before being placed in the pattern, so it is always
# matched literally even if it contains regular-expression metacharacters.
#
# @return [String]
def without_scheme
  to_s.sub(/\A#{Regexp.escape(scheme)}:/, "")
end

Private Instance Methods

normalize_blogspot(host, domain) click to toggle source
# File lib/twingly/url.rb, line 237
# Rewrites blogspot hosts to a single form and leaves every other host alone.
#
# When the domain's +sld+ is "blogspot" (case-insensitive), the first match
# of STARTS_WITH_WWW is removed from +host+ and the trailing +tld+ is
# replaced by "com". The +tld+ is escaped before being placed in the
# pattern, so the dot in a multi-label suffix such as "co.uk" only matches
# a literal dot.
#
# @param host [String] the host to rewrite
# @param domain an object responding to +sld+ and +tld+
# @return [String]
def normalize_blogspot(host, domain)
  return host unless domain.sld.downcase == "blogspot"

  host.sub(STARTS_WITH_WWW, "").sub(/#{Regexp.escape(domain.tld)}\z/i, "com")
end
strip_trailing_slashes(path) click to toggle source
# File lib/twingly/url.rb, line 245
# Removes the first match of ENDS_WITH_SLASH from +path+.
# NOTE(review): presumably the pattern matches the run of slashes at the end
# of the path; confirm against the constant's definition.
#
# @param path [String]
# @return [String] a new string; +path+ itself is not modified
def strip_trailing_slashes(path)
  path.sub(ENDS_WITH_SLASH, "")
end