class MDUrl::Url

Constants

AUTO_ESCAPE

Allowed by RFCs, but cause of XSS attacks. Always escape these.

DELIMS

RFC 2396: characters reserved for delimiting URLs. We actually just auto-escape these.

HOSTLESS_PROTOCOL

Protocols that never have a hostname. These are also the protocols that can allow “unsafe” and “unwise” chars.

HOSTNAME_MAX_LEN
HOSTNAME_PART_PATTERN
HOSTNAME_PART_START
HOST_ENDING_CHARS
NON_HOST_CHARS

Characters that are never ever allowed in a hostname. Note that any invalid chars are also handled, but these are the ones that are expected to be seen, so we fast-path them.

PORT_PATTERN
PROTOCOL_PATTERN

define these here so at least they only have to be compiled once on the first module load.

SIMPLE_PATH_PATTERN

Special case for a simple path URL

SLASHED_PROTOCOL

Protocols that always contain a // bit.

UNWISE

RFC 2396: characters not allowed for various reasons.

Attributes

auth[RW]
hash[RW]
hostname[RW]
pathname[RW]
port[RW]
protocol[RW]
slashes[RW]

Public Class Methods

urlParse(url, slashesDenoteHost = false) click to toggle source
# File lib/mdurl-rb/parse.rb, line 97
# Convenience entry point: turns +url+ into a Url instance.
#
# url               - a String to parse, or an existing Url, which is handed
#                     back untouched.
# slashesDenoteHost - forwarded as-is to Url#parse.
#
# Returns the Url instance (never the return value of #parse).
def self.urlParse(url, slashesDenoteHost = false)
  if url.is_a?(Url)
    url
  else
    Url.new.tap { |parsed| parsed.parse(url, slashesDenoteHost) }
  end
end

Public Instance Methods

parse(url, slashesDenoteHost = false) click to toggle source
# File lib/mdurl-rb/parse.rb, line 106
# Parses +url+ and stores the pieces it finds on this object.
#
# Depending on the input, sets @protocol, @slashes, @auth, @hostname,
# @port (via parseHost), @hash, @search and @pathname. Components that do
# not occur in the input are not assigned.
#
# url               - String to parse; leading/trailing whitespace is ignored.
# slashesDenoteHost - when true, a leading "//" introduces a host even if no
#                     protocol is present (and the simple-path fast path is
#                     skipped).
#
# Returns self.
def parse(url, slashesDenoteHost = false)
  # trim before proceeding.
  # This is to support parse stuff like "  http://foo.com  \n"
  rest = url.strip

  # Only assigned on some branches below; declared up front so the later
  # lookups are plainly nil when those branches were not taken.
  lowerProto = nil
  slashes    = nil

  # Try fast path regexp, but only when there is no fragment at all.
  # Counting the pieces of url.split('#') is not a reliable test here:
  # String#split drops trailing empty fields, so "/foo#" would still
  # yield a single piece and the '#' would end up inside the pathname.
  if !slashesDenoteHost && !url.include?('#')
    simplePath = SIMPLE_PATH_PATTERN.match(rest)
    if simplePath
      @pathname = simplePath[1]
      @search   = simplePath[2] if simplePath[2]
      return self
    end
  end

  proto = PROTOCOL_PATTERN.match(rest)
  if proto
    proto      = proto[0]
    lowerProto = proto.downcase
    @protocol  = proto
    rest       = rest[proto.length..-1]
  end

  # figure out if it's got a host
  # user@server is *always* interpreted as a hostname, and url
  # resolution will treat //foo/bar as host=foo,path=bar because that's
  # how the browser resolves relative URLs.
  if slashesDenoteHost || proto || rest.match(/^\/\/[^@\/]+@[^@\/]+/)
    slashes = rest[0...2] == '//'
    if slashes && !(proto && HOSTLESS_PROTOCOL[proto])
      rest = rest[2..-1]
      @slashes = true
    end
  end

  if !HOSTLESS_PROTOCOL[proto] &&
     (slashes || (proto && !SLASHED_PROTOCOL[proto]))

    # there's a hostname.
    # the first instance of /, ?, ;, or # ends the host.
    #
    # If there is an @ in the hostname, then non-host chars *are* allowed
    # to the left of the last @ sign, unless some host-ending character
    # comes *before* the @-sign.
    # URLs are obnoxious.
    #
    # ex:
    # http://a@b@c/ => user:a@b host:c
    # http://a@b?@c => user:a host:c path:/?@c

    # v0.12 TODO(isaacs): This is not quite how Chrome does things.
    # Review our test case against browsers more comprehensively.

    # find the first instance of any HOST_ENDING_CHARS (nil when none occur)
    authLimit = nil
    HOST_ENDING_CHARS.length.times do |i|
      idx = rest.index(HOST_ENDING_CHARS[i])
      authLimit = idx if idx && (authLimit.nil? || idx < authLimit)
    end

    # at this point, either we have an explicit point where the
    # auth portion cannot go past, or the last @ char is the decider.
    atSign =
      if authLimit.nil?
        # atSign can be anywhere.
        rest.rindex('@')
      else
        # atSign must be in auth portion.
        # http://a@b/c@d => host:b auth:a path:/c@d
        rest[0..authLimit].rindex('@')
      end

    # Now we have a portion which is definitely the auth.
    # Pull that off.
    if atSign
      @auth = rest.slice(0...atSign)
      rest  = rest.slice((atSign + 1)..-1)
    end

    # the host is the remaining to the left of the first non-host char
    hostEnd = nil
    NON_HOST_CHARS.length.times do |i|
      idx = rest.index(NON_HOST_CHARS[i])
      hostEnd = idx if idx && (hostEnd.nil? || idx < hostEnd)
    end
    # if we still have not hit it, then the entire thing is a host.
    hostEnd ||= rest.length

    # A ':' directly before the host end stays with the remainder.
    # The hostEnd > 0 guard matters: with hostEnd == 0, rest[hostEnd - 1]
    # would be rest[-1] (the *last* character), and a trailing ':' would
    # push hostEnd to -1 and slice the path off as if it were the host.
    hostEnd -= 1 if hostEnd > 0 && rest[hostEnd - 1] == ':'
    host = rest.slice(0...hostEnd)
    rest = rest.slice(hostEnd..-1)

    # pull out port.
    parseHost(host)

    # we've indicated that there is a hostname,
    # so even if it's empty, it has to be present.
    @hostname ||= ''

    # if hostname begins with [ and ends with ]
    # assume that it's an IPv6 address.
    ipv6Hostname = @hostname.start_with?('[') && @hostname.end_with?(']')

    # validate a little.
    unless ipv6Hostname
      hostparts = @hostname.split(/\./)
      hostparts.each_with_index do |part, i|
        next if part.match?(HOSTNAME_PART_PATTERN)

        # we replace each non-ASCII char with a temporary placeholder;
        # we need this to make sure size of hostname is not
        # broken by replacing non-ASCII by nothing
        asciiPart = part.each_char.map { |ch| ch.ord > 127 ? 'x' : ch }.join

        # we test again with ASCII char only
        next if asciiPart.match?(HOSTNAME_PART_PATTERN)

        # This label is invalid: keep what came before it as the hostname
        # and push everything from the offending point back onto rest.
        validParts = hostparts.slice(0...i)
        notHost    = hostparts.slice((i + 1)..-1)
        bit = part.match(HOSTNAME_PART_START)
        if bit
          validParts.push(bit[1])
          notHost.unshift(bit[2])
        end
        # (an Integer length is always truthy in Ruby, so test emptiness)
        rest = notHost.join('.') + rest unless notHost.empty?
        @hostname = validParts.join('.')
        break
      end
    end

    @hostname = '' if @hostname.length > HOSTNAME_MAX_LEN

    # strip [ and ] from the hostname
    # the host field still retains them, though
    @hostname = @hostname[1, @hostname.length - 2] if ipv6Hostname
  end

  # chop off from the tail first.
  hashStart = rest.index('#')
  if hashStart
    # got a fragment string.
    @hash = rest.slice(hashStart..-1)
    rest  = rest.slice(0...hashStart)
  end
  qm = rest.index('?')
  if qm
    @search = rest.slice(qm..-1)
    rest    = rest.slice(0...qm)
  end
  @pathname = rest unless rest.nil? || rest == ''
  @pathname = '' if SLASHED_PROTOCOL[lowerProto] && @hostname && !@pathname

  self
end
parseHost(host) click to toggle source
# File lib/mdurl-rb/parse.rb, line 291
# Splits a trailing port off +host+ and records the results.
#
# When PORT_PATTERN matches, the matched suffix is removed from the host;
# @port is set to the suffix minus its first character, unless the suffix
# is a bare ':'. @hostname is then set to whatever host text remains.
#
# host - host string, possibly ending in a port suffix. A nil host leaves
#        @hostname unassigned.
#
# Returns the value assigned to @hostname, or nil when host is nil.
def parseHost(host)
  matched = PORT_PATTERN.match(host)
  unless matched.nil?
    suffix = matched[0]
    @port  = suffix[1..-1] unless suffix == ':'
    host   = host.slice(0, host.length - suffix.length)
  end
  @hostname = host if host
end