module Gammo::Tokenizer::Escape

Constants

ESCAPE_REPLACEMENT_TABLE
LONGEST_ENTITY_WITHOUT_SEMICOLON
REPLACEMENT_TABLE

Public Instance Methods

escape(s) click to toggle source

Escapes the given string in place according to ESCAPE_REPLACEMENT_TABLE.

# File lib/gammo/tokenizer/escape.rb, line 52
# Escapes +s+ in place: every character matched by the pattern below is
# replaced with its entry in ESCAPE_REPLACEMENT_TABLE.
#
# @param [String] s the string to escape; it is mutated.
# @return [String] +s+ itself, whether or not anything was replaced.
def escape(s)
  # String#gsub! returns nil when nothing matched, so hand back the receiver
  # explicitly to give callers a usable String in every case.
  s.gsub!(/[&'<>"\r]/) { |ch| ESCAPE_REPLACEMENT_TABLE[ch] }
  s
end
unescape(data, **options) click to toggle source

Unescapes the given data. Parameter: data (String). Returns: a String, or nil when data is nil.

# File lib/gammo/tokenizer/escape.rb, line 59
# Unescapes the entities found in +data+.
#
# Nothing is touched until the first "&" byte. From that byte onwards +data+
# is rewritten in place: each "&" is handed to unescape_entity, every other
# byte is copied from the read offset (+src+) to the write offset (+dst+).
#
# @param [String] data the text to unescape; mutated when it contains "&".
# @param [Hash] options keyword options forwarded to unescape_entity.
# @return [String, nil] nil when +data+ is nil, +data+ itself when it holds
#   no "&", otherwise the first +dst+ bytes of the rewritten +data+.
def unescape(data, **options)
  return unless data
  ampersand = '&'.ord
  first = data.each_byte.find_index(ampersand)
  return data unless first
  dst, src = unescape_entity(data, first, first, **options)
  # The byte size is re-read on every pass because the helpers receive +data+
  # and may modify it; String#bytesize avoids building a byte array each time.
  while src < data.bytesize
    byte = data.getbyte(src)
    if byte == ampersand
      dst, src = unescape_entity(data, dst, src, **options)
    else
      data.setbyte(dst, byte)
      dst += 1
      src += 1
    end
  end
  data.byteslice(0, dst)
end

Private Instance Methods

consume_entity_chars(s, i) click to toggle source
# File lib/gammo/tokenizer/escape.rb, line 151
# Scans +s+ from index +i+ over a run of ASCII letters and digits and returns
# the index just past that run. A ";" directly after the run is included.
#
# @param [String] s the text being scanned.
# @param [Integer] i the index at which scanning starts.
# @return [Integer] the index of the first character that was not consumed.
def consume_entity_chars(s, i)
  alnum = lambda do |c|
    c.between?('a', 'z') || c.between?('A', 'Z') || c.between?('0', '9')
  end
  cursor = i
  cursor += 1 while cursor < s.length && alnum.call(s[cursor])
  # A terminating semicolon belongs to the entity; anything else does not.
  cursor += 1 if cursor < s.length && s[cursor] == ';'
  cursor
end
like_query_params?(name, s, i) click to toggle source
# File lib/gammo/tokenizer/escape.rb, line 162
# Tells whether +name+ lacks a closing ";" and is directly followed by "="
# at index +i+ of +s+.
#
# @param [String] name the entity name that was consumed.
# @param [String] s the text the name was taken from.
# @param [Integer] i the index of the character following the name.
# @return [Boolean]
def like_query_params?(name, s, i)
  return false if name[-1] == ';'

  i < s.length && s[i] == '='
end
replace_entity(entities, t, dst, src, i) click to toggle source
# File lib/gammo/tokenizer/escape.rb, line 166
# Writes the bytes of every string in +entities+ into +t+, back to back,
# starting at byte offset +dst+.
#
# @param [Array<String>] entities the replacement strings to write.
# @param [String] t the buffer that is overwritten in place.
# @param [Integer] dst the byte offset at which writing starts.
# @param [Integer] src the byte offset the entity was read from.
# @param [Integer] i the number of source bytes the entity occupied.
# @return [Array(Integer, Integer)] the write offset after the last byte
#   written, and +src+ advanced by +i+.
def replace_entity(entities, t, dst, src, i)
  cursor = dst
  entities.each do |entity|
    entity.each_byte do |byte|
      t.setbyte(cursor, byte)
      cursor += 1
    end
  end
  [cursor, src + i]
end
swap(data, dst, src) click to toggle source
# File lib/gammo/tokenizer/escape.rb, line 146
# Copies the single byte at offset +src+ of +data+ to offset +dst+.
#
# Both offsets are byte offsets, so the copy uses getbyte/setbyte. Character
# indexing (+data[src]+) would address the wrong position, or nothing at all,
# whenever multibyte characters precede the offset.
#
# @param [String] data the buffer that is modified in place.
# @param [Integer] dst the byte offset to write to.
# @param [Integer] src the byte offset to read from.
# @return [Array(Integer, Integer)] both offsets advanced by one.
def swap(data, dst, src)
  data.setbyte(dst, data.getbyte(src))
  [dst + 1, src + 1]
end
unescape_entity(data, dst, src, in_attribute: false) click to toggle source
# File lib/gammo/tokenizer/escape.rb, line 79
# Handles the entity candidate that starts with the "&" at byte offset +src+
# of +data+, writing the outcome at byte offset +dst+.
#
# A "&#" prefix is delegated to unescape_sharp_entity. Otherwise the name is
# looked up in Entity::CODEPOINT and Entity::TWO_CODEPOINTS; outside of
# attributes, progressively shorter prefixes of the name are tried as well.
# When nothing matches, the consumed bytes are copied through unchanged.
#
# @param [String] data the buffer that is rewritten in place.
# @param [Integer] dst the byte offset to write at.
# @param [Integer] src the byte offset of the "&" being examined.
# @param [Boolean] in_attribute when true, a name followed by "=" is left
#   alone and the prefix fallback is skipped.
# @return [Array(Integer, Integer)] the updated write and read offsets.
def unescape_entity(data, dst, src, in_attribute: false)
  # No need to count "&".
  i, s = 1, data.byteslice(src..-1)
  # A lone trailing "&" has nothing to decode: copy it through and stop here
  # instead of falling into the lookup below.
  return swap(data, dst, src) if s.length <= 1
  return unescape_sharp_entity(data, s, dst, src, i) if s[i] == ?#
  i = consume_entity_chars(s, i)
  name = s.byteslice(1, i - 1)
  unless name == '' || (in_attribute && like_query_params?(name, s, i))
    key = name.to_sym
    entities = Entity::CODEPOINT[key]
    entities = entities ? [entities] : Entity::TWO_CODEPOINTS[key]
    return replace_entity(entities, data, dst, src, i) if entities
    unless in_attribute
      max = [name.length - 1, LONGEST_ENTITY_WITHOUT_SEMICOLON].min
      max.downto(1) do |n|
        entity = Entity::CODEPOINT[name.byteslice(0, n).to_sym]
        # n name bytes plus the leading "&" were consumed from the source.
        return replace_entity([entity], data, dst, src, n + 1) if entity
      end
    end
  end
  # No entity matched: move the i consumed bytes from src to dst one byte at a
  # time. dst never exceeds src, so a forward copy is safe when the two ranges
  # overlap, and the buffer keeps its size.
  i.times { |offset| data.setbyte(dst + offset, data.getbyte(src + offset)) }
  [dst + i, src + i]
end
unescape_sharp_entity(data, s, dst, src, i) click to toggle source
# File lib/gammo/tokenizer/escape.rb, line 105
# Decodes a numeric reference ("&#...") that starts at byte offset +src+ of
# +data+ and writes the UTF-8 bytes of the resulting character at +dst+.
#
# Digits are read as decimal, or as hexadecimal after an "x"/"X"; a ";"
# directly after the digits is consumed too. Values 0x80..0x9F are mapped
# through REPLACEMENT_TABLE. Zero, surrogates (0xD800..0xDFFF) and values
# above 0x10FFFF become U+FFFD.
#
# @param [String] data the buffer that is rewritten in place.
# @param [String] s the bytes of +data+ from +src+ onwards.
# @param [Integer] dst the byte offset to write at.
# @param [Integer] src the byte offset of the "&".
# @param [Integer] i the index in +s+ of the "#".
# @return [Array(Integer, Integer)] the updated write and read offsets.
def unescape_sharp_entity(data, s, dst, src, i)
  # With three or fewer characters left, the "&" is copied through undecoded.
  return swap(data, dst, src) if s.length <= 3
  i += 1
  hex = false
  if s[i] == ?x || s[i] == ?X
    hex = true
    i += 1
  end
  # Integer accumulator. It must not start out as a String: "&#x;" consumes
  # no digit at all and still reaches the range checks below.
  x = 0
  while i < s.length
    ch = s[i]
    i += 1
    if hex
      if ?0 <= ch && ch <= ?9
        x = 16 * x + ch.ord - ?0.ord
        next
      elsif ?a <= ch && ch <= ?f
        x = 16 * x + ch.ord - ?a.ord + 10
        next
      elsif ?A <= ch && ch <= ?F
        x = 16 * x + ch.ord - ?A.ord + 10
        next
      end
    elsif ?0 <= ch && ch <= ?9
      x = 10 * x + ch.ord - ?0.ord
      next
    end
    # Only a terminating ";" is part of the reference.
    i -= 1 if ch != ?;
    break
  end
  # Nothing beyond "&#" plus at most one character was consumed.
  return swap(data, dst, src) if i <= 3
  if 0x80 <= x && x <= 0x9F
    x = REPLACEMENT_TABLE[x - 0x80].ord
  elsif x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF
    x = 0xFFFD
  end
  encoded = x.chr(Encoding::UTF_8)
  encoded.each_byte.with_index { |byte, j| data.setbyte(dst + j, byte) }
  [dst + encoded.bytesize, src + i]
end