class Gammo::Tokenizer

Class implementing the HTML5 tokenization algorithm.

Constants

CommentToken
DoctypeToken
EOS

Represents end-of-string.

EOSError

Raised if an unexpected EOS is found. (Private API: marked `@!visibility private`.)

EndTagToken
ErrorToken
RAW_TAGS
RAW_TAGS_UNION
SelfClosingTagToken
StartTagToken
TextToken

Attributes

convert_null[RW]
raw[RW]
raw_tag[RW]
scanner[RW]

Public Class Methods

new(text, context: nil) click to toggle source
# File lib/gammo/tokenizer.rb, line 26
# Builds a tokenizer over +text+.
#
# @param text [String] HTML source to tokenize. The string is left untouched
#   (and may be frozen): the scanner operates on a UTF-8 tagged copy.
# @param context [String, nil] optional context element name. When its
#   lower-cased form is a raw-text tag (see raw_tag?), tokenization starts as
#   if that tag had just been opened; otherwise no raw tag is pending.
def initialize(text, context: nil)
  @text          = text
  # String#force_encoding mutates its receiver and raises FrozenError on a
  # frozen string, so re-tag a private copy instead of the caller's object.
  @scanner       = StringScanner.new(text.dup.force_encoding(Encoding::UTF_8))
  @raw_tag       = context && raw_tag?(context.downcase) ? context.downcase : ''
  @convert_null  = false
  @cdata_allowed = false
  @raw           = false
end

Public Instance Methods

allow_cdata!(b) click to toggle source
# File lib/gammo/tokenizer.rb, line 35
# Turns CDATA section handling on or off.
#
# @param b [Object] any value; only its truthiness is stored.
# @return [Boolean] the stored flag (strictly true or false).
def allow_cdata!(b)
  @cdata_allowed = b ? true : false
end
allow_cdata?() click to toggle source
# File lib/gammo/tokenizer.rb, line 39
# Reports whether CDATA sections are currently allowed.
#
# @return [Boolean] the value of the @cdata_allowed flag.
def allow_cdata?
  @cdata_allowed
end
next_is_not_raw_text!() click to toggle source
# File lib/gammo/tokenizer.rb, line 96
# Clears the pending raw tag name, so the text that follows is no longer
# treated as the content of a raw-text element.
#
# @return [String] the empty string now stored in @raw_tag.
def next_is_not_raw_text!
  @raw_tag = ''
end
next_token() click to toggle source
# File lib/gammo/tokenizer.rb, line 47
# Scans the input from the current scanner position and returns the next token.
#
# @return [Object] EOS when the scanner is exhausted (or a tag is cut short by
#   the end of input); otherwise whatever the matching helper builds: a text,
#   start/self-closing tag, end tag, comment, doctype or error token.
def next_token
  return EOS if scanner.eos?
  # Content following a raw-text start tag is scanned by a dedicated helper;
  # when that helper yields nothing, fall through to the regular path.
  if previous_token_is_raw_tag? && (token = next_token_for_raw_tag)
    return token
  end
  @raw          = false
  @convert_null = false
  # Start offset of this token; used to tell whether text precedes a "<".
  pos = scanner.pos
  buffer = ''.force_encoding(Encoding::ASCII)
  loop do
    break unless byte = scanner.get_byte
    buffer << byte
    # Anything other than "<" is plain text: keep accumulating.
    next if byte != ?<
    # Read the byte right after "<" to decide what kind of markup this is.
    break unless byte = scanner.get_byte
    buffer << byte
    # Text was collected before this "<": rewind over the two bytes just read,
    # drop them from the buffer and emit the text first. The markup is handled
    # by the next call.
    if pos < (scanner.pos - 2)
      scanner.pos -= 2
      buffer = buffer.slice(0, buffer.length - 2)
      return text_token(buffer)
    end
    # NOTE(review): `byte` may be a single byte of a multibyte character here;
    # confirm the regexp branch below tolerates such input.
    case byte
    when %r{[a-zA-Z]}
      # "<" + letter: un-read the letter so the tag scanner sees the full name.
      step_back
      return scan_start_tag
    when ?!           then return scan_markup_declaration
    # "<?..." becomes a comment whose text keeps the leading "?".
    when ??           then return comment_token(?? + scan_until_close_angle)
    when ?/
      # Input ends right after "</": hand back what was read as text.
      return text_token(buffer) if scanner.eos?
      # "</>" does not generate a token at all. treat this as empty comment token.
      return comment_token('') if scan(/>/)
      # Expects chars like "</a"
      return comment_token(scan_until_close_angle) unless check(/[a-zA-Z]/)
      begin
        tag = scan_tag(need_attribute: false)
      rescue EOSError
        return EOS
      end
      return error_token(pos) if tag.nil?
      return end_tag_token(tag)
    else
      # "<" followed by anything else is literal text: un-read the second byte
      # (scanner and buffer) and keep scanning with "<" left in the buffer.
      step_back
      buffer = buffer.slice(0, buffer.length - 1)
      next
    end
  end
  # The loop ended at end of input: emit whatever was consumed as text.
  return text_token(buffer) if pos < scanner.pos
  EOS
end
previous_token_is_raw_tag?() click to toggle source
# File lib/gammo/tokenizer.rb, line 43
# Tells whether a raw tag name is currently pending.
#
# @return [Boolean] true when raw_tag is not empty.
def previous_token_is_raw_tag?
  !raw_tag.empty?
end

Private Instance Methods

comment_token(text) click to toggle source
# File lib/gammo/tokenizer.rb, line 365
# Builds a CommentToken for +text+, forwarding the tokenizer's current
# raw and convert_null values.
#
# @param text [String] comment content.
# @return [CommentToken]
def comment_token(text)
  CommentToken.new(text, raw: raw, convert_null: convert_null)
end
doctype_token(text) click to toggle source
# File lib/gammo/tokenizer.rb, line 369
# Builds a DoctypeToken for +text+, forwarding the tokenizer's current
# raw and convert_null values.
#
# @param text [String] doctype content.
# @return [DoctypeToken]
def doctype_token(text)
  DoctypeToken.new(text, raw: raw, convert_null: convert_null)
end
end_tag_token(tag) click to toggle source
# File lib/gammo/tokenizer.rb, line 361
# Builds an EndTagToken from a scanned tag.
#
# @param tag [#name, #attributes] the tag produced by scan_tag.
# @return [EndTagToken] token carrying the tag name, the Tags.lookup result
#   for that name, and the tag's attributes.
def end_tag_token(tag)
  EndTagToken.new(
    tag.name,
    tag: Tags.lookup(tag.name),
    attributes: tag.attributes,
  )
end
error_token(pos) click to toggle source
# File lib/gammo/tokenizer.rb, line 357
# Builds an ErrorToken whose message quotes the input from +pos+ up to and
# including the character at the current scanner position.
#
# @param pos [Integer] byte offset (as reported by scanner.pos) where the
#   offending input starts.
# @return [ErrorToken]
def error_token(pos)
  source = scanner.string
  # scanner.pos values are byte offsets while String#slice works on character
  # indices, so translate both ends before slicing; otherwise the excerpt is
  # wrong as soon as multibyte characters precede it.
  first = source.byteslice(0, pos).length
  last  = scanner.charpos
  ErrorToken.new("unexpected token, #{source.slice(first..last)}")
end
next_token_for_raw_tag() click to toggle source
# File lib/gammo/tokenizer.rb, line 102
# Scans the content that follows a raw-text start tag.
#
# For "plaintext" everything up to the end of input becomes one raw text
# token; every other raw tag is delegated to scan_raw_or_rcdata.
#
# @return [Object, nil] the token, flagged with convert_null and with its data
#   reloaded, or nil when no token was produced or no input was consumed.
def next_token_for_raw_tag
  start = scanner.pos
  if raw_tag == 'plaintext'
    @raw = true
    token = text_token(scan_until(/\z/) || '')
  else
    token = scan_raw_or_rcdata
  end
  # Nothing usable was scanned: let the caller fall back to regular scanning.
  return unless token && scanner.pos > start

  @convert_null      = true
  token.convert_null = true
  token.load_data(token.data)
  token
end
peek(length:, target: matched) click to toggle source
# File lib/gammo/tokenizer.rb, line 170
# Returns the leading part of +target+ without moving the scanner.
#
# @param length [Integer] number of characters to take.
# @param target [String] string to read from; defaults to `matched`.
# @return [String, nil] the first +length+ characters of +target+.
def peek(length:, target: matched)
  target[0, length]
end
raw_tag?(name) click to toggle source
# File lib/gammo/tokenizer.rb, line 349
# Checks whether +name+ is listed in RAW_TAGS.
#
# @param name [String] tag name to look up (callers pass it lower-cased).
# @return [Boolean]
def raw_tag?(name)
  RAW_TAGS.include?(name)
end
scan_cdata() click to toggle source
# File lib/gammo/tokenizer.rb, line 322
# Scans a CDATA section whose "<!" prefix has already been consumed.
#
# @return [Object, nil] nil when the input does not continue with "[CDATA[";
#   otherwise a text token holding everything up to the closing "]]>" (which
#   is consumed but excluded), or up to the end of input when no terminator
#   is found.
def scan_cdata
  return unless scan(/\[CDATA\[/)
  content = ''
  closing_brackets = 0
  while (ch = scanner.get_byte)
    content << ch
    if ch == ?]
      closing_brackets += 1
    elsif ch == ?> && closing_brackets >= 2
      # Found "]]>": strip the terminator from the collected content.
      return text_token(content.slice(0, content.length - ']]>'.length))
    else
      # Any other byte (including a ">" without two preceding "]") breaks the run.
      closing_brackets = 0
    end
  end
  text_token(content)
end
scan_comment() click to toggle source
# File lib/gammo/tokenizer.rb, line 282
# Scans the body of a comment whose "<!--" opener has already been consumed.
#
# The comment ends at "-->" or "--!>", or at the end of input, in which case
# up to two trailing dashes are dropped. The terminator is consumed but never
# part of the returned text. Comments with no content ("<!-->", "<!--->", or
# "<!--" at end of input) yield an empty string.
#
# @return [CommentToken] token built by comment_token from the comment text.
def scan_comment
  # The two dashes of the "<!--" opener count towards a terminator, which is
  # why ">" immediately after the opener already closes the comment.
  count = 2
  buffer = ''
  loop do
    unless byte = scanner.get_byte
      # End of input: ignore at most two trailing dashes.
      count = 2 if count > 2
      # The dash count may exceed what was buffered (the opener's dashes are
      # not in the buffer), so never ask slice for a negative length.
      buffer = buffer.slice(0, [buffer.length - count, 0].max)
      break
    end
    buffer << byte
    case byte
    when ?-
      count += 1
      next
    when ?>
      if count >= 2
        # "-->"
        buffer = buffer.slice(0, [buffer.length - 3, 0].max)
        break
      end
    when ?!
      if count >= 2
        break unless byte = scanner.get_byte
        # "--!>"
        if byte == ?>
          # no need to count ">" as it's not appended to the buffer.
          buffer = buffer.slice(0, [buffer.length - 3, 0].max)
          break
        end
        # Not a terminator after all: the look-ahead byte is ordinary comment
        # content and must not be lost.
        buffer << byte
      end
    end
    count = 0
  end
  comment_token(buffer)
end
scan_doctype() click to toggle source
# File lib/gammo/tokenizer.rb, line 276
# Scans the remainder of a doctype declaration, after the "DOCTYPE" keyword.
#
# @return [Object] an error token when only whitespace remains before the end
#   of input; otherwise a doctype token holding the text up to the next ">".
def scan_doctype
  scan_whitespace
  if scanner.eos?
    error_token(scanner.pos)
  else
    doctype_token(scan_until_close_angle)
  end
end
scan_markup_declaration() click to toggle source
# File lib/gammo/tokenizer.rb, line 260
# Dispatches on what follows "<!": a comment, a doctype, a CDATA section
# (only while CDATA is allowed), or otherwise a bogus comment running to the
# next ">".
#
# @return [Object] the token produced by the matching helper. convert_null is
#   switched on when a CDATA section was scanned.
def scan_markup_declaration
  if scan(/--/)
    scan_comment
  elsif scan(/DOCTYPE/i)
    scan_doctype
  else
    section = scan_cdata if allow_cdata?
    if section
      self.convert_null = true
      section
    else
      comment_token(scan_until_close_angle)
    end
  end
end
scan_raw_or_rcdata() click to toggle source
# File lib/gammo/tokenizer.rb, line 119
# Scans the content of the pending raw-text or RCDATA element.
#
# "script" content is delegated to scan_script. For every other raw tag the
# input is consumed up to (not including) the element's own end tag, matched
# case-insensitively, or up to the end of input. The pending raw tag is
# cleared in all cases, and @raw is set to true except for "textarea" and
# "title".
#
# @return [Object, nil] a text token for the scanned content, or nil when the
#   content is empty.
def scan_raw_or_rcdata
  if raw_tag == 'script'
    token = scan_script
    @raw     = true
    @raw_tag = ''
    return token
  end
  # Tag names in HTML are case-insensitive, so "</TITLE>" must close "<title>"
  # just like "</title>" does. Built once, outside the scanning loop.
  end_tag_name = %r{#{Regexp.escape(raw_tag)}[>\s/]}i
  buffer = ''
  until scanner.eos?
    ch = scanner.get_byte
    buffer << ch
    break if scanner.eos?
    next if ch != ?<
    ch = scanner.get_byte
    buffer << ch
    break if scanner.eos?
    if ch != ?/
      # "<" not followed by "/": un-read the second byte so it is examined
      # again on its own (it may itself be a "<").
      buffer = buffer.slice(0, buffer.length - 1)
      scanner.unscan
      next
    end
    if scanner.check(end_tag_name)
      # Matching end tag: give "</" back to the scanner and stop before it.
      buffer = buffer.slice(0..-3)
      scanner.pos -= 2
      break
    end
  end
  @raw = raw_tag != 'textarea' && raw_tag != 'title'
  @raw_tag = ''
  text_token(buffer) unless buffer.empty?
end
scan_script() click to toggle source
# File lib/gammo/tokenizer.rb, line 151
# Scans script content by handing the shared scanner to a ScriptScanner.
#
# @return [Object] text token wrapping the ScriptScanner's result.
def scan_script
  script_scanner = ScriptScanner.new(scanner, raw_tag: raw_tag)
  text_token(script_scanner.scan)
end
scan_start_tag() click to toggle source
# File lib/gammo/tokenizer.rb, line 155
# Scans a start tag; the scanner must be positioned on the first letter of the
# tag name.
#
# When the tag is a raw-text tag, its name is remembered in @raw_tag so the
# following content is scanned as raw text.
#
# @return [Object] EOS when the input ends inside the tag; otherwise a
#   SelfClosingTagToken when the tag is flagged self-closing or a "/" directly
#   precedes the closing ">", else a StartTagToken.
def scan_start_tag
  begin
    tag = scan_tag(need_attribute: true)
  rescue EOSError
    return EOS
  end
  name = tag.name
  @raw_tag = name.downcase if raw_tag?(name)
  # scanner.pos is a byte offset, so inspect the byte at that offset. Indexing
  # the string by characters would hit the wrong position as soon as the
  # document contains multibyte characters before this tag.
  slash_before_close = scanner.string.byteslice(scanner.pos - 2, 1) == ?/
  token_class = (tag.self_closing? || slash_before_close) ? SelfClosingTagToken : StartTagToken
  token_class.new(
    name,
    tag: Tags.lookup(tag.name),
    attributes: tag.attributes,
  )
end
scan_tag(need_attribute: false) click to toggle source
# File lib/gammo/tokenizer.rb, line 224
# Scans a tag name and its attributes, starting at the first character of the
# name and consuming the closing ">" when there is one.
#
# @param need_attribute [Boolean] when false, attributes are still scanned
#   (to advance the scanner) but not collected.
# @return [Tag] tag with the lower-cased name and the collected attributes.
# @raise [EOSError] when the input ends before the tag name is delimited.
def scan_tag(need_attribute: false)
  # The name runs up to the first whitespace, "/" or ">".
  name = scan_until(%r{[\s/>]})
  name =
    if name
      # Leave "/" or ">" in the input so the attribute loop below sees it.
      step_back if name.end_with?(?/) || name.end_with?(?>)
      name.slice(0, name.length - 1)
    else
      if buf = scan_until(/\s/)
        step_back
        buf.slice(0, buf.length - 1)
      else
        # No delimiter at all: consume the rest of the input and signal that
        # the tag is incomplete.
        scan_until(/\z/)
        raise EOSError, "Couldn't find a token for representing end of the tag"
      end
    end
  name = name.downcase
  scan_whitespace
  # Input ended right after the name: return the tag without attributes.
  return Tag.new(name: name) if scanner.eos?
  attrs = []
  # Read attributes until the closing ">" is consumed.
  while !scan(/>/)
    key = scan_tag_attribute_key
    return Tag.new(name: name, attributes: attrs, self_closing: true) if key == ?/ && scan(/>/)
    # No usable key was scanned (e.g. a lone delimiter): try again.
    next unless key
    break if scanner.eos?
    scan_whitespace
    break if scanner.eos?
    value = scan_tag_attribute_value
    value = unescape(value, in_attribute: true) if value
    # Reaching the end of input here discards the attribute being scanned.
    break if scanner.eos?
    attrs << Attribute.new(key: key, value: value) if need_attribute
    scan_whitespace
    break if scanner.eos?
  end
  Tag.new(name: name, attributes: attrs, self_closing: false)
end
scan_tag_attribute_key() click to toggle source
# File lib/gammo/tokenizer.rb, line 178
# Scans one attribute name at the current position.
#
# @return [String, nil] the lower-cased name; the lower-cased remainder of the
#   input when no delimiter ("=", ">", whitespace or "/") follows; nil when
#   only a single delimiter character was consumed.
def scan_tag_attribute_key
  raw_key = scan_until(%r{[=>\s/]})
  return scan_until(/\z/).downcase unless raw_key
  return if raw_key.length < 2
  # "=" and ">" are needed by the caller, so hand them back to the scanner.
  step_back if raw_key.end_with?(?=, ?>)
  raw_key[0...-1].downcase
end
scan_tag_attribute_value() click to toggle source
# File lib/gammo/tokenizer.rb, line 187
# Scans an optional "=value" part of an attribute at the current position.
#
# @return [String, nil] the value without surrounding quotes; nil when no "="
#   follows, when the input ends at or right after the first value character,
#   or when ">" follows the "=" directly.
# @raise [EOSError] when a quoted value is never closed.
def scan_tag_attribute_value
  byte = scanner.get_byte
  # Not an "=": un-read the byte and report "no value". step_back evaluates to
  # the new position, which is truthy, so the `return` always runs.
  step_back && return if byte != ?=
  scan_whitespace
  # First character of the value: a quote, ">" or the start of a bare value.
  return unless quote = scanner.get_byte
  return if scanner.eos?
  case quote
  # "=" directly followed by ">": leave ">" for the caller, no value.
  when ?>     then step_back && return
  when ?', ?"
    # Quoted value: read up to the matching quote character.
    value = scan_until(/#{quote}/) 
    unless value
      # Unterminated quote: consume the rest; the following get_byte then
      # finds nothing and the error is raised.
      scan_until(/\z/)
      raise EOSError, "Couldn't find a token for representing end of the tag" unless byte = scanner.get_byte
    end
    # Drop the closing quote.
    value.slice(0, value.length - 1)
  else
    # Unquoted value: `quote` is really its first character.
    return quote if scanner.eos?
    return quote + scan_until(/\z/) unless value = scan_until(/[\s>]/)
    # Leave ">" in the input so the caller can close the tag.
    step_back if value.end_with?(?>)
    quote + value.slice(0, value.length - 1)
  end
end
scan_until_close_angle() click to toggle source
# File lib/gammo/tokenizer.rb, line 271
# Consumes input through the next ">" and returns what precedes it.
#
# @return [String] the text before ">" (the ">" itself is consumed), or the
#   whole remaining input when no ">" is left.
def scan_until_close_angle
  consumed = scan_until(/>/)
  return scan_until(/\z/) unless consumed

  consumed[0...-1]
end
scan_whitespace() click to toggle source
# File lib/gammo/tokenizer.rb, line 318
# Consumes a run of whitespace at the current position, if there is one.
#
# @return [Object] whatever `scan` returns for the whitespace pattern
#   (presumably the matched string, or nil when nothing matched; `scan` is
#   defined outside this view).
def scan_whitespace
  scan(/[\s]+/)
end
step_back() click to toggle source
# File lib/gammo/tokenizer.rb, line 174
# Moves the scanner one byte backwards.
#
# @return [Integer] the new scanner position.
def step_back
  scanner.pos = scanner.pos - 1
end
text_token(text) click to toggle source
# File lib/gammo/tokenizer.rb, line 353
# Builds a TextToken for +text+, forwarding the tokenizer's current
# raw and convert_null values.
#
# @param text [String] text content.
# @return [TextToken]
def text_token(text)
  TextToken.new(text, raw: raw, convert_null: convert_null)
end