class Hocon::Impl::Tokenizer::TokenIterator

Constants

FIRST_NUMBER_CHARS

chars JSON allows a number to start with

NOT_IN_UNQUOTED_TEXT

chars that stop an unquoted string

NUMBER_CHARS

chars JSON allows to be part of a number

Public Class Methods

line_origin(base_origin, line_number) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 197
# Builds an origin for one specific line out of a base origin.
#
# @param base_origin [#with_line_number] origin to derive from
# @param line_number [Integer] line number handed to +with_line_number+
# @return [Object] whatever +base_origin.with_line_number+ returns
def self.line_origin(base_origin, line_number)
  origin_for_line = base_origin.with_line_number(line_number)
  origin_for_line
end
new(origin, input, allow_comments) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 108
# Prepares the iterator: stores its collaborators, starts counting lines at 1
# and seeds the token queue with Tokens::START.
#
# @param origin [#with_line_number] base origin used to derive per-line origins
# @param input [#readchar] character source the tokenizer reads from
# @param allow_comments [Boolean] whether '#' and '//' start comments
def initialize(origin, input, allow_comments)
  @origin, @input, @allow_comments = origin, input, allow_comments

  # look-ahead stack filled by put_back and drained by next_char_raw
  @buffer = []

  @line_number = 1
  @line_origin = @origin.with_line_number(@line_number)

  @tokens = [Tokens::START]
  @whitespace_saver = WhitespaceSaver.new
end
problem(origin, what, message, suggest_quotes, cause) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 190
# Wraps a problem token in a TokenizerProblemError. The error is returned,
# not raised; callers are expected to +raise+ it themselves.
#
# @param origin [Object] origin passed through to Tokens.new_problem
# @param what [Object] the offending input; must not be nil
# @param message [String] description of the problem; must not be nil
# @param suggest_quotes [Boolean] passed through to Tokens.new_problem
# @param cause [Exception, nil] passed through to Tokens.new_problem
# @return [TokenizerProblemError]
# @raise [ConfigBugOrBrokenError] when +what+ or +message+ is nil
def self.problem(origin, what, message, suggest_quotes, cause)
  missing_detail = what.nil? || message.nil?
  raise ConfigBugOrBrokenError, "internal error, creating bad TokenizerProblemError" if missing_detail

  problem_token = Tokens.new_problem(origin, what, message, suggest_quotes, cause)
  TokenizerProblemError.new(problem_token)
end
simple_value?(t) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 562
# Tells whether a token counts as a simple value: a substitution, unquoted
# text, or a value token, as judged by the corresponding Tokens predicates.
#
# @param t [Object] token to classify
# @return [Object] the first truthy predicate result, otherwise the result
#   of Tokens.value?
def self.simple_value?(t)
  verdict = Tokens.substitution?(t)
  verdict ||= Tokens.unquoted_text?(t)
  verdict ||= Tokens.value?(t)
  verdict
end
whitespace?(c) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 143
# Delegates the whitespace test to Hocon::Impl::ConfigImplUtil.
#
# @param c [Object] character to test
# @return [Object] result of ConfigImplUtil.whitespace?
def self.whitespace?(c)
  util = Hocon::Impl::ConfigImplUtil
  util.whitespace?(c)
end
whitespace_not_newline?(c) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 147
# Tells whether +c+ is whitespace other than a newline. The whitespace test
# itself is delegated to Hocon::Impl::ConfigImplUtil and is only consulted
# when +c+ is not "\n".
#
# @param c [Object] character to test
# @return [Object] false for "\n", otherwise the ConfigImplUtil result
def self.whitespace_not_newline?(c)
  c != "\n" && Hocon::Impl::ConfigImplUtil.whitespace?(c)
end

Public Instance Methods

append_triple_quoted_string(sb, sb_orig) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 380
# Consumes the body of a triple-quoted string up to and including its closing
# quotes. Called once the opening triple quote has already been read.
#
# Every character read is appended to both +sb+ and +sb_orig+. When the
# closing run of quotes is recognised, the last three quote characters are
# stripped from +sb+ only, so +sb_orig+ keeps them.
#
# @param sb [StringIO] receives the string's value (closing quotes removed)
# @param sb_orig [StringIO] receives the text as read, closing quotes included
# @raise [TokenizerProblemError] (built by self.class.problem) when input ends
#   before the string is closed
def append_triple_quoted_string(sb, sb_orig)
  # we are after the opening triple quote and need to consume the
  # close triple
  consecutive_quotes = 0

  while true
    c = next_char_raw

    if c == '"'
      consecutive_quotes += 1
    elsif consecutive_quotes >= 3
      # A non-quote char (or the -1 end-of-input marker) follows a run of at
      # least three quotes: the last three quotes end the string and any
      # earlier quotes in the run stay part of the value.
      sb.string = sb.string[0...-3]
      # c belongs to whatever comes after the string, so hand it back
      put_back c
      break
    else
      consecutive_quotes = 0
      if c == -1
        error_msg = "End of input but triple-quoted string was still open"
        raise self.class.problem(@line_origin, c, error_msg, false, nil)
      elsif c == "\n"
        # keep the line number accurate
        @line_number += 1
        @line_origin = @origin.with_line_number(@line_number)
      end
    end

    # quotes are appended too; the closing three are trimmed from sb above
    sb << c
    sb_orig << c
  end
end
each() { |next| ... } click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 600
# Yields every remaining token to the given block, in order.
#
# @yieldparam token [Object] the value returned by #next
# @return [nil]
def each
  # self.next, not a bare next: `next` on its own is a Ruby keyword
  yield self.next while has_next?
end
has_next?() click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 577
# Reports whether at least one token is still queued.
#
# @return [Boolean]
def has_next?
  return false if @tokens.empty?

  true
end
map() { |token| ... } click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 607
# Runs the given block over every remaining token (via #each) and collects
# the block's return values.
#
# @yieldparam token [Object] each token produced by #each
# @return [Array] the block results, in iteration order
def map
  collected = []
  each { |token| collected << yield(token) }
  collected
end
next() click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 581
# Removes and returns the token at the head of the queue. If that leaves the
# queue empty and the token was not Tokens::EOF, the following token is
# queued right away; a TokenizerProblemError raised while doing so is turned
# into its problem token and queued instead.
#
# @return [Object] the token that was at the head of the queue
# @raise [ConfigBugOrBrokenError] if the queue is still empty after refilling
def next
  head = @tokens.shift
  return head unless @tokens.empty? && head != Tokens::EOF

  begin
    queue_next_token
  rescue TokenizerProblemError => e
    @tokens.push(e.problem)
  end

  raise ConfigBugOrBrokenError, "bug: tokens queue should not be empty here" if @tokens.empty?

  head
end
next_char_after_whitespace(saver) click to toggle source

Gets the next char, skipping non-newline whitespace.

# File lib/hocon/impl/tokenizer.rb, line 175
# Reads characters until one is found that is not non-newline whitespace.
# Each skipped whitespace character is handed to +saver.add+.
#
# @param saver [#add] collector for the skipped whitespace characters
# @return [String, Integer] the first non-skipped character, or -1 at end of input
def next_char_after_whitespace(saver)
  while true
    ch = next_char_raw
    return -1 if ch == -1
    return ch unless self.class.whitespace_not_newline?(ch)

    saver.add(ch)
  end
end
next_char_raw() click to toggle source

This should ONLY be called from nextCharSkippingComments, from inside a quoted string, or when parsing a sequence like ${ or +=; everything else should use nextCharSkippingComments().

# File lib/hocon/impl/tokenizer.rb, line 124
# Returns the next character with no further processing. Characters handed
# back through put_back are served first (most recently returned first);
# otherwise one character is read from @input.
#
# @return [String, Integer] the character, or -1 once @input raises EOFError
def next_char_raw
  return @buffer.pop unless @buffer.empty?

  begin
    @input.readchar.chr
  rescue EOFError
    -1
  end
end
pull_comment(first_char) click to toggle source

ONE char has always been consumed, either the # or the first /, but not both slashes

# File lib/hocon/impl/tokenizer.rb, line 203
# Reads the remainder of a comment and returns the matching comment token.
# Exactly one character of the comment marker has been consumed by the
# caller: the '#', or the first '/' of '//'.
#
# The character that ends the comment ("\n" or the -1 end-of-input marker)
# is handed back with put_back and is not part of the comment text.
#
# @param first_char [String] the marker character already consumed
# @return [Object] result of Tokens.new_comment_double_slash or
#   Tokens.new_comment_hash
# @raise [ConfigBugOrBrokenError] if +first_char+ is '/' but no second '/' follows
def pull_comment(first_char)
  double_slash = (first_char == '/')
  if double_slash
    second = next_char_raw
    raise ConfigBugOrBrokenError, "called pullComment but // not seen" if second != '/'
  end

  text = StringIO.new
  while true
    ch = next_char_raw
    unless ch == -1 || ch == "\n"
      text << ch
      next
    end

    put_back(ch)
    return Tokens.new_comment_double_slash(@line_origin, text.string) if double_slash
    return Tokens.new_comment_hash(@line_origin, text.string)
  end
end
pull_escape_sequence(sb, sb_orig) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 322
# Decodes one backslash escape inside a quoted string. The backslash itself
# has already been consumed by the caller.
#
# The decoded character is appended to +sb+; the escape exactly as written
# (backslash included) is appended to +sb_orig+ so the token can be rendered
# back out unchanged.
#
# @param sb [StringIO] receives the decoded character
# @param sb_orig [StringIO] receives the original escape text
# @raise [TokenizerProblemError] (built by self.class.problem) when input ends
#   mid-escape, the \u digits are not four hex digits, or the escape character
#   is not one of the supported ones
def pull_escape_sequence(sb, sb_orig)
  escaped = next_char_raw

  if escaped == -1
    error_msg = "End of input but backslash in string had nothing after it"
    raise self.class.problem(@line_origin, "", error_msg, false, nil)
  end

  # This is needed so we return the unescaped escape characters back out when rendering
  # the token
  sb_orig << "\\" << escaped

  case escaped
    when "\""
      sb << "\""
    when "\\"
      sb << "\\"
    when "/"
      sb << "/"
    when "b"
      sb << "\b"
    when "f"
      sb << "\f"
    when "n"
      sb << "\n"
    when "r"
      sb << "\r"
    when "t"
      sb << "\t"
    when "u"
      codepoint = ""

      # Grab the 4 hex chars for the unicode character
      4.times do
        c = next_char_raw

        if c == -1
          error_msg = "End of input but expecting 4 hex digits for \\uXXXX escape"
          raise self.class.problem(@line_origin, c, error_msg, false, nil)
        end

        codepoint << c
      end
      sb_orig << codepoint

      # Validate the digits up front: String#hex returns 0 for non-hex input
      # instead of failing, so malformed digits would otherwise decode
      # silently to U+0000.
      unless codepoint =~ /\A[0-9a-fA-F]{4}\z/
        raise self.class.problem(@line_origin, codepoint,
                                 "Malformed hex digits after \\u escape in string: '#{codepoint}'",
                                 false, nil)
      end

      # Convert codepoint to a unicode character
      sb << [codepoint.hex].pack("U")
    else
      error_msg = "backslash followed by '#{escaped}', this is not a valid escape sequence (quoted strings use JSON escaping, so use double-backslash \\ for literal backslash)"
      # origin comes first, then the offending character as `what`
      raise self.class.problem(@line_origin, Hocon::Impl::Tokenizer.as_string(escaped), error_msg, false, nil)
  end
end
pull_next_token(saver) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 513
# Reads and returns the next token from the input, after skipping
# non-newline whitespace (which is handed to +saver+).
#
# @param saver [#add] whitespace collector passed to next_char_after_whitespace
# @return [Object] the token that was read
# @raise [TokenizerProblemError] (built by self.class.problem) for a reserved
#   character outside quotes
# @raise [ConfigBugOrBrokenError] if no token could be produced
def pull_next_token(saver)
  ch = next_char_after_whitespace(saver)
  return Tokens::EOF if ch == -1

  if ch == "\n"
    # newline tokens have the just-ended line number
    ended_line = Tokens.new_line(@line_origin)
    @line_number += 1
    @line_origin = @origin.with_line_number(@line_number)
    return ended_line
  end

  token =
    if start_of_comment?(ch)
      pull_comment(ch)
    else
      # single-character tokens and tokens introduced by a marker character
      fixed = case ch
                when '"' then pull_quoted_string
                when '$' then pull_substitution
                when ':' then Tokens::COLON
                when ',' then Tokens::COMMA
                when '=' then Tokens::EQUALS
                when '{' then Tokens::OPEN_CURLY
                when '}' then Tokens::CLOSE_CURLY
                when '[' then Tokens::OPEN_SQUARE
                when ']' then Tokens::CLOSE_SQUARE
                when '+' then pull_plus_equals
              end

      if !fixed.nil?
        fixed
      elsif FIRST_NUMBER_CHARS.index(ch)
        pull_number(ch)
      elsif NOT_IN_UNQUOTED_TEXT.index(ch)
        raise self.class.problem(@line_origin, ch, "Reserved character '#{ch}' is not allowed outside quotes", true, nil)
      else
        # the char starts unquoted text, so let pull_unquoted_text re-read it
        put_back(ch)
        pull_unquoted_text
      end
    end

  raise ConfigBugOrBrokenError, "bug: failed to generate next token" if token.nil?

  token
end
pull_number(first_char) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 279
# Reads a number whose first character has already been consumed. If the
# collected text turns out not to parse as a number, it is returned as
# unquoted text instead, provided it holds no reserved characters.
#
# The character that ends the number (or the -1 end-of-input marker) is
# handed back with put_back.
#
# @param first_char [String] the already-consumed first character
# @return [Object] result of Tokens.new_double, Tokens.new_long or
#   Tokens.new_unquoted_text
# @raise [TokenizerProblemError] (built by self.class.problem) when the text
#   is not a number and contains a character from NOT_IN_UNQUOTED_TEXT
def pull_number(first_char)
  sb = StringIO.new
  sb << first_char
  contained_decimal_or_e = false
  c = next_char_raw
  while (c != -1) && (NUMBER_CHARS.index(c))
    contained_decimal_or_e = true if c == '.' || c == 'e' || c == 'E'
    sb << c
    c = next_char_raw
  end
  # the last character we looked at wasn't part of the number, put it
  # back
  put_back(c)
  s = sb.string
  begin
    if contained_decimal_or_e
      # force floating point representation
      Tokens.new_double(@line_origin, Float(s), s)
    else
      # Explicit base 10: without it Kernel#Integer treats a leading "0" as
      # an octal prefix, so "010" would become 8 and "08" would be rejected.
      Tokens.new_long(@line_origin, Integer(s, 10), s)
    end
  rescue ArgumentError => e
    # anything other than a failed Float()/Integer() conversion is not ours
    raise e unless e.message =~ /^invalid value for (Float|Integer)\(\)/

    # not a number after all, see if it's an unquoted string.
    s.each_char do |u|
      if NOT_IN_UNQUOTED_TEXT.index(u)
        raise self.class.problem(@line_origin, u,
                                 "Reserved character '#{u}' is not allowed outside quotes", true, nil)
      end
    end
    # no evil chars so we just decide this was a string and
    # not a number.
    Tokens.new_unquoted_text(@line_origin, s)
  end
end
pull_plus_equals() click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 458
# Completes a '+=' token. The leading '+' has already been consumed, so the
# very next character has to be '='.
#
# @return [Object] Tokens::PLUS_EQUALS
# @raise [TokenizerProblemError] (built by self.class.problem, with quote
#   suggestion enabled) when the next character is not '='
def pull_plus_equals
  following = next_char_raw
  return Tokens::PLUS_EQUALS if following == '='

  error_msg = "'+' not followed by =, '#{following}' not allowed after '+'"
  raise self.class.problem(@line_origin, following, error_msg, true, nil) # true = suggest quotes
end
pull_quoted_string() click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 412
# Reads a double-quoted string whose opening quote has already been consumed
# and returns a string token. If the string turns out to be empty and is
# immediately followed by another quote, the rest is read as a triple-quoted
# string via append_triple_quoted_string.
#
# @return [Object] result of Tokens.new_string, given the decoded value and
#   the text as originally written
# @raise [TokenizerProblemError] (built by self.class.problem) when input ends
#   before the closing quote or a control character appears unescaped
def pull_quoted_string
  # the open quote has already been consumed
  sb = StringIO.new

  # We need a second StringIO to keep track of escape characters.
  # We want to return them exactly as they appeared in the original text,
  # which means we will need a new StringIO to escape escape characters
  # so we can also keep the actual value of the string. This is gross.
  sb_orig = StringIO.new
  sb_orig << '"'

  c = ""
  while c != '"'
    c = next_char_raw
    if c == -1
      raise self.class.problem(@line_origin, c, "End of input but string quote was still open", false, nil)
    end

    if c == "\\"
      # c stays "\\" here, so an escaped quote does not end the loop
      pull_escape_sequence(sb, sb_orig)
    elsif c == '"'
      # the closing quote belongs to the original text but not to the value
      sb_orig << c
      # done!
    elsif c =~ /[[:cntrl:]]/
      raise self.class.problem(@line_origin, c, "JSON does not allow unescaped #{c}" +
                                               " in quoted strings, use a backslash escape", false, nil)
    else
      sb << c
      sb_orig << c
    end
  end

  # maybe switch to triple-quoted string, sort of hacky...
  # An empty value means we just read `""`; a third quote right after it
  # makes this the opening of a triple-quoted string.
  if sb.length == 0
    third = next_char_raw
    if third == '"'
      sb_orig << third
      append_triple_quoted_string(sb, sb_orig)
    else
      # not a triple quote after all: the char belongs to the next token
      put_back(third)
    end
  end

  Tokens.new_string(@line_origin, sb.string, sb_orig.string)
end
pull_substitution() click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 470
# Reads a substitution. The leading '$' has already been consumed; a '{'
# must follow, optionally followed by '?' to mark the substitution optional.
# Tokens are then collected until the closing curly brace.
#
# @return [Object] result of Tokens.new_substitution, given the optional flag
#   and the collected expression tokens
# @raise [TokenizerProblemError] (built by self.class.problem) when '$' is not
#   followed by '{', or input ends before the closing '}'
def pull_substitution
  # the initial '$' has already been consumed
  opener = next_char_raw
  unless opener == '{'
    error_msg = "'$' not followed by {, '#{opener}' not allowed after '$'"
    raise self.class.problem(@line_origin, opener, error_msg, true, nil) # true = suggest quotes
  end

  maybe_question = next_char_raw
  optional = (maybe_question == '?')
  put_back(maybe_question) unless optional

  saver = WhitespaceSaver.new
  expression = []

  while true
    token = pull_next_token(saver)
    # The allowed tokens inside the substitution are deliberately not
    # validated here, and nested substitutions are accepted too; sorting
    # that out is left to the parser.
    break if token == Tokens::CLOSE_CURLY

    if token == Tokens::EOF
      raise self.class.problem(@line_origin, token, "Substitution ${ was not closed with a }", false, nil)
    end

    whitespace = saver.check(token, @line_origin, @line_number)
    expression << whitespace unless whitespace.nil?
    expression << token
  end

  Tokens.new_substitution(@line_origin, optional, expression)
end
pull_unquoted_text() click to toggle source

The rules here are intended to maximize convenience while avoiding confusion with real valid JSON. Basically anything that parses as JSON is treated the JSON way and otherwise we assume it’s a string and let the parser sort it out.

# File lib/hocon/impl/tokenizer.rb, line 241
# Reads a run of unquoted text and returns the matching token.
#
# Reading stops at end of input, at a character listed in
# NOT_IN_UNQUOTED_TEXT, at whitespace, or at the start of a comment; that
# terminating character is handed back with put_back. If the text read so
# far is exactly "true", "null" or "false", the corresponding boolean/null
# token is returned immediately, without looking at what follows.
#
# @return [Object] result of Tokens.new_boolean, Tokens.new_null or
#   Tokens.new_unquoted_text
def pull_unquoted_text
  # remember the origin of the line the text starts on
  origin = @line_origin
  io = StringIO.new
  c = next_char_raw
  while true
    if (c == -1) or
        (NOT_IN_UNQUOTED_TEXT.index(c)) or
        (self.class.whitespace?(c)) or
        (start_of_comment?(c))
      break
    else
      io << c
    end

    # we parse true/false/null tokens as such no matter
    # what is after them, as long as they are at the
    # start of the unquoted token.
    if io.length == 4
      if io.string == "true"
        return Tokens.new_boolean(origin, true)
      elsif io.string == "null"
        return Tokens.new_null(origin)
      end
    elsif io.length  == 5
      if io.string == "false"
        return Tokens.new_boolean(origin, false)
      end
    end

    c = next_char_raw
  end

  # put back the char that ended the unquoted text
  put_back(c)

  Tokens.new_unquoted_text(origin, io.string)
end
put_back(c) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 136
# Hands a character back so that the next next_char_raw call returns it.
# At most three characters may be pending at any time.
#
# @param c [String, Integer] the character (or -1 end-of-input marker) to return
# @return [Array] the look-ahead buffer
# @raise [ConfigBugOrBrokenError] if three characters are already pending
def put_back(c)
  too_many_pending = @buffer.length > 2
  raise ConfigBugOrBrokenError, "bug: putBack() three times, undesirable look-ahead" if too_many_pending

  @buffer.push(c)
end
queue_next_token() click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 568
# Pulls the next token and appends it to the token queue, preceded by a
# whitespace token whenever the whitespace saver reports one.
#
# @return [Array] the token queue
def queue_next_token
  token = pull_next_token(@whitespace_saver)
  leading_whitespace = @whitespace_saver.check(token, @origin, @line_number)
  @tokens.push(leading_whitespace) if leading_whitespace
  @tokens.push(token)
end
remove() click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 596
# Removal is not supported on a token stream; calling this always fails.
#
# @raise [ConfigBugOrBrokenError] always
def remove
  raise ConfigBugOrBrokenError.new("Does not make sense to remove items from token stream")
end
start_of_comment?(c) click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 151
# Tells whether +c+ begins a comment: a '#', or a '/' that is immediately
# followed by a second '/'. Always false when comments are disabled or +c+
# is the -1 end-of-input marker.
#
# To test for '//' one extra character is read and immediately handed back
# with put_back, so no input is consumed on balance.
#
# @param c [String, Integer] the character just read (or -1)
# @return [Boolean] strictly true or false
def start_of_comment?(c)
  return false if c == -1
  return false unless @allow_comments
  return true if c == '#'
  # any other non-slash char cannot start a comment; answer false, not nil
  return false unless c == '/'

  maybe_second_slash = next_char_raw
  # we want to predictably NOT consume any chars
  put_back(maybe_second_slash)
  maybe_second_slash == '/'
end
to_list() click to toggle source
# File lib/hocon/impl/tokenizer.rb, line 617
# Drains the iterator and returns every remaining token.
#
# @return [Array] the tokens, in iteration order
def to_list
  # mapping each token to itself simply collects the tokens into an array
  identity = ->(token) { token }
  map(&identity)
end