class Rucc::Lexer::Impl

Attributes

infile[R]

Public Class Methods

new(files) click to toggle source

@param [FileIOList] files

# File lib/rucc/lexer/impl.rb, line 13
# Builds a lexer that reads from the given list of files.
#
# @param [FileIOList] files input files; the first entry is kept as +infile+
def initialize(files)
  @files = files
  @infile = files.first

  # Stack of token buffers. The bottom buffer holds pushed-back tokens;
  # additional buffers are pushed by token_buffer_stash.
  @buffers = [[]]
  @token_gen = TokenGen.new(files)
end

Public Instance Methods

current_file() click to toggle source

@return [FileIO]

# File lib/rucc/lexer/impl.rb, line 199
# Returns whatever the file list reports as its current entry.
#
# @return [FileIO]
def current_file
  @files.current
end
lex() click to toggle source

@return [Token]

# File lib/rucc/lexer/impl.rb, line 36
# Returns the next token, preferring tokens that were pushed back.
#
# Tokens waiting in the top-most buffer are handed out first. While more than
# one buffer is on the stack and the top one is drained, EOF is reported
# without touching the input. Otherwise a token is read from the input: a
# SPACE token is replaced by the token that follows it, which gets its
# +space+ flag set, and +bol+ records whether reading started at column 1.
#
# @return [Token]
def lex
  pending = @buffers.last
  return pending.pop unless pending.empty?
  return Token::EOF_TOKEN if @buffers.size > 1

  started_at_bol = (current_file.column == 1)
  token = do_read_token
  while token.kind == T::SPACE
    token = do_read_token
    token.space = true
  end
  token.bol = started_at_bol
  token
end
lex_string(s) click to toggle source

Reads a token from a given string. This function temporarily switches the main input stream to a given string and reads one token.

@param [String] s @return [Token]

# File lib/rucc/lexer/impl.rb, line 72
# Reads exactly one token from the given string.
#
# The input stream is temporarily switched to +s+ via stream_stash and is
# always switched back, even when reading fails, so a rescued error does not
# leave the lexer reading from the temporary string.
#
# @param [String] s text that must contain exactly one token (an optional
#   trailing newline is accepted)
# @return [Token]
# @raise [RuntimeError] if input remains after the token
def lex_string(s)
  @files.stream_stash([FileIO.new(StringIO.new(s), "-")])
  begin
    r = do_read_token
    next?("\n")
    p = get_pos(0)
    unless peek.nil? # anything other than EOF means leftover input
      raise "#{p}: unconsumed input: #{s}"
    end
    r
  ensure
    @files.stream_unstash
  end
end
push_file(file) click to toggle source

@param [FileIO] file

# File lib/rucc/lexer/impl.rb, line 133
# Pushes a file onto the file list.
#
# @param [FileIO] file
def push_file(file)
  @files.push(file)
end
read_header_file_name() click to toggle source

Reads a header file name for include.

Filenames after #include need a special tokenization treatment. A filename string may be quoted by < and > instead of "". Even if it's quoted by "", it's still different from a regular string token. For example, \ in this context is not interpreted as a quote. Thus, we cannot use lex() to read a filename.

That the C preprocessor requires a special lexer behavior only for include is a violation of layering. Ideally, the lexer should be agnostic about higher layers status. But we need this for the C grammar.

@return [<String, Boolean>, <NilClass, NilClass>]

# File lib/rucc/lexer/impl.rb, line 98
# Reads the header name that follows an include directive.
#
# The name may be written as "name" or <name>. Nothing is read (and
# [nil, nil] is returned) when tokens are pending in the buffer or when the
# next non-space character is neither '"' nor '<'.
#
# @return [<String, Boolean>, <NilClass, NilClass>] the name and whether it
#   was written with angle brackets
# @raise [RuntimeError] if the name is empty, or if a newline or EOF is
#   reached before the closing delimiter
def read_header_file_name
  std = nil
  return nil, std unless buffer_empty?

  skip_space!
  p = get_pos(0)
  if next?('"')
    std = false
    close = '"'
  elsif next?('<')
    std = true
    close = '>'
  else
    return nil, std
  end

  b = ""
  until next?(close)
    c = readc
    # The newline must be written "\n": in single quotes it would be the
    # two-character string backslash + n and could never match one character.
    if c.nil? || c == "\n"
      raise "#{p}: premature end of header name"
    end
    b << c
  end
  raise "#{p}: header name should not be empty" if b.empty?

  return b, std
end
skip_cond_incl!() click to toggle source

Skips a block of code excluded from input by #if, #ifdef and the like. C11 6.10 says that code within #if and #endif needs to be a sequence of valid tokens even if skipped. However, in reality, most compilers don't tokenize nor validate contents. We don't do that, either. This function is to skip code until the matching #endif as fast as we can.

# File lib/rucc/lexer/impl.rb, line 156
# Skips input until the else/elif/endif directive that closes the current
# conditional group, or until EOF.
#
# Character and string literals are skipped so that a '#' inside them is not
# mistaken for a directive. Nested if/ifdef/ifndef ... endif groups are
# counted and skipped as a whole. When the closing directive is found, the
# '#' (flagged as beginning-of-line, with its column) and the directive name
# are pushed back so they are the next tokens returned by lex.
def skip_cond_incl!
  depth = 0
  while true
    at_bol = (current_file.column == 1)
    skip_space!
    ch = readc
    return if ch.nil? # EOF

    case ch
    when '\''
      skip_char!
      next
    when '"'
      skip_string!
      next
    end
    # Only a '#' at the beginning of a line can start a directive.
    next unless ch == '#' && at_bol

    hash_column = current_file.column - 1
    directive = lex
    next unless directive.kind == T::IDENT

    if depth == 0 && %w[else elif endif].any? { |name| Token.is_ident?(directive, name) }
      unget_token(directive)
      hash = @token_gen.make_keyword('#')
      hash.bol = true
      hash.column = hash_column
      unget_token(hash)
      return
    end

    if %w[if ifdef ifndef].any? { |name| Token.is_ident?(directive, name) }
      depth += 1
    elsif depth > 0 && Token.is_ident?(directive, "endif")
      depth -= 1
    end
    skip_line!
  end
end
token_buffer_stash(buf) click to toggle source

Temporarily switches the input token stream to given list of tokens, so that you can get the tokens as return values of lex() again. After the tokens are exhausted, EOF is returned from lex() until “unstash” is called to restore the original state.

@param [<Token>] buf

# File lib/rucc/lexer/impl.rb, line 143
# Pushes +buf+ on top of the token buffer stack, so that lex takes its tokens
# from +buf+ (from the end of the list) until token_buffer_unstash is called.
#
# @param [<Token>] buf
def token_buffer_stash(buf)
  @buffers.push(buf)
end
token_buffer_unstash() click to toggle source
# File lib/rucc/lexer/impl.rb, line 147
# Removes the top-most token buffer from the stack and returns it.
def token_buffer_unstash
  @buffers.pop
end
unget_all(tokens) click to toggle source

@param [<Token>] tokens

# File lib/rucc/lexer/impl.rb, line 31
# Pushes back a list of tokens so that lex returns them in their listed order.
#
# Tokens are pushed back last-to-first because the buffer is read from its end.
#
# @param [<Token>] tokens
def unget_all(tokens)
  last_to_first = tokens.reverse
  last_to_first.each do |tok|
    unget_token(tok)
  end
end
unget_token(tok) click to toggle source

@param [Token] tok

# File lib/rucc/lexer/impl.rb, line 24
# Pushes a token back onto the top-most buffer; EOF tokens are ignored.
#
# @param [Token] tok
def unget_token(tok)
  @buffers.last.push(tok) unless tok.kind == T::EOF
end

Private Instance Methods

buffer_empty?() click to toggle source

@return [Boolean]

# File lib/rucc/lexer/impl.rb, line 227
# Tells whether no stashed buffer is active and no pushed-back token is pending.
#
# @return [Boolean]
def buffer_empty?
  return false unless @buffers.size == 1

  @buffers.first.size.zero?
end
do_read_token() click to toggle source

@return [Token]

# File lib/rucc/lexer/impl.rb, line 584
# Reads one raw token from the input.
#
# Skipped whitespace/comments yield the SPACE token. Otherwise the start
# position is recorded with mark! and the first character selects the kind of
# token; longer punctuators and the digraphs <: :> <% are resolved by looking
# ahead with next?, while digraphs starting with '%' are delegated to
# read_hash_digraph.
#
# @return [Token]
def do_read_token
  return Token::SPACE_TOKEN if skip_space!

  mark!
  c = readc
  case c
  when "\n"
    Token::NEWLINE_TOKEN
  when ':'
    @token_gen.make_keyword(next?('>') ? ']' : ':')
  when '#'
    @token_gen.make_keyword(next?('#') ? K::HASHHASH : '#')
  when '+'
    read_rep2('+', OP::INC, '=', OP::A_ADD, '+')
  when '*'
    read_rep('=', OP::A_MUL, '*')
  when '='
    read_rep('=', OP::EQ, '=')
  when '!'
    read_rep('=', OP::NE, '!')
  when '&'
    read_rep2('&', OP::LOGAND, '=', OP::A_AND, '&')
  when '|'
    read_rep2('|', OP::LOGOR, '=', OP::A_OR, '|')
  when '^'
    read_rep('=', OP::A_XOR, '^')
  when '"'
    read_string(ENC::NONE)
  when '\''
    read_char(ENC::NONE)
  when '/'
    @token_gen.make_keyword(next?('=') ? OP::A_DIV : '/')
  when *'a'..'t', *'v'..'z', *'A'..'K', *'M'..'T', *'V'..'Z', '_', '$', *(0x80.chr..0xFD.chr)
    # 'u', 'L' and 'U' are absent here: they are handled below because they
    # may introduce a prefixed string or character literal.
    read_ident(c)
  when *'0'..'9'
    read_number(c)
  when 'L', 'U'
    # NOTE: Wide/char32_t character/string literal
    enc = (c == 'L') ? ENC::WCHAR : ENC::CHAR32
    if next?('"')
      read_string(enc)
    elsif next?('\'')
      read_char(enc)
    else
      read_ident(c)
    end
  when 'u'
    return read_string(ENC::CHAR16) if next?('"')
    return read_char(ENC::CHAR16) if next?('\'')

    # C11 6.4.5: UTF-8 string literal
    if next?('8')
      return read_string(ENC::UTF8) if next?('"')

      unreadc('8')
    end
    read_ident(c)
  when '.'
    if Libc.isdigit(peek)
      read_number(c)
    elsif next?('.')
      next?('.') ? @token_gen.make_keyword(K::ELLIPSIS) : @token_gen.make_ident('..')
    else
      @token_gen.make_keyword('.')
    end
  when '(', ')', ',', ';', '[', ']', '{', '}', '?', '~'
    @token_gen.make_keyword(c)
  when '-'
    if next?('-')
      @token_gen.make_keyword(OP::DEC)
    elsif next?('>')
      @token_gen.make_keyword(OP::ARROW)
    elsif next?('=')
      @token_gen.make_keyword(OP::A_SUB)
    else
      @token_gen.make_keyword('-')
    end
  when '<'
    if next?('<')
      read_rep('=', OP::A_SAL, OP::SAL)
    elsif next?('=')
      @token_gen.make_keyword(OP::LE)
    elsif next?(':')
      @token_gen.make_keyword('[')
    elsif next?('%')
      @token_gen.make_keyword('{')
    else
      @token_gen.make_keyword('<')
    end
  when '>'
    if next?('=')
      @token_gen.make_keyword(OP::GE)
    elsif next?('>')
      read_rep('=', OP::A_SAR, OP::SAR)
    else
      @token_gen.make_keyword('>')
    end
  when '%'
    read_hash_digraph || read_rep('=', OP::A_MOD, '%')
  when nil
    Token::EOF_TOKEN
  else
    @token_gen.make_invalid(c.ord)
  end
end
do_skip_space!() click to toggle source

@return [Boolean]

# File lib/rucc/lexer/impl.rb, line 279
# Skips one unit of blank input: a single whitespace character, a block
# comment, or the rest of a line comment.
#
# @return [Boolean] true if something was skipped; false at EOF or when the
#   next character does not start blank input (it is pushed back)
def do_skip_space!
  ch = readc
  return false if ch.nil? # EOF
  return true if iswhitespace(ch)

  if ch == '/'
    if next?('*')
      skip_block_comment!
      return true
    elsif next?('/')
      skip_line!
      return true
    end
  end

  unreadc(ch)
  false
end
get_pos(delta) click to toggle source

@param [Integer] delta @return [Pos]

# File lib/rucc/lexer/impl.rb, line 222
# Builds a position from the current file's line and column.
#
# @param [Integer] delta offset added to the current column
# @return [Pos]
def get_pos(delta)
  line = current_file.line
  column = current_file.column + delta
  Pos.new(line, column)
end
is_valid_ucn(c) click to toggle source

@param [Integer] c @return [Boolean]

# File lib/rucc/lexer/impl.rb, line 548
# Checks whether a code point may be written as a universal character name.
#
# @param [Integer] c code point
# @return [Boolean]
def is_valid_ucn(c)
  # C11 6.4.3p2: U+D800 to U+DFFF are reserved for surrogate pairs, so a
  # code point in that range cannot be a valid character.
  return false if (0xD800..0xDFFF).cover?(c)

  # ASCII characters must not be spelled with \u or \U. '$', '@' and '`' are
  # not in the basic character set (C11 5.2.1p3) and are allowed as exceptions.
  c >= 0xA0 || [0x24, 0x40, 0x60].include?(c)
end
iswhitespace(c) click to toggle source

@param [Char] c @return [Boolean]

# File lib/rucc/lexer/impl.rb, line 233
# Tells whether +c+ is a space, tab, form feed or vertical tab. A newline is
# not counted as whitespace here.
#
# @param [Char] c
# @return [Boolean]
def iswhitespace(c)
  [' ', "\t", "\f", "\v"].include?(c)
end
mark!() click to toggle source

Update current position

# File lib/rucc/lexer/impl.rb, line 216
# Stores the current input position (get_pos with no offset) on the token
# generator.
def mark!
  @token_gen.pos = get_pos(0)
end
next?(expect) click to toggle source

@param [Char] expect @return [Boolean]

# File lib/rucc/lexer/impl.rb, line 323
# Consumes the next character only if it equals +expect+.
#
# @param [Char] expect
# @return [Boolean] true if the character matched and was consumed; otherwise
#   the character that was read is pushed back and false is returned
def next?(expect)
  got = readc
  if got == expect
    true
  else
    unreadc(got)
    false
  end
end
nextoct?() click to toggle source

@return [Boolean]

# File lib/rucc/lexer/impl.rb, line 579
# Tells whether the upcoming character is an octal digit, without consuming it.
#
# @return [Boolean]
def nextoct?
  upcoming = peek
  %w[0 1 2 3 4 5 6 7].include?(upcoming)
end
peek() click to toggle source

@return [Char]

# File lib/rucc/lexer/impl.rb, line 676
# Returns the next character without consuming it: the character is read and
# immediately pushed back.
#
# @return [Char]
def peek
  readc.tap { |ch| unreadc(ch) }
end
read_char(enc) click to toggle source

@param [ENC] enc @return [Token]

# File lib/rucc/lexer/impl.rb, line 395
# Reads the rest of a character literal; the opening quote has already been
# consumed.
#
# @param [ENC] enc encoding prefix of the literal
# @return [Token]
# @raise [RuntimeError] if the input ends right after the opening quote, or
#   if the character is not followed by a closing quote
def read_char(enc)
  c = readc
  # EOF directly after the opening quote is an unterminated literal too;
  # without this guard the nil would be sent #ord below.
  raise "unterminated char" if c.nil?

  r = (c == '\\') ? read_escaped_char : c.ord
  raise "unterminated char" unless readc == "'"

  # NOTE: Only lower 8 bit has meaning for a literal without a prefix.
  r &= 0xFF if enc == ENC::NONE
  @token_gen.make_char(r, enc)
end
read_escaped_char() click to toggle source

@return [Integer]

# File lib/rucc/lexer/impl.rb, line 441
# Reads the part of an escape sequence that follows the backslash and returns
# the code of the character it denotes.
#
# Hexadecimal, universal-character and octal escapes are delegated to
# read_hex_char, read_universal_char and read_octal_char. An unknown escape
# character evaluates to that character itself.
#
# @return [Integer]
# @raise [RuntimeError] if the input ends right after the backslash
def read_escaped_char
  c = readc
  case c
  when nil
    # Without this branch the nil would fall through to c.ord below.
    raise "premature end of input"
  when '\'', '"', '?', '\\'
    c.ord
  when 'a'
    "\a".ord
  when 'b'
    "\b".ord
  when 'f'
    "\f".ord
  when 'n'
    "\n".ord
  when 'r'
    "\r".ord
  when 't'
    "\t".ord
  when 'v'
    "\v".ord
  when 'e'
    "\e".ord # '\e' is GNU extension
  when 'x'
    read_hex_char
  when 'u'
    read_universal_char(4)
  when 'U'
    read_universal_char(8)
  when *'0'..'7'
    read_octal_char(c)
  else
    # TODO(south37) Warn about the unknown escape character when necessary
    c.ord
  end
end
read_hash_digraph() click to toggle source

Reads a digraph starting with '%'. Digraphs are alternative spellings for some punctuation characters. They are useless in ASCII. We implement this just for the standard compliance. See C11 6.4.6p3 for the spec.

@return [Token, NilClass]

# File lib/rucc/lexer/impl.rb, line 356
# Reads the rest of a digraph whose first character '%' is already consumed:
# %> stands for }, %: for # and %:%: for ##.
#
# @return [Token, NilClass] nil when no digraph follows
def read_hash_digraph
  return @token_gen.make_keyword('}') if next?('>')
  return nil unless next?(':')

  if next?('%')
    return @token_gen.make_keyword(K::HASHHASH) if next?(':')

    # Only "%:%" was seen: give the second '%' back and report a single '#'.
    unreadc('%')
  end
  @token_gen.make_keyword('#')
end
read_hex_char() click to toggle source

Reads a \x escape sequence.

@return [Integer]

# File lib/rucc/lexer/impl.rb, line 500
# Reads the hexadecimal digits that follow "\x" and returns their value.
# Digits are consumed until the first character that is not a hex digit,
# which is pushed back.
#
# @return [Integer]
# @raise [RuntimeError] if the first character is not a hexadecimal digit
def read_hex_char
  start = get_pos(-2)
  ch = readc
  unless Libc.isxdigit(ch)
    raise "#{start}: \\x is not followed by a hexadecimal character: #{ch}"
  end

  value = 0
  while true
    digit =
      case ch
      when '0'..'9' then ch.ord - '0'.ord
      when 'a'..'f' then ch.ord - 'a'.ord + 10
      when 'A'..'F' then ch.ord - 'A'.ord + 10
      end
    if digit.nil?
      unreadc(ch)
      return value
    end
    value = (value << 4) | digit
    ch = readc
  end
end
read_ident(c) click to toggle source
# File lib/rucc/lexer/impl.rb, line 372
# Reads the rest of an identifier whose first character is +c+.
#
# Alphanumerics, '_', '$' and characters with the high bit set are appended
# as they are. The first character that cannot continue the identifier is
# pushed back.
#
# @param [Char] c first character of the identifier
# @return [Token]
def read_ident(c)
  name = c.dup
  while true
    ch = readc
    if ch
      if Libc.isalnum(ch) || (ch.ord & 0x80) > 0 || ch == '_' || ch == '$'
        name << ch
        next
      end
      # C11 6.4.2.1: \u or \U characters (universal-character-name)
      # are allowed to be part of identifiers.
      if ch == '\\' && (peek == 'u' || peek == 'U')
        UTF.write_utf8(name, read_escaped_char)
        next
      end
    end
    unreadc(ch)
    return @token_gen.make_ident(name)
  end
end
read_number(c) click to toggle source

Reads a number literal. Lexer's grammar on numbers is not strict. Integers and floating point numbers and different base numbers are not distinguished. @param [Char] c

# File lib/rucc/lexer/impl.rb, line 481
# Reads the rest of a number literal whose first character is +c+.
#
# The grammar is deliberately loose: digits, letters and '.' are all
# accepted, and a '+' or '-' is accepted directly after one of e, E, p, P.
# The first character that cannot continue the literal is pushed back.
#
# @param [Char] c first character of the literal
# @return [Token]
def read_number(c)
  b = c.dup
  last = c
  while true
    c = readc
    # EOF ends the literal. It has to be checked before the sign test:
    # String#include? raises TypeError for nil, which used to happen when the
    # input ended right after an exponent marker (e.g. "1e").
    if c.nil?
      unreadc(c)
      return @token_gen.make_number(b)
    end
    # A sign continues the literal only right after an exponent marker.
    flonum = "eEpP".include?(last) && "+-".include?(c)
    if !Libc.isdigit(c) && !Libc.isalpha(c) && c != '.' && !flonum
      unreadc(c)
      return @token_gen.make_number(b)
    end
    b << c
    last = c
  end
end
read_octal_char(c) click to toggle source

Reads an octal escape sequence.

@param [Char] c @return [Integer]

# File lib/rucc/lexer/impl.rb, line 564
# Reads an octal escape sequence of up to three digits, the first of which
# (+c+) has already been consumed.
#
# @param [Char] c first octal digit
# @return [Integer]
def read_octal_char(c)
  value = c.ord - '0'.ord
  # At most two more octal digits may follow.
  2.times do
    break unless nextoct?

    value = (value << 3) | (readc.ord - '0'.ord)
  end
  value
end
read_rep(expect, t, els) click to toggle source

@param [Char] expect @param [OP] t @param [Char] els @return [Token]

# File lib/rucc/lexer/impl.rb, line 334
# Makes a keyword token for +t+ if the next character is +expect+ (which is
# then consumed), and for +els+ otherwise.
#
# @param [Char] expect
# @param [OP] t
# @param [Char] els
# @return [Token]
def read_rep(expect, t, els)
  id =
    if next?(expect)
      t
    else
      els
    end
  @token_gen.make_keyword(id)
end
read_rep2(expect1, t1, expect2, t2, els) click to toggle source

@param [Char] expect1 @param [OP] t1 @param [Char] expect2 @param [OP] t2 @param [Char] els @return [Token]

# File lib/rucc/lexer/impl.rb, line 344
# Makes a keyword token chosen by the next character: +t1+ if it is
# +expect1+, +t2+ if it is +expect2+ (the matching character is consumed),
# and +els+ otherwise.
#
# @param [Char] expect1
# @param [OP] t1
# @param [Char] expect2
# @param [OP] t2
# @param [Char] els
# @return [Token]
def read_rep2(expect1, t1, expect2, t2, els)
  id =
    if next?(expect1)
      t1
    elsif next?(expect2)
      t2
    else
      els
    end
  @token_gen.make_keyword(id)
end
read_string(enc) click to toggle source

@param [ENC] enc @return [Token]

# File lib/rucc/lexer/impl.rb, line 412
# Reads the rest of a string literal; the opening quote has already been
# consumed.
#
# @param [ENC] enc encoding prefix of the literal
# @return [Token]
# @raise [RuntimeError] if the input ends before the closing quote
def read_string(enc)
  buf = ""
  while true
    ch = readc
    raise "unterminated string" if ch.nil?
    break if ch == '"'

    unless ch == '\\'
      buf << ch
      next
    end

    # Just after a backslash: check for \u / \U before the escape is consumed,
    # because such code points are written to the buffer as UTF-8.
    ucs = (peek == 'u' || peek == 'U')
    code = read_escaped_char
    if ucs
      UTF.write_utf8(buf, code)
    else
      buf << code
    end
  end
  @token_gen.make_strtok(buf, enc)
end
read_universal_char(len) click to toggle source

Reads \u or \U escape sequences. len is 4 or 8, respectively.

@param [Integer] len @return [Integer]

# File lib/rucc/lexer/impl.rb, line 525
# Reads the +len+ hexadecimal digits of a universal character name and
# returns the code point they denote.
#
# @param [Integer] len number of digits to read (4 or 8)
# @return [Integer]
# @raise [RuntimeError] if a character is not a hexadecimal digit, or if the
#   code point is rejected by is_valid_ucn
def read_universal_char(len)
  p = get_pos(-2)
  r = 0
  len.times do
    c = readc
    case c
    when *'0'..'9' then r = (r << 4) | (c.ord - '0'.ord)
    when *'a'..'f' then r = (r << 4) | (c.ord - 'a'.ord + 10)
    when *'A'..'F' then r = (r << 4) | (c.ord - 'A'.ord + 10)
    else
      raise "#{p}: invalid universal character: #{c}"
    end
  end
  unless is_valid_ucn(r)
    # Echo the code point in hexadecimal, zero-padded to +len+ digits, so the
    # message matches how the escape is written in the source.
    raise "#{p}: invalid universal character: \\#{(len == 4) ? 'u' : 'U'}#{format("%0#{len}x", r)}"
  end
  r
end
readc() click to toggle source

@return [Char, NilClass]

# File lib/rucc/lexer/impl.rb, line 206
# Reads the next character from the file list. Callers in this class treat a
# nil result as EOF.
#
# @return [Char, NilClass]
def readc
  @files.readc
end
skip_block_comment!() click to toggle source
# File lib/rucc/lexer/impl.rb, line 237
# Skips the body of a block comment up to and including the closing "*/".
# The opening "/*" has already been consumed.
#
# @raise [RuntimeError] if the input ends inside the comment
def skip_block_comment!
  # TODO(south37) Report the position of the comment start when necessary
  after_star = false
  until (ch = readc).nil?
    return if after_star && ch == '/'

    after_star = (ch == '*')
  end
  raise "premature end of block comment"
end
skip_char!() click to toggle source
# File lib/rucc/lexer/impl.rb, line 301
# Skips the rest of a character literal; the opening quote has already been
# consumed. Reading stops after the closing quote or at EOF.
def skip_char!
  first = readc
  # Drop the character escaped by a leading backslash so that an escaped
  # quote is not taken for the closing one.
  readc if first == '\\'

  ch = readc
  ch = readc until ch.nil? || ch == '\''
end
skip_line!() click to toggle source
# File lib/rucc/lexer/impl.rb, line 254
# Skips the rest of the current line. The terminating newline is pushed back
# so it is still there to be read; at EOF nothing is pushed back.
def skip_line!
  until (ch = readc).nil?
    next unless ch == "\n"

    unreadc(ch)
    return
  end
end
skip_space!() click to toggle source

Skips spaces including comments. Returns true if at least one space is skipped.

@return [Boolean] true if skipped

# File lib/rucc/lexer/impl.rb, line 270
# Skips all consecutive whitespace and comments.
#
# @return [Boolean] true if at least one space or comment was skipped
def skip_space!
  return false unless do_skip_space!

  while do_skip_space!
    # keep consuming until nothing more can be skipped
  end
  true
end
skip_string!() click to toggle source
# File lib/rucc/lexer/impl.rb, line 311
# Skips the rest of a string literal; the opening quote has already been
# consumed. Reading stops after the closing quote or at EOF.
def skip_string!
  ch = readc
  until ch.nil? || ch == '"'
    # Drop the character after a backslash so that \" does not end the string.
    readc if ch == '\\'
    ch = readc
  end
end
unreadc(c) click to toggle source

@param [Char] c

# File lib/rucc/lexer/impl.rb, line 211
# Hands a character back to the file list. Callers in this class also pass
# the nil they got at EOF.
#
# @param [Char] c
def unreadc(c)
  @files.unreadc(c)
end