class Nasl::Tokenizer

Public Class Methods

new(code, path) click to toggle source
# File lib/nasl/tokenizer.rb, line 160
def initialize(code, path)
  # Keep the raw source text; every other piece of state is derived from it.
  @code = code

  # Class-wide lookup tables are built lazily, the first time any tokenizer
  # is constructed.
  initialize!

  # A single context object is created per source text and handed to every
  # token produced from it.
  @ctx = Context.new(code, path)

  # Start tokenizing from the beginning of the code.
  reset
end

Public Instance Methods

consume(num=1) click to toggle source
# File lib/nasl/tokenizer.rb, line 173
# Advances the point by +num+ characters (default 1) and refreshes every
# piece of state that is derived from it: the current character, the
# end-of-file flag, and the remainder of the current line.
def consume(num=1)
  # Move the index of the character under examination.
  @point += num

  # The character we are now looking at (nil once past the end).
  @char = @code[@point]

  # Have we run off the end of the code?
  @eof = (@point >= @code.length)

  # Everything from the point up to the end of the current line, as
  # reported by the context.
  @line = @code[@point..@ctx.eol(@point)]
end
die(msg) click to toggle source
# File lib/nasl/tokenizer.rb, line 211
# Aborts tokenization by raising a TokenException carrying +msg+. The
# exception's backtrace is replaced with the context produced by @ctx for
# the text between the mark and the point.
def die(msg)
  # By default, show every line that the offending region touches.
  surrounding = @ctx.bol(@mark)..@ctx.eol(@point)

  # The offending region itself runs from the mark through one past the
  # point.
  offending = (@mark..(@point + 1))

  # The rendered context takes the place of a normal backtrace.
  raise TokenException, msg, @ctx.context(offending, surrounding)
end
get_comment() click to toggle source
# File lib/nasl/tokenizer.rb, line 315
# Reads a block of '#' comments starting at the point. Consecutive comment
# lines are merged into one token as long as each begins in the same column
# as the first and sits on the row directly after the previous one.
#
# Returns [:COMMENT, text], where text is the comment lines joined by "\n".
def get_comment
  # All lines of the block must start in this column.
  start_col = @ctx.col(@point)

  pieces = []
  loop do
    row_before = @ctx.row(@point)

    # Stop as soon as the point is no longer sitting on a comment.
    text = @line[/^#.*$/]
    break if text.nil?

    pieces << text
    consume(text.length)
    skip

    # Keep going only if we landed on the very next row, in the same column.
    row_after = @ctx.row(@point)
    break unless @ctx.col(@point) == start_col && row_after == row_before + 1
  end

  [:COMMENT, pieces.join("\n")]
end
get_comment_c_style() click to toggle source
# File lib/nasl/tokenizer.rb, line 334
# Reads a C-style comment starting at the point: either a single-line
# "// ..." comment or a multi-line "/* ... */" comment.
#
# Returns [:COMMENT, text] with the delimiters included in text. Calls die
# if a multi-line comment is never closed.
def get_comment_c_style
  comment =
    if @code[@point+1] == '/'
      # Single-line: // comment here
      @line[/^\/\/.*$/]
    else
      # Multi-line: /* comment here */
      # The comment may span lines, so search all of the remaining code
      # rather than just the current line.
      remainder = @code[@point..-1]
      found = remainder[/^\/\*.*?\*\//m]
      die("Unterminated multiline comment") if found.nil?
      found
    end

  # Step over the comment and any whitespace that follows it.
  consume(comment.length)
  skip

  [:COMMENT, comment]
end
get_identifier() click to toggle source
# File lib/nasl/tokenizer.rb, line 221
    # Reads an identifier or keyword starting at the point.
    #
    # Returns [type, text], where type is the keyword's token type when text
    # is listed in @@keywords, and :IDENT otherwise.
    def get_identifier
      # Identifiers are made of letters, digits, and underscores, may contain
      # "::" separators between segments, and may begin with a leading "::".
      name = @line[/^(::|[_a-z])([_a-z0-9]*::[_a-z0-9]+)*[_a-z0-9]*/i]
      consume(name.length)

      # Keywords can be prefixes of ordinary identifiers ("break_" is a valid
      # identifier), so the whole word is read first and only reclassified
      # as a keyword when it matches one exactly.
      [@@keywords.fetch(name, :IDENT), name]
    end
get_integer() click to toggle source
# File lib/nasl/tokenizer.rb, line 240
# Reads an integer literal starting at the point.
#
# Returns [type, text], where type is :INT_HEX, :INT_OCT or :INT_DEC and
# text is the literal exactly as written. Calls die when the literal is
# malformed.
def get_integer
  # Choose the base from the literal's prefix. Each base comes with a loose
  # pattern (any run of word characters) and a strict pattern (only digits
  # that are legal in that base).
  type, name, loose, strict =
    if @line =~ /^0x/i
      # Hex integers start with "0x".
      [:INT_HEX, "hex", /^0x\w+/i, /^0x[a-f0-9]+/i]
    elsif @line =~ /^0\w+/
      # Octal integers start with "0".
      [:INT_OCT, "octal", /^0\w+/, /^0[0-7]+/]
    else
      # Anything else is a decimal integer.
      [:INT_DEC, "decimal", /^\w*/, /^[0-9]+/]
    end

  # If the loose and strict matches differ, some character in the literal
  # is not valid for the chosen base.
  permissive = @line[loose]
  restrictive = @line[strict]

  unless permissive && permissive == restrictive
    # NASL treats a leading-zero integer as octal only when every digit is
    # an octal digit; otherwise it is decimal. Retry as decimal.
    type = :INT_DEC
    restrictive = @line[/^[0-9]+/]
  end

  # Still no agreement: the literal is invalid in every interpretation.
  die("Invalid #{name} literal") unless permissive && permissive == restrictive

  # The strict match is the text of the integer.
  consume(restrictive.length)

  [type, restrictive]
end
get_operator() click to toggle source
# File lib/nasl/tokenizer.rb, line 350
# Reads an operator starting at the point.
#
# Every known operator length is tried as a prefix of the current line.
# Among the prefixes that are operators, the one with the lowest index
# stored in its @@operators entry is chosen.
#
# Returns [type, text], or nil when no operator matches.
def get_operator
  # Look up each candidate prefix, discarding those that are not operators.
  candidates = @@operator_lengths
    .map { |len| @@operators[@line[0, len]] }
    .compact
  return nil if candidates.empty?

  # Entries have the form [op, type, index]; the lowest index wins.
  op, type = candidates.min_by { |entry| entry[2] }
  consume(op.length)

  [type, op]
end
get_string() click to toggle source
# File lib/nasl/tokenizer.rb, line 289
# Reads a quoted string starting at the point.
#
# Returns [:DATA, contents] for a single-quoted string and
# [:STRING, contents] for a double-quoted one, with the surrounding quotes
# removed and everything between them left untouched. Calls die if the
# closing quote is missing.
def get_string
  type, pattern, complaint =
    if @char == "'"
      # Single-quoted: a backslash escapes the character after it, so an
      # escaped single quote does not end the string.
      [:DATA, /\A'(\\.|[^'\\])*'/m, "Unterminated single-quoted string"]
    else
      # Double-quoted: there is no escaping, so the string ends at the
      # next double quote.
      [:STRING, /\A"[^"]*"/m, "Unterminated double-quoted string"]
    end

  # Strings may span lines, so match against all of the remaining code.
  quoted = @code[@point..-1][pattern]
  die(complaint) if quoted.nil?

  # Move the point forward over the string, quotes included.
  consume(quoted.length)

  # Drop the bounding quotes from the returned text.
  [type, quoted[1..-2]]
end
get_token() click to toggle source
# File lib/nasl/tokenizer.rb, line 360
# Produces the next token from the code.
#
# Returns a two-element array [type, Token]. At end of file the array is
# [false, Token] with an :EOF token. Calls die when the character at the
# point cannot start any token.
#
# Comments are filtered here: a comment is returned only when it is the
# first token produced since @previous was cleared, or when the token that
# follows it has a type listed in @@annotated. In the latter case the
# comment is returned by this call and the following token is held in
# @deferred and returned by the next call.
def get_token
  # If we deferred a token, emit it now.
  unless @deferred.nil?
    token = @deferred
    @deferred = nil
    return token
  end

  # Make sure we're not at the end of the file.
  return [false, Token.new(:EOF, "$", @point...@point, @ctx)] if @eof

  # Save our starting point, which to use Emacs terminology is called the
  # 'mark'.
  @mark = @point

  # Try to parse token at the point. The reader is chosen from the current
  # character (or, for identifiers, a leading "::" on the current line).
  token = if @char =~ /[_a-z]/i or @line =~ /^::/
    get_identifier
  elsif @char =~ /['"]/
    get_string
  elsif @char =~ /[0-9]/
    get_integer
  elsif @char == '#'
    get_comment
  elsif (@char == '/') && ["/", "*"].include?(@code[@point+1])
    get_comment_c_style
  else
    # May return nil when nothing matches.
    get_operator
  end

  # Everything in the language is enumerated by the above functions, so if
  # we get here without a token parsed, the input file is invalid.
  die("Invalid character ('#@char')") if token.nil?

  # Consume all whitespace after the token, and create an object with
  # context. Because the whitespace is skipped first, the token's region
  # runs from the mark up to the start of whatever comes next.
  skip
  token = [token.first, Token.new(*token, @mark...@point, @ctx)]

  # If a comment is the first token in a file, or is followed by certain
  # tokens, then it is considered significant. Such tokens will appear in
  # the grammar so that it can be made visible to nasldoc.
  if token.first == :COMMENT
    if @previous.nil?
      # First token since reset: return the comment itself, and record a
      # placeholder so that later comments take the other branch.
      @previous = [:DUMMY, ""]
    else
      # Remember the comment and look at what follows it. The recursive
      # call decides whether the comment is returned or dropped.
      @previous = token
      token = get_token
    end
  elsif !@previous.nil? && @previous.first == :COMMENT && @@annotated.include?(token.first)
    # The remembered comment precedes an annotated token: return the
    # comment now and hold this token back for the next call.
    @deferred = token
    token = @previous
    @previous = @deferred       
  else
    # Ordinary token; a remembered comment, if any, is discarded.
    @previous = token
  end

  return token
end
get_tokens() click to toggle source
# File lib/nasl/tokenizer.rb, line 420
# Tokenizes from the current position through the end of the code.
#
# Returns an array of the [type, Token] pairs produced by get_token, ending
# with the pair whose token has type :EOF.
def get_tokens
  # At least one token is always read, even if it is immediately EOF.
  collected = [get_token]

  # Keep reading until the most recent token is the end-of-file marker.
  collected << get_token until collected.last.last.type == :EOF

  collected
end
initialize!() click to toggle source
# File lib/nasl/tokenizer.rb, line 144
# Builds the class-wide operator lookup tables. Only the first call does
# any work; later calls return immediately.
def initialize!
  return if @@initialized

  # The distinct operator lengths tell get_operator which prefixes of the
  # line are worth looking up.
  @@operator_lengths = @@operators.map { |op, _type| op.length }.uniq

  # Re-key the operators by their text so that lookup is fast. Each entry
  # also records the operator's position in the original list.
  indexed = {}
  @@operators.each_with_index do |(op, type), index|
    indexed[op] = [op, type, index]
  end
  @@operators = indexed

  @@initialized = true
end
reset() click to toggle source
# File lib/nasl/tokenizer.rb, line 187
# Rewinds the tokenizer to the start of the code and clears the comment
# bookkeeping used by get_token.
#
# Returns the tokenizer itself so that calls can be chained.
def reset
  # get_token tracks the last token seen (and any token it held back) so
  # that only comments significant to nasldoc are emitted. Start fresh.
  @previous = @deferred = nil

  # Go back to the first character, refresh the state derived from the
  # point, and step over any leading whitespace.
  @point = 0
  consume(0)
  skip

  self
end
skip() click to toggle source
# File lib/nasl/tokenizer.rb, line 203
# Advances the point past any whitespace. Because @line only holds the
# remainder of the current line, the check is repeated after each consume
# so that whitespace continuing onto following lines is skipped too.
def skip
  while (blank = @line[/^\s+/])
    consume(blank.length)
  end
end