class Regexp::Lexer

A very thin wrapper around the scanner that breaks quantified literal runs, collects emitted tokens into an array, calculates their nesting depth, normalizes tokens for the parser, and checks that they are implemented by the given syntax flavor.

Constants

CLOSING_TOKENS
CONDITION_TOKENS
OPENING_TOKENS

Attributes

block[RW]
collect_tokens[RW]
conditional_nesting[RW]
nesting[RW]
preprev_token[RW]
prev_token[RW]
set_nesting[RW]
shift[RW]
tokens[RW]

Public Class Methods

lex(input, syntax = nil, options: nil, collect_tokens: true, &block) click to toggle source
# File lib/regexp_parser/lexer.rb, line 16
# Convenience entry point: builds a fresh lexer instance and lexes +input+,
# forwarding all arguments (and an optional block) to Lexer#lex.
def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
  lexer = new
  lexer.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
end
Also aliased as: scan
scan(input, syntax = nil, options: nil, collect_tokens: true, &block)
Alias for: lex

Public Instance Methods

emit(token) click to toggle source
# File lib/regexp_parser/lexer.rb, line 71
# Hands +token+ to the user-supplied block when one was given, collecting
# the block's return value if collection is enabled; without a block the
# token itself is appended to the collected token list.
def emit(token)
  unless block
    tokens << token
    return
  end

  # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
  result = block.call(token)
  tokens << result if collect_tokens
end
lex(input, syntax = nil, options: nil, collect_tokens: true, &block) click to toggle source
# File lib/regexp_parser/lexer.rb, line 20
# Tokenizes +input+ via Regexp::Scanner, normalizing and validating each
# token against +syntax+ (a flavor name resolved through Regexp::Syntax.for,
# defaulting to Regexp::Syntax::CURRENT), tracking group/set/conditional
# nesting depths, and splitting literal runs or codepoint lists that are
# directly followed by a quantifier. Yields each token to +block+ if given.
# Returns the collected token Array when +collect_tokens+ is true, else nil.
def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
  syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT

  # reset all per-run state so the same lexer instance can be reused
  self.block = block
  self.collect_tokens = collect_tokens
  self.tokens = []
  self.prev_token = nil
  self.preprev_token = nil
  self.nesting = 0
  self.set_nesting = 0
  self.conditional_nesting = 0
  self.shift = 0

  Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
    type, token = *syntax.normalize(type, token)
    syntax.check! type, token

    # closing tokens reduce the depth BEFORE the token itself is built,
    # so a closer carries the depth of its enclosing scope
    ascend(type, token)

    # a quantifier binds only to the last character of a literal run or
    # the last codepoint of a codepoint list; split the previous token in
    # two and emit the first half immediately
    if (last = prev_token) &&
       type == :quantifier &&
       (
         (last.type == :literal         && (parts = break_literal(last))) ||
         (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
       )
      emit(parts[0])
      last = parts[1]
    end

    # `shift` compensates positions for text inserted by earlier splits
    current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                nesting, set_nesting, conditional_nesting)

    # tokens are emitted with a one-token delay so each can be linked to
    # its successor; condition tokens are merged into the preceding token
    # instead of being emitted separately
    if type == :conditional && CONDITION_TOKENS.include?(token)
      current = merge_condition(current, last)
    elsif last
      last.next = current
      current.previous = last
      emit(last)
    end

    self.preprev_token = last
    self.prev_token = current

    descend(type, token)
  end

  # flush the final buffered token
  emit(prev_token) if prev_token

  collect_tokens ? tokens : nil
end

Private Instance Methods

ascend(type, token) click to toggle source
# File lib/regexp_parser/lexer.rb, line 91
# Decrements the nesting counter matching +type+ when +token+ closes a
# group/assertion, character set, or conditional expression.
def ascend(type, token)
  return unless CLOSING_TOKENS.include?(token)

  case type
  when :group, :assertion then self.nesting -= 1
  when :set               then self.set_nesting -= 1
  when :conditional       then self.conditional_nesting -= 1
  else
    raise "unhandled nesting type #{type}"
  end
end
break_codepoint_list(token) click to toggle source

if a codepoint list is followed by a quantifier, that quantifier applies to the last codepoint only, e.g. /\u{61 62 63}{3}/ =~ 'abccc'; cf. break_literal.

# File lib/regexp_parser/lexer.rb, line 143
# Splits a codepoint-list token (e.g. "\u{61 62 63}") into two tokens so
# that a following quantifier applies only to the last codepoint.
# Returns [first_part, last_codepoint_part], or nil when the list holds a
# single codepoint (nothing to split). Cf. #break_literal.
def break_codepoint_list(token)
  # lead = everything before the final space, tail = the last codepoint
  lead, _, tail = token.text.rpartition(' ')
  return if lead.empty?

  # first part keeps the leading codepoints and is re-closed with '}'
  token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
            token.ts, (token.te - tail.length),
            nesting, set_nesting, conditional_nesting)
  # second part re-opens the last codepoint with '\u{'
  token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
            (token.ts + lead.length + 1), (token.te + 3),
            nesting, set_nesting, conditional_nesting)

  # the rewrite drops one space but adds \, u, {, and }, so all
  # subsequent token positions move right by 3
  self.shift = shift + 3 # one space less, but extra \, u, {, and }

  token_1.previous = preprev_token
  token_1.next = token_2
  token_2.previous = token_1 # .next will be set by #lex
  [token_1, token_2]
end
break_literal(token) click to toggle source

Called by #lex (also aliased as scan) to break a literal run that is longer than one character into two separate tokens when it is followed by a quantifier.

# File lib/regexp_parser/lexer.rb, line 123
# Splits a multi-character literal run into two tokens when a quantifier
# follows it, since the quantifier binds only to the final character.
# Returns [leading_part, final_char_part], or nil for single-char literals.
def break_literal(token)
  head, final_char, _ = token.text.partition(/.\z/mu)
  return if head.empty?

  first = Regexp::Token.new(:literal, :literal, head,
          token.ts, (token.te - final_char.length),
          nesting, set_nesting, conditional_nesting)
  second = Regexp::Token.new(:literal, :literal, final_char,
          (token.ts + head.length), token.te,
          nesting, set_nesting, conditional_nesting)

  first.previous = preprev_token
  first.next = second
  second.previous = first # .next will be set by #lex
  [first, second]
end
descend(type, token) click to toggle source
# File lib/regexp_parser/lexer.rb, line 106
# Increments the nesting counter matching +type+ when +token+ opens a
# group/assertion, character set, or conditional expression.
def descend(type, token)
  return unless OPENING_TOKENS.include?(token)

  case type
  when :group, :assertion then self.nesting += 1
  when :set               then self.set_nesting += 1
  when :conditional       then self.conditional_nesting += 1
  else
    raise "unhandled nesting type #{type}"
  end
end
merge_condition(current, last) click to toggle source
# File lib/regexp_parser/lexer.rb, line 162
# Folds a condition token into the preceding token, yielding one
# :conditional/:condition token spanning both texts and positions.
def merge_condition(current, last)
  merged = Regexp::Token.new(:conditional, :condition, last.text + current.text,
    last.ts, current.te, nesting, set_nesting, conditional_nesting)
  merged.previous = preprev_token # .next will be set by #lex
  merged
end