class Regexp::Lexer
A very thin wrapper around the scanner that breaks quantified literal runs, collects emitted tokens into an array, calculates their nesting depth, normalizes tokens for the parser, and checks whether they are implemented by the given syntax flavor.
Constants
- CLOSING_TOKENS
- CONDITION_TOKENS
- OPENING_TOKENS
Attributes
block[RW]
collect_tokens[RW]
conditional_nesting[RW]
nesting[RW]
preprev_token[RW]
prev_token[RW]
set_nesting[RW]
shift[RW]
tokens[RW]
Public Class Methods
lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 16
#
# Convenience entry point: builds a fresh lexer instance and delegates to
# its #lex, forwarding all arguments and the optional block unchanged.
def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
  new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
end
Also aliased as: scan
Public Instance Methods
emit(token)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 71
#
# Hands a finished token to the caller-supplied block (if any); collects
# it (or the block's result) into +tokens+ depending on +collect_tokens+.
def emit(token)
  # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
  if block.nil?
    tokens << token
  else
    processed = block.call(token)
    tokens << processed if collect_tokens
  end
end
lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 20
#
# Lexes +input+: runs Regexp::Scanner over it, normalizes and validates
# each emitted token against +syntax+, tracks group/set/conditional
# nesting depth, splits quantified literal runs / codepoint lists, and
# links consecutive tokens via #previous/#next.
#
# input          - pattern handed straight to Regexp::Scanner.scan
# syntax         - flavor name resolved via Regexp::Syntax.for, or nil
#                  for Regexp::Syntax::CURRENT
# options        - forwarded to Regexp::Scanner.scan
# collect_tokens - when false, tokens are not accumulated for the return
#                  value (they are still passed to +block+ if given)
# block          - optional; receives each finalized token (see #emit)
#
# Returns the collected token array, or nil when collect_tokens is false.
def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
  syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT

  # reset all per-run state
  self.block = block
  self.collect_tokens = collect_tokens
  self.tokens = []
  self.prev_token = nil
  self.preprev_token = nil
  self.nesting = 0
  self.set_nesting = 0
  self.conditional_nesting = 0
  self.shift = 0

  Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
    type, token = *syntax.normalize(type, token)
    syntax.check! type, token

    ascend(type, token)

    # A quantifier after a literal run or codepoint list applies only to
    # the last element, so split the previous token in two and emit the
    # unaffected head immediately.
    if (last = prev_token) &&
       type == :quantifier &&
       (
         (last.type == :literal && (parts = break_literal(last))) ||
         (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
       )
      emit(parts[0])
      last = parts[1]
    end

    current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                nesting, set_nesting, conditional_nesting)

    if type == :conditional && CONDITION_TOKENS.include?(token)
      current = merge_condition(current, last)
    elsif last
      last.next = current
      current.previous = last
      emit(last)
    end

    self.preprev_token = last
    self.prev_token = current

    descend(type, token)
  end

  # tokens are emitted one iteration late so links can be set; flush the
  # final held-back token now
  emit(prev_token) if prev_token

  collect_tokens ? tokens : nil
end
Private Instance Methods
ascend(type, token)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 91
#
# Decrements the nesting counter matching +type+ when +token+ is a
# closing token (member of CLOSING_TOKENS); no-op otherwise.
# Raises for unrecognized nesting types.
def ascend(type, token)
  return unless CLOSING_TOKENS.include?(token)

  case type
  when :group, :assertion then self.nesting = nesting - 1
  when :set               then self.set_nesting = set_nesting - 1
  when :conditional       then self.conditional_nesting = conditional_nesting - 1
  else raise "unhandled nesting type #{type}"
  end
end
break_codepoint_list(token)
click to toggle source
If a codepoint list is followed by a quantifier, that quantifier applies only to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'. Cf. break_literal.
# File lib/regexp_parser/lexer.rb, line 143
#
# Splits a codepoint-list token (e.g. "\u{61 62 63}") into two tokens so
# that a following quantifier applies only to the last codepoint.
# Returns nil when the list holds a single codepoint (nothing to split);
# otherwise returns [head_token, tail_token], linked to each other.
def break_codepoint_list(token)
  head, _sep, last_codepoint = token.text.rpartition(' ')
  return if head.empty?

  head_token = Regexp::Token.new(:escape, :codepoint_list, head + '}',
                                 token.ts, token.te - last_codepoint.length,
                                 nesting, set_nesting, conditional_nesting)
  tail_token = Regexp::Token.new(:escape, :codepoint_list, '\u{' + last_codepoint,
                                 token.ts + head.length + 1, token.te + 3,
                                 nesting, set_nesting, conditional_nesting)

  # one space less, but extra \, u, {, and } — subsequent token positions
  # move right by three
  self.shift = shift + 3

  head_token.previous = preprev_token
  head_token.next = tail_token
  tail_token.previous = head_token # tail_token.next will be set by #lex
  [head_token, tail_token]
end
break_literal(token)
click to toggle source
Called by scan (the alias of lex) to break a literal run that is longer than one character into two separate tokens when it is followed by a quantifier.
# File lib/regexp_parser/lexer.rb, line 123
#
# Splits a multi-character literal run into two tokens so that a
# following quantifier applies only to its final character.
# Returns nil when the literal is a single character (nothing to split);
# otherwise returns [head_token, tail_token], linked to each other.
def break_literal(token)
  head_text, tail_text, _rest = token.text.partition(/.\z/mu)
  return if head_text.empty?

  head = Regexp::Token.new(:literal, :literal, head_text,
                           token.ts, token.te - tail_text.length,
                           nesting, set_nesting, conditional_nesting)
  tail = Regexp::Token.new(:literal, :literal, tail_text,
                           token.ts + head_text.length, token.te,
                           nesting, set_nesting, conditional_nesting)

  head.previous = preprev_token
  head.next = tail
  tail.previous = head # tail.next will be set by #lex
  [head, tail]
end
descend(type, token)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 106
#
# Increments the nesting counter matching +type+ when +token+ is an
# opening token (member of OPENING_TOKENS); no-op otherwise.
# Raises for unrecognized nesting types.
def descend(type, token)
  return unless OPENING_TOKENS.include?(token)

  case type
  when :group, :assertion then self.nesting = nesting + 1
  when :set               then self.set_nesting = set_nesting + 1
  when :conditional       then self.conditional_nesting = conditional_nesting + 1
  else raise "unhandled nesting type #{type}"
  end
end
merge_condition(current, last)
click to toggle source
# File lib/regexp_parser/lexer.rb, line 162
#
# Folds a condition token into the preceding conditional token,
# producing a single :conditional/:condition token spanning both texts.
def merge_condition(current, last)
  merged = Regexp::Token.new(:conditional, :condition, last.text + current.text,
                             last.ts, current.te,
                             nesting, set_nesting, conditional_nesting)
  merged.previous = preprev_token # merged.next will be set by #lex
  merged
end