class Rouge::RegexLexer
@abstract A stateful lexer that uses sets of regular expressions to tokenize a string. Most lexers are instances of RegexLexer.
Constants
- MAX_NULL_SCANS
-
The number of successive scans permitted without consuming the input stream. If this is exceeded, the match fails.
Public Class Methods
Source
# File lib/rouge/regex_lexer.rb, line 250
# Swap the definition of an existing state for the result of calling
# `appended` on it with the given block.
#
# @param name the state name; converted with `to_sym` before lookup
# @raise [RuntimeError] "no such state ..." when `state_definitions` has no
#   entry for the name
def self.append(name, &b)
  key = name.to_sym
  existing = state_definitions[key] || raise("no such state #{key.inspect}")
  replace_state(key, existing.appended(&b))
end
Source
# File lib/rouge/regex_lexer.rb, line 257
# Resolve a state name to a State object, caching the result in `states`.
#
# A State passed in is returned untouched. Otherwise the name is converted
# with `to_sym`; on a cache miss the matching entry of `state_definitions`
# is turned into a state with `to_state(self)` and stored.
#
# @raise [RuntimeError] "unknown state: ..." when no definition exists
def self.get_state(name)
  return name if name.is_a?(State)

  cache = states
  key = name.to_sym
  cache[key] ||= begin
    definition = state_definitions[key] || raise("unknown state: #{name.inspect}")
    definition.to_state(self)
  end
end
@private
Source
# File lib/rouge/regex_lexer.rb, line 244
# Swap the definition of an existing state for the result of calling
# `prepended` on it with the given block.
#
# @param name the state name; converted with `to_sym` before lookup
# @raise [RuntimeError] "no such state ..." when `state_definitions` has no
#   entry for the name
def self.prepend(name, &b)
  key = name.to_sym
  existing = state_definitions[key] || raise("no such state #{key.inspect}")
  replace_state(key, existing.prepended(&b))
end
Source
# File lib/rouge/regex_lexer.rb, line 217 def self.replace_state(name, new_defn) states[name] = nil state_definitions[name] = new_defn end
Source
# File lib/rouge/regex_lexer.rb, line 233 def self.start(&b) start_procs << b end
Specify an action to be run every fresh lex.
@example
start { puts "I'm lexing a new string!" }
Source
# File lib/rouge/regex_lexer.rb, line 224
# Memoized list of start blocks for this class. On first access it is
# created as an InheritableList wrapping the superclass's `start_procs`.
def self.start_procs
  return @start_procs if @start_procs

  @start_procs = InheritableList.new(superclass.start_procs)
end
The routines to run at the beginning of a fresh lex. @see start
Source
# File lib/rouge/regex_lexer.rb, line 239
# Register a state definition under the given name.
#
# The name is converted with `to_sym`; a StateDSL built from that symbol and
# the block is stored in `state_definitions`, replacing any previous entry.
#
# @return the newly created StateDSL
def self.state(name, &b)
  key = name.to_sym
  state_definitions[key] = StateDSL.new(key, &b)
end
Define a new state for this lexer with the given name. The block will be evaluated in the context of a {StateDSL}.
Source
# File lib/rouge/regex_lexer.rb, line 212
# Memoized table of state definitions for this class. On first access it is
# created as an InheritableHash wrapping the superclass's
# `state_definitions`.
def self.state_definitions
  return @state_definitions if @state_definitions

  @state_definitions = InheritableHash.new(superclass.state_definitions)
end
Source
# File lib/rouge/regex_lexer.rb, line 208
# Memoized hash used as the per-class cache of built states. It starts out
# empty and the same Hash object is returned on every call.
def self.states
  @states = {} unless @states
  @states
end
The states hash for this lexer. @see state
Public Instance Methods
Source
# File lib/rouge/regex_lexer.rb, line 419
# Hand a piece of text to another lexer and re-emit every token it produces
# through this lexer's `yield_token`.
#
# @param lexer an object responding to `continue_lex`
# @param text the text to lex; when nil (or false) the whole of the current
#   match, `@current_stream[0]`, is used
def delegate(lexer, text=nil)
  if @debug
    puts " delegating to: #{lexer.inspect}"
  end

  source = text || @current_stream[0]

  lexer.continue_lex(source) do |type, value|
    puts " delegated token: #{type.inspect}, #{value.inspect}" if @debug
    yield_token(type, value)
  end
end
Delegate the lex to another lexer. We use the `continue_lex` method so that `reset!` will not be called. In this way, a single lexer can be repeatedly delegated to while maintaining its own internal state stack.
@param [#lex] lexer
The lexer or lexer class to delegate to
@param [String] text
The text to delegate. This defaults to the last matched string.
Source
# File lib/rouge/regex_lexer.rb, line 267 def get_state(state_name) self.class.get_state(state_name) end
@private
Source
# File lib/rouge/regex_lexer.rb, line 463
# Overwrite the top entry of the state stack with the named state.
#
# @param state_name anything accepted by `get_state`
# @raise [RuntimeError] 'empty stack!' when the stack has no entries
# @return the state that was placed on top
def goto(state_name)
  frames = stack
  raise 'empty stack!' if frames.empty?

  puts " going to: state :#{state_name} " if @debug
  frames[-1] = get_state(state_name)
end
replace the head of the stack with the given state
Source
# File lib/rouge/regex_lexer.rb, line 398
# Deprecated entry point: it unconditionally raises, pointing callers to
# `groups`. The argument is ignored.
#
# @raise [RuntimeError] always
def group(tok)
  raise RuntimeError, 'RegexLexer#group is deprecated: use #groups instead'
end
@deprecated
Yield a token with the next matched group. Subsequent calls to this method will yield subsequent groups.
Source
# File lib/rouge/regex_lexer.rb, line 404
# Emit one token per argument, pairing the first token type with capture
# group 1 of the current match, the second with group 2, and so on.
#
# @param tokens the token types, in capture-group order
def groups(*tokens)
  # Group 0 is the whole match, so numbering starts at 1.
  tokens.each.with_index(1) do |type, group_number|
    yield_token(type, @current_stream[group_number])
  end
end
Yield tokens corresponding to the matched groups of the current match.
Source
# File lib/rouge/regex_lexer.rb, line 478
# Check whether any entry of the state stack has the given name.
#
# @param state_name the name to look for; converted once with `to_sym`
# @return [Boolean] true when some stack entry's `name` equals that symbol
def in_state?(state_name)
  # Convert once up front; the original re-applied to_sym to the
  # already-converted symbol for every stack entry.
  wanted = state_name.to_sym
  stack.any? { |entry| entry.name == wanted }
end
Check if `state_name` is in the state stack.
Source
# File lib/rouge/regex_lexer.rb, line 452
# Remove entries from the top of the state stack.
#
# @param times how many entries to remove (passed to Array#pop)
# @raise [RuntimeError] 'empty stack!' when the stack is already empty
# @return [nil]
def pop!(times=1)
  frames = stack
  raise 'empty stack!' if frames.empty?

  puts " popping stack: #{times}" if @debug
  frames.pop(times)

  nil
end
Pop the state stack. If a number is passed in, it will be popped that number of times.
Source
# File lib/rouge/regex_lexer.rb, line 436
# Push a state onto the state stack. The state is chosen as follows:
#
# * a given `state_name` is resolved through `get_state`;
# * otherwise a given block is wrapped in a StateDSL (named after the
#   block's `inspect`) and converted with `to_state(self.class)`;
# * otherwise the current top-of-stack state is pushed again.
#
# @return the stack after the push
def push(state_name=nil, &b)
  target =
    if state_name
      get_state(state_name)
    elsif b
      StateDSL.new(b.inspect, &b).to_state(self.class)
    else
      # No name and no block: duplicate the state currently on top.
      state
    end

  puts " pushing: :#{target.name}" if @debug
  stack.push(target)
end
Push a state onto the stack. If no state name is given and you’ve passed a block, a state will be dynamically created using the {StateDSL}.
Source
# File lib/rouge/regex_lexer.rb, line 429 def recurse(text=nil) delegate(self.class, text) end
Source
# File lib/rouge/regex_lexer.rb, line 288
# Put the lexer back into its pre-lex condition: forget the state stack and
# the current stream, then run every block in the class's `start_procs`
# with `instance_eval`, so each runs with this lexer as `self`.
def reset!
  @stack = nil
  @current_stream = nil

  hooks = self.class.start_procs
  puts "start blocks" if @debug && hooks.any?

  hooks.each { |hook| instance_eval(&hook) }
end
reset this lexer to its initial state. This runs all of the start_procs.
Source
# File lib/rouge/regex_lexer.rb, line 471
# Empty the state stack in place and push the :root state back onto it.
#
# @return the stack, now holding only the :root state
def reset_stack
  if @debug
    puts ' resetting stack'
  end

  # Array#clear returns the array itself, so the push lands on the same stack.
  stack.clear.push(get_state(:root))
end
reset the stack back to `[:root]`.
Source
# File lib/rouge/regex_lexer.rb, line 274
# The state stack, created on first access as a one-element array holding
# the :root state and memoized in `@stack` afterwards.
def stack
  return @stack if @stack

  @stack = [get_state(:root)]
end
The state stack. This is initially the single state `[:root]`. It is an error for this stack to be empty. @see state
Source
# File lib/rouge/regex_lexer.rb, line 282
# The state on top of the state stack.
#
# @raise [RuntimeError] 'empty stack!' when the stack has no top entry
def state
  top = stack.last
  raise 'empty stack!' unless top

  top
end
The current state - i.e. one on top of the state stack.
NB: if the state stack is empty, this will throw an error rather than returning nil.
Source
# File lib/rouge/regex_lexer.rb, line 486
# Check whether the state on top of the stack has the given name.
#
# @param state_name the name to compare; converted with `to_sym`
# @return [Boolean] result of comparing that symbol with `state.name`
def state?(state_name)
  wanted = state_name.to_sym
  wanted == state.name
end
Check if `state_name` is the state on top of the state stack.
Source
# File lib/rouge/regex_lexer.rb, line 344 def step(state, stream) state.rules.each do |rule| if rule.is_a?(State) puts " entering: mixin :#{rule.name}" if @debug return true if step(rule, stream) puts " exiting: mixin :#{rule.name}" if @debug else puts " trying: #{rule.inspect}" if @debug # XXX HACK XXX # StringScanner's implementation of ^ is b0rken. # see http://bugs.ruby-lang.org/issues/7092 # TODO: this doesn't cover cases like /(a|^b)/, but it's # the most common, for now... next if rule.beginning_of_line && !stream.beginning_of_line? if (size = stream.skip(rule.re)) puts " got: #{stream[0].inspect}" if @debug instance_exec(stream, &rule.callback) if size.zero? @null_steps += 1 if @null_steps > MAX_NULL_SCANS puts " warning: too many scans without consuming the string!" if @debug return false end else @null_steps = 0 end return true end end end false end
Runs one step of the lex. Rules in the current state are tried until one matches, at which point its callback is called.
@return true if a rule was tried successfully
@return false otherwise.
Source
# File lib/rouge/regex_lexer.rb, line 310
# Lex `str`, passing each (token, value) pair to the given block.
#
# A StringScanner over `str` is consumed by repeated calls to `step` with
# the current top-of-stack state. Whenever `step` returns false, a single
# character is taken from the scanner and handed to the block with
# Token::Tokens::Error.
#
# @param str [String] the text to lex
def stream_tokens(str, &b)
  scanner = StringScanner.new(str)

  # Per-run bookkeeping read by the callbacks and by step/yield_token.
  @current_stream = scanner
  @output_stream = b
  @states = self.class.states
  @null_steps = 0

  until scanner.eos?
    if @debug
      puts
      puts "lexer: #{self.class.tag}"
      puts "stack: #{stack.map(&:name).map(&:to_sym).inspect}"
      puts "stream: #{scanner.peek(20).inspect}"
    end

    next if step(state, scanner)

    puts " no match, yielding Error" if @debug
    b.call(Token::Tokens::Error, scanner.getch)
  end
end
This implements the lexer protocol, by yielding [token, value] pairs.
The process for lexing works as follows, until the stream is empty:
1. We look at the state on top of the stack (which by default is `[:root]`).
2. Each rule in that state is tried until one is successful. If one is found, that rule’s callback is evaluated - which may yield tokens and manipulate the state stack. Otherwise, one character is consumed with an `'Error'` token, and we continue at (1.)
Source
# File lib/rouge/regex_lexer.rb, line 390 def token(tok, val=@current_stream[0]) yield_token(tok, val) end
Yield a token.
@param tok
the token type
@param val
(optional) the string value to yield. If absent, this defaults to the entire last match.
Private Instance Methods
Source
# File lib/rouge/regex_lexer.rb, line 491
# Pass a (token, value) pair to `@output_stream`, unless the value is nil
# or empty, in which case nothing is emitted and nil is returned.
#
# @param tok the token type (must respond to `qualname` when @debug is set)
# @param val the token text
def yield_token(tok, val)
  nothing_to_emit = val.nil? || val.empty?
  return if nothing_to_emit

  if @debug
    puts " yielding: #{tok.qualname}, #{val.inspect}"
  end

  @output_stream.yield(tok, val)
end