class Rouge::RegexLexer
@abstract A stateful lexer that uses sets of regular expressions to tokenize a string. Most lexers are instances of RegexLexer.
Constants
- MAX_NULL_SCANS: The number of successive scans permitted without consuming the input stream. If this is exceeded, the match fails.
Public Class Methods
Source
# File lib/rouge/regex_lexer.rb, line 263
# Replace an already-defined state with the result of calling `appended`
# on its current definition with the given block.
# NOTE(review): presumably `appended` adds the block's rules after the
# existing ones -- confirm against StateDSL.
#
# @param name [#to_sym] name of an existing state
# @raise [RuntimeError] if no state definition is registered under that name
def self.append(name, &b)
  key = name.to_sym
  existing = state_definitions[key]
  raise "no such state #{key.inspect}" unless existing

  replace_state(key, existing.appended(&b))
end
Source
# File lib/rouge/regex_lexer.rb, line 270
# Resolve a state name to a state object for this lexer class. The state is
# built from its definition on first use and cached in `states`.
#
# @param name [State, #to_sym] a State (returned unchanged) or a state name
# @return the State passed in, or the cached / newly built result of
#   `to_state` on the matching definition
# @raise [RuntimeError] if no definition is registered for the name
def self.get_state(name)
  return name if name.is_a?(State)

  key = name.to_sym
  cached = states[key]
  return cached if cached

  definition = state_definitions[key]
  # The message deliberately reports the name exactly as the caller passed it.
  raise "unknown state: #{name.inspect}" unless definition

  states[key] = definition.to_state(self)
end
@private
Source
# File lib/rouge/regex_lexer.rb, line 257
# Replace an already-defined state with the result of calling `prepended`
# on its current definition with the given block.
# NOTE(review): presumably `prepended` adds the block's rules before the
# existing ones -- confirm against StateDSL.
#
# @param name [#to_sym] name of an existing state
# @raise [RuntimeError] if no state definition is registered under that name
def self.prepend(name, &b)
  key = name.to_sym
  existing = state_definitions[key]
  raise "no such state #{key.inspect}" unless existing

  replace_state(key, existing.prepended(&b))
end
Source
# File lib/rouge/regex_lexer.rb, line 230
# Install a new definition for a state and clear its cached entry in
# `states`.
#
# @param name the key under which the state is stored (callers in this
#   class pass a Symbol)
# @param new_defn the definition to register
# @return the definition that was stored
def self.replace_state(name, new_defn)
  # Reset the cache slot to nil rather than deleting the key.
  states.store(name, nil)
  state_definitions[name] = new_defn
end
Source
# File lib/rouge/regex_lexer.rb, line 246
# Register a block in `start_procs`, the routines run at the beginning of
# a fresh lex.
#
# @example
#   start { puts "I'm lexing a new string!" }
def self.start(&block)
  start_procs << block
end
Specify an action to be run every fresh lex.
@example
start { puts "I'm lexing a new string!" }
Source
# File lib/rouge/regex_lexer.rb, line 237
# The list of start routines for this lexer class, created on first access
# as an InheritableList wrapping the superclass's list.
#
# @see start
def self.start_procs
  return @start_procs if @start_procs

  @start_procs = InheritableList.new(superclass.start_procs)
end
The routines to run at the beginning of a fresh lex. @see start
Source
# File lib/rouge/regex_lexer.rb, line 252
# Define (or overwrite) a state for this lexer. The block is handed to
# StateDSL.new together with the symbolized name.
#
# @param name [#to_sym] the state's name
# @return the newly created StateDSL
def self.state(name, &b)
  key = name.to_sym
  state_definitions[key] = StateDSL.new(key, &b)
end
Define a new state for this lexer with the given name. The block will be evaluated in the context of a {StateDSL}.
Source
# File lib/rouge/regex_lexer.rb, line 225
# The state definitions for this lexer class, created on first access as
# an InheritableHash wrapping the superclass's definitions.
def self.state_definitions
  return @state_definitions if @state_definitions

  @state_definitions = InheritableHash.new(superclass.state_definitions)
end
Source
# File lib/rouge/regex_lexer.rb, line 221
# The memoized hash of states for this lexer; starts out empty.
#
# @return [Hash]
# @see state
def self.states
  return @states if @states

  @states = {}
end
The states hash for this lexer. @see state
Public Instance Methods
Source
# File lib/rouge/regex_lexer.rb, line 432
# Hand a piece of text to another lexer and re-emit every token it
# produces through this lexer's own output. Uses `continue_lex` on the
# target lexer.
#
# @param lexer [#continue_lex] the lexer (or lexer class) to delegate to
# @param text [String, nil] the text to delegate; when nil or false, the
#   last matched string (`@current_stream[0]`) is used
def delegate(lexer, text=nil)
  if @debug
    puts " delegating to: #{lexer.inspect}"
  end

  source = text || @current_stream[0]

  lexer.continue_lex(source) do |tok, val|
    if @debug
      puts " delegated token: #{tok.inspect}, #{val.inspect}"
    end
    yield_token(tok, val)
  end
end
Delegate the lex to another lexer. We use the `continue_lex` method so that `reset!` will not be called. In this way, a single lexer can be repeatedly delegated to while maintaining its own internal state stack.
@param [#lex] lexer
The lexer or lexer class to delegate to
@param [String] text
The text to delegate. This defaults to the last matched string.
Source
# File lib/rouge/regex_lexer.rb, line 280
# Instance-level shortcut that forwards to the class-level `get_state`.
#
# @param state_name the state name (or state) to resolve
# @private
def get_state(state_name)
  lexer_class = self.class
  lexer_class.get_state(state_name)
end
@private
Source
# File lib/rouge/regex_lexer.rb, line 476
# Replace the state on top of the stack with the given state.
#
# @param state_name the state to switch to
# @raise [RuntimeError] if the stack is empty
def goto(state_name)
  frames = stack
  raise 'empty stack!' if frames.empty?

  if @debug
    puts " going to: state :#{state_name} "
  end

  frames[-1] = get_state(state_name)
end
replace the head of the stack with the given state
Source
# File lib/rouge/regex_lexer.rb, line 411
# Removed API kept only to produce a helpful error: always raises.
#
# @deprecated use {#groups} instead
# @param tok ignored
# @raise [RuntimeError] unconditionally
def group(tok)
  message = "RegexLexer#group is deprecated: use #groups instead"
  raise message
end
@deprecated
Yield a token with the next matched group. Subsequent calls to this method will yield subsequent groups.
Source
# File lib/rouge/regex_lexer.rb, line 417
# Yield one token per capture group of the current match: the first token
# type is paired with group 1, the second with group 2, and so on.
#
# @param tokens the token types, in capture-group order
def groups(*tokens)
  # Capture groups are 1-based; index 0 is the whole match.
  tokens.each.with_index(1) do |tok, group_number|
    yield_token(tok, @current_stream[group_number])
  end
end
Yield tokens corresponding to the matched groups of the current match.
Source
# File lib/rouge/regex_lexer.rb, line 491
# Check whether a state with the given name is anywhere on the state stack.
#
# @param state_name [#to_sym] the state name to look for
# @return [Boolean] true if any state on the stack has that name
def in_state?(state_name)
  # Convert once up front; the original repeated the conversion for every
  # stack entry even though the name was already a Symbol.
  wanted = state_name.to_sym
  stack.any? { |entry| entry.name == wanted }
end
Check if `state_name` is in the state stack.
Source
# File lib/rouge/regex_lexer.rb, line 465
# Pop states off the top of the state stack.
#
# @param times [Integer] how many states to pop (default 1)
# @return [nil]
# @raise [RuntimeError] if the stack is already empty
def pop!(times=1)
  frames = stack
  raise 'empty stack!' if frames.empty?

  if @debug
    puts " popping stack: #{times}"
  end

  frames.pop(times)
  nil
end
Pop the state stack. If a number is passed in, it will be popped that number of times.
Source
# File lib/rouge/regex_lexer.rb, line 449
# Push a state onto the stack. The state is chosen as follows:
# - a given `state_name` is resolved through `get_state`;
# - otherwise a given block is used to build a state on the fly via StateDSL;
# - otherwise the current top of the stack is pushed again.
#
# @param state_name the state to push, or nil
# @return the state stack
def push(state_name=nil, &b)
  target =
    if state_name
      get_state(state_name)
    elsif b
      StateDSL.new(b.inspect, &b).to_state(self.class)
    else
      # no name and no block: duplicate the current top of the stack
      state
    end

  if @debug
    puts " pushing: :#{target.name}"
  end

  stack << target
end
Push a state onto the stack. If no state name is given and you’ve passed a block, a state will be dynamically created using the {StateDSL}.
Source
# File lib/rouge/regex_lexer.rb, line 442
# Delegate to this lexer's own class via `delegate`.
#
# @param text [String, nil] the text to lex; `delegate` supplies its
#   default when this is nil
def recurse(text=nil)
  own_class = self.class
  delegate(own_class, text)
end
Source
# File lib/rouge/regex_lexer.rb, line 301
# Return this lexer to its initial state: forget the state stack and the
# current stream, then run every registered start proc in the context of
# this instance.
def reset!
  @stack = @current_stream = nil

  procs = self.class.start_procs
  puts "start blocks" if @debug && procs.any?

  procs.each { |pr| instance_eval(&pr) }
end
reset this lexer to its initial state. This runs all of the start_procs.
Source
# File lib/rouge/regex_lexer.rb, line 484
# Empty the state stack in place and push the :root state back onto it.
#
# @return the state stack
def reset_stack
  if @debug
    puts ' resetting stack'
  end

  stack.clear.push(get_state(:root))
end
Reset the stack back to `[:root]`.
Source
# File lib/rouge/regex_lexer.rb, line 287
# The state stack, created on first access with the :root state as its
# only element.
#
# @return [Array]
# @see state
def stack
  return @stack if @stack

  @stack = [get_state(:root)]
end
The state stack. This is initially the single state `[:root]`. It is an error for this stack to be empty. @see state
Source
# File lib/rouge/regex_lexer.rb, line 295
# The current state, i.e. the one on top of the state stack.
#
# @return the last element of the stack
# @raise [RuntimeError] if the stack is empty (never returns nil)
def state
  top = stack.last
  raise 'empty stack!' unless top

  top
end
The current state - i.e. one on top of the state stack.
NB: if the state stack is empty, this will throw an error rather than returning nil.
Source
# File lib/rouge/regex_lexer.rb, line 499
# Check whether the state on top of the stack has the given name.
#
# @param state_name [#to_sym] the name to compare against
# @return [Boolean]
def state?(state_name)
  wanted = state_name.to_sym
  wanted == state.name
end
Check if `state_name` is the state on top of the state stack.
Source
# File lib/rouge/regex_lexer.rb, line 357
# Run one step of the lex: try the rules of `state` in order until one
# matches at the current position of `stream`, then run its callback.
# Entries that are themselves States (mixins) are tried recursively.
#
# @param state the state whose rules are tried
# @param stream the scanner positioned at the text to lex
# @return [true] if a rule matched and its callback ran
# @return [false] if nothing matched, or if more than MAX_NULL_SCANS
#   consecutive matches consumed no input
def step(state, stream)
  state.rules.each do |rule|
    if rule.is_a?(State)
      puts " entering: mixin :#{rule.name}" if @debug
      return true if step(rule, stream)
      puts " exiting: mixin :#{rule.name}" if @debug
      next
    end

    puts " trying: #{rule.inspect}" if @debug

    # XXX HACK XXX
    # StringScanner's implementation of ^ is b0rken.
    # see http://bugs.ruby-lang.org/issues/7092
    # TODO: this doesn't cover cases like /(a|^b)/, but it's
    # the most common, for now...
    next if rule.beginning_of_line && !stream.beginning_of_line?

    size = stream.skip(rule.re)
    next unless size

    puts " got: #{stream[0].inspect}" if @debug

    instance_exec(stream, &rule.callback)

    if size.zero?
      # A zero-width match: count it so a run of them cannot go on forever.
      @null_steps += 1
      if @null_steps > MAX_NULL_SCANS
        puts " warning: too many scans without consuming the string!" if @debug
        return false
      end
    else
      @null_steps = 0
    end

    return true
  end

  false
end
Runs one step of the lex. Rules in the current state are tried until one matches, at which point its callback is called.
@return true if a rule was tried successfully
@return false otherwise.
Source
# File lib/rouge/regex_lexer.rb, line 323
# Lex `str`, passing each (token, value) pair to the given block.
#
# Until the input is exhausted, the rules of the state on top of the stack
# are tried via `step`. When no rule matches, a single character is
# consumed and emitted as an Error token.
#
# @param str [String] the text to lex
def stream_tokens(str, &b)
  scanner = StringScanner.new(str)

  @current_stream = scanner
  @output_stream = b
  @states = self.class.states
  @null_steps = 0

  until scanner.eos?
    if @debug
      puts
      puts "lexer: #{self.class.tag}"
      puts "stack: #{stack.map(&:name).map(&:to_sym).inspect}"
      puts "stream: #{scanner.peek(20).inspect}"
    end

    next if step(state, scanner)

    # Nothing matched: consume one character so the loop always advances.
    puts " no match, yielding Error" if @debug
    b.call(Token::Tokens::Error, scanner.getch)
  end
end
This implements the lexer protocol, by yielding [token, value] pairs.
The process for lexing works as follows, until the stream is empty:
1. We look at the state on top of the stack (which by default is `[:root]`).
2. Each rule in that state is tried until one is successful. If one is found, that rule's callback is evaluated - which may yield tokens and manipulate the state stack. Otherwise, one character is consumed with an `Error` token, and we continue at (1.)
Source
# File lib/rouge/regex_lexer.rb, line 403
# Emit a single token through `yield_token`.
#
# @param tok the token type
# @param val (optional) the string value to emit; when omitted, the whole
#   of the last match (`@current_stream[0]`) is used
def token(tok, val=@current_stream[0])
  yield_token(tok, val)
end
Yield a token.
@param tok
the token type
@param val
(optional) the string value to yield. If absent, this defaults to the entire last match.
Private Instance Methods
Source
# File lib/rouge/regex_lexer.rb, line 504
# Pass a (token, value) pair to the output block stored in
# `@output_stream`. Nil and empty values are silently dropped.
#
# @param tok the token type
# @param val [String, nil] the token's text
def yield_token(tok, val)
  return if val.nil?
  return if val.empty?

  if @debug
    puts " yielding: #{tok.qualname}, #{val.inspect}"
  end

  @output_stream.yield(tok, val)
end