class TwitterCldr::Parsers::UnicodeRegexParser
Constants
- BINARY_OPERATORS
- CHARACTER_CLASS_TOKEN_TYPES
- NEGATED_TOKEN_TYPES
- RANGED_CHARACTER_CLASS_TOKEN_TYPES
Types that are allowed to be used in character ranges.
- UNARY_OPERATORS
Public Instance Methods
parse(tokens, options = {})
click to toggle source
Calls superclass method
TwitterCldr::Parsers::Parser#parse
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 20
#
# Entry point: prepares the token list and hands it to the superclass parser.
#
# The tokens first go through substitute_variables (using
# options[:symbol_table]), then through preprocess, and the prepared list is
# passed to the superclass implementation together with the options.
def parse(tokens, options = {})
  expanded = substitute_variables(tokens, options[:symbol_table])
  prepared = preprocess(expanded)
  super(prepared, options)
end
Private Instance Methods
add_implicit_union(operator_stack, open_count)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 268
#
# Pushes a :union operator token onto operator_stack when the token that
# follows the current one is a valid character class token and at least one
# bracket is still open (open_count > 0). Otherwise does nothing and
# returns nil.
def add_implicit_union(operator_stack, open_count)
  upcoming = @tokens[@token_index + 1]
  return unless upcoming
  return unless valid_character_class_token?(upcoming) && open_count > 0

  operator_stack.push(make_token(:union))
end
binary_operator?(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 127
#
# True when the token's type is listed in BINARY_OPERATORS.
# A nil/false token is returned as-is (i.e. falsy).
def binary_operator?(token)
  return token unless token

  BINARY_OPERATORS.include?(token.type)
end
binary_operator_node(operator, right, left)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 280
#
# Wraps two operands in a CharacterClass::BinaryOperator node.
#
# Note the parameter order: callers pass the right operand first (it is the
# one popped first from the operand stack), but the node is constructed as
# (operator, left, right).
def binary_operator_node(operator, right, left)
  CharacterClass::BinaryOperator.new(operator, left, right)
end
build_until_open(operator_stack, operand_stack)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 229
#
# Called on a closing token. Pops operators off operator_stack, turning each
# one into a node that is pushed onto operand_stack, until the operator on
# top of the stack is the opening type that matches the current (closing)
# token. That opening operator is then popped and returned.
def build_until_open(operator_stack, operand_stack)
  top = peek(operator_stack)
  opening_type = CharacterClass.opening_type_for(current_token.type)

  while top.type != opening_type
    popped = operator_stack.pop
    operand_stack.push(get_operator_node(popped, operand_stack))
    top = peek(operator_stack)
  end

  # discard the matching opening token
  operator_stack.pop
end
character_class()
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 196
#
# Parses one (possibly nested) bracketed character class using an operator
# stack and an operand stack, and returns it wrapped in a CharacterClass.
#
# The loop consumes tokens until every opened bracket has been closed and
# no operators remain on the operator stack.
def character_class
  operators = []
  operands = []
  depth = 0 # number of currently open brackets

  loop do
    token_type = current_token.type

    case token_type
    when *CharacterClass.closing_types
      depth -= 1
      build_until_open(operators, operands)
      add_implicit_union(operators, depth)
    when *CharacterClass.opening_types
      depth += 1
      operators.push(current_token)
    when *(BINARY_OPERATORS + UNARY_OPERATORS)
      operators.push(current_token)
    else
      # plain operand: dispatch to the method named after the token type
      add_implicit_union(operators, depth)
      operands.push(send(token_type, current_token))
    end

    next_token(token_type)
    break if operators.empty? && depth == 0
  end

  CharacterClass.new(operands.pop)
end
character_range(token)
click to toggle source
The token is already a CharacterRange object, so it is returned unchanged.
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 192
#
# Handler for :character_range entries. Nothing needs to be built here, so
# the argument is handed back untouched.
def character_range(token)
  token
end
character_set(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 149
#
# Builds a CharacterSet from the token's value after removing a leading
# "\p" and every brace, square bracket and colon.
def character_set(token)
  without_prefix = token.value.gsub(/^\\p/, "")
  set_name = without_prefix.gsub(/[\{\}\[\]:]/, "")
  CharacterSet.new(set_name)
end
do_parse(options)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 131
#
# Walks the token stream until it is exhausted and returns the list of
# parsed elements.
#
# * :open_bracket starts a character class (character_class consumes its
#   own tokens).
# * :union tokens are skipped.
# * anything else is dispatched to the method named after the token type,
#   then the token is consumed.
def do_parse(options)
  parsed = []

  while (token = current_token)
    type = token.type

    if type == :open_bracket
      parsed << character_class
    elsif type == :union
      next_token(:union)
    else
      parsed << send(type, token)
      next_token(type)
    end
  end

  parsed
end
escaped_character(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 179
#
# Wraps the token's value in a Literal.
def escaped_character(token)
  text = token.value
  Literal.new(text)
end
get_non_range_dash_node(operator, operand_stack)
click to toggle source
Most regular expression engines allow character classes to contain a literal hyphen character as the first character. For example, [-abc] is a legal expression. It denotes a character class that contains the characters '-', 'a', 'b', and 'c'. For example, /[-abc]*/ =~ '-ba' returns 0 in Ruby.
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 262
#
# Builds the node for a dash that is not acting as a binary operator: a
# :union of a literal '-' string (left side) with the operand popped from
# operand_stack (right side). The operator argument itself is not used.
def get_non_range_dash_node(operator, operand_stack)
  right_operand = operand_stack.pop
  dash_literal = string(make_token(:string, '-'))
  binary_operator_node(:union, right_operand, dash_literal)
end
get_operator_node(operator, operand_stack)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 243
#
# Turns an operator token into a tree node, taking its operand(s) from
# operand_stack.
#
# * A :dash with fewer than two operands available is treated as a literal
#   dash (see get_non_range_dash_node).
# * Unary operators consume one operand.
# * Everything else consumes two operands; the first one popped is the
#   right-hand side.
def get_operator_node(operator, operand_stack)
  if operator.type == :dash && operand_stack.size < 2
    return get_non_range_dash_node(operator, operand_stack)
  end

  if unary_operator?(operator)
    return unary_operator_node(operator.type, operand_stack.pop)
  end

  type = operator.type
  right = operand_stack.pop
  left = operand_stack.pop
  binary_operator_node(type, right, left)
end
make_character_range(initial, final)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 107
#
# Creates a CharacterRange spanning from initial to final.
def make_character_range(initial, final)
  CharacterRange.new(initial, final)
end
make_token(type, value = nil)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 52
#
# Creates a tokenizer Token with the given type and (optional) value.
# The attributes are passed to the constructor as a single hash argument.
def make_token(type, value = nil)
  attributes = { type: type, value: value }
  TwitterCldr::Tokenizers::Token.new(attributes)
end
multichar_string(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 173
#
# Builds a UnicodeString from the code points of the token's value, with
# all curly braces removed first.
def multichar_string(token)
  unbraced = token.value.gsub(/[\{\}]/, "")
  UnicodeString.new(unbraced.unpack("U*"))
end
negated_character_set(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 155
#
# Builds a CharacterSet from the token's value after removing a leading
# "\p" or "\P" and every brace, square bracket, colon and caret.
def negated_character_set(token)
  without_prefix = token.value.gsub(/^\\[pP]/, "")
  set_name = without_prefix.gsub(/[\{\}\[\]:^]/, "")
  CharacterSet.new(set_name)
end
negated_token?(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 111
#
# True when the token's type is listed in NEGATED_TOKEN_TYPES.
# A nil/false token is returned as-is (i.e. falsy).
def negated_token?(token)
  return token unless token

  NEGATED_TOKEN_TYPES.include?(token.type)
end
peek(array)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 276
#
# Returns the element on top of a stack (the last element of the array)
# without removing it.
def peek(array)
  array.last
end
preprocess(tokens)
click to toggle source
Identifies regex ranges
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 60
#
# Rewrites the raw token list before parsing:
#
# * Three consecutive tokens of the form <rangeable> :dash <rangeable> are
#   collapsed into a single character range object.
# * A negated token is wrapped as: open bracket, negate, token, close
#   bracket.
# * Every other token is copied through unchanged.
#
# Returns a new array; the input is not modified.
def preprocess(tokens)
  processed = []
  index = 0

  while index < tokens.size
    first = tokens[index]
    middle = tokens[index + 1]
    last = tokens[index + 2]

    # middle is only inspected once last is known to exist
    if valid_ranged_character_class_token?(first) &&
       valid_ranged_character_class_token?(last) &&
       middle.type == :dash
      range_start = send(first.type, first)
      range_end = send(last.type, last)
      processed << make_character_range(range_start, range_end)
      index += 3
      next
    end

    if negated_token?(first)
      processed.push(
        make_token(:open_bracket),
        make_token(:negate),
        first,
        make_token(:close_bracket)
      )
    else
      processed << first
    end

    index += 1
  end

  processed
end
special_char(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 183
#
# Wraps the token's value in a Literal.
def special_char(token)
  text = token.value
  Literal.new(text)
end
string(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 167
#
# Builds a UnicodeString from the code points of the token's value.
def string(token)
  codepoints = token.value.unpack("U*")
  UnicodeString.new(codepoints)
end
substitute_variables(tokens, symbol_table)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 93
#
# Replaces :variable tokens with the token lists they stand for.
#
# Returns the tokens untouched when no symbol table is given. Otherwise each
# :variable token is looked up with symbol_table.fetch(token.value); when the
# lookup yields a truthy value, that replacement is expanded recursively
# (variables can themselves reference other variables) and spliced in.
# Tokens that are not variables, or whose lookup yields nil/false, are kept
# as they are.
#
# note: the expansions could be cached somehow
def substitute_variables(tokens, symbol_table)
  return tokens unless symbol_table

  tokens.each_with_object([]) do |token, expanded|
    replacement = token.type == :variable && symbol_table.fetch(token.value)

    if replacement
      expanded.concat(substitute_variables(replacement, symbol_table))
    else
      expanded << token
    end
  end
end
unary_operator?(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 123
#
# True when the token's type is listed in UNARY_OPERATORS.
# A nil/false token is returned as-is (i.e. falsy).
def unary_operator?(token)
  return token unless token

  UNARY_OPERATORS.include?(token.type)
end
unary_operator_node(operator, child)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 286
#
# Wraps a single operand in a CharacterClass::UnaryOperator node.
def unary_operator_node(operator, child)
  CharacterClass::UnaryOperator.new(operator, child)
end
unicode_char(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 161
#
# Builds a one-code-point UnicodeString from an escape token: a leading
# "\u" and any curly braces are removed from the value, and the remainder is
# read as a hexadecimal number.
def unicode_char(token)
  hex_digits = token.value.gsub(/^\\u/, "").gsub(/[\{\}]/, "")
  codepoint = hex_digits.to_i(16)
  UnicodeString.new([codepoint])
end
valid_character_class_token?(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 115
#
# True when the token's type is listed in CHARACTER_CLASS_TOKEN_TYPES.
# A nil/false token is returned as-is (i.e. falsy).
def valid_character_class_token?(token)
  return token unless token

  CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
end
valid_ranged_character_class_token?(token)
click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 119
#
# True when the token's type is listed in
# RANGED_CHARACTER_CLASS_TOKEN_TYPES, i.e. it may be used as an endpoint of
# a character range. A nil/false token is returned as-is (i.e. falsy).
def valid_ranged_character_class_token?(token)
  return token unless token

  RANGED_CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
end