class TwitterCldr::Parsers::UnicodeRegexParser

Constants

BINARY_OPERATORS
CHARACTER_CLASS_TOKEN_TYPES
NEGATED_TOKEN_TYPES
RANGED_CHARACTER_CLASS_TOKEN_TYPES

Types that are allowed to be used in character ranges.

UNARY_OPERATORS

Public Instance Methods

parse(tokens, options = {}) click to toggle source
Calls superclass method TwitterCldr::Parsers::Parser#parse
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 20
# Parses a list of tokens.
#
# Variable tokens are first expanded via options[:symbol_table] (skipped
# when no table is given), the expanded list is run through #preprocess,
# and the result is passed to the superclass implementation together with
# the original options.
def parse(tokens, options = {})
  expanded = substitute_variables(tokens, options[:symbol_table])
  prepared = preprocess(expanded)
  super(prepared, options)
end

Private Instance Methods

add_implicit_union(operator_stack, open_count) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 268
# Pushes a :union operator onto operator_stack when the token after the
# current one may appear in a character class and at least one bracket is
# still open (open_count > 0). Does nothing when there is no next token.
def add_implicit_union(operator_stack, open_count)
  upcoming = @tokens[@token_index + 1]
  return unless upcoming
  return unless valid_character_class_token?(upcoming) && open_count > 0

  operator_stack.push(make_token(:union))
end
ampersand(token)
Alias for: special_char
binary_operator?(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 127
# Tells whether token's type is listed in BINARY_OPERATORS.
#
# A missing token (nil/false) is returned as-is, so the result is falsy.
def binary_operator?(token)
  return token unless token

  BINARY_OPERATORS.include?(token.type)
end
binary_operator_node(operator, right, left) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 280
# Builds a CharacterClass::BinaryOperator node.
#
# Note the parameter order: operands arrive as (right, left) -- the order
# in which they come off the operand stack -- and are handed to the node
# constructor as (left, right).
def binary_operator_node(operator, right, left)
  node_class = CharacterClass::BinaryOperator
  node_class.new(operator, left, right)
end
build_until_open(operator_stack, operand_stack) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 229
# Unwinds the operator stack down to the opening token that matches the
# closing token currently being processed.
#
# Every operator popped on the way is converted into a node by
# #get_operator_node (which takes its operands from operand_stack) and the
# node is pushed back as an operand. Finally the opening token itself is
# removed from the operator stack.
def build_until_open(operator_stack, operand_stack)
  opening_type = CharacterClass.opening_type_for(current_token.type)

  until peek(operator_stack).type == opening_type
    pending = operator_stack.pop
    operand_stack.push(get_operator_node(pending, operand_stack))
  end

  operator_stack.pop
end
character_class() click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 196
# Parses one character class, starting at the current token, using two
# stacks: one for pending operators/opening tokens and one for operands.
#
# Tokens are consumed until the operator stack is empty and every opened
# group has been closed. Returns a CharacterClass wrapping the operand left
# on top of the operand stack.
def character_class
  operator_stack = []
  operand_stack = []
  # Number of groups opened but not yet closed.
  open_count = 0

  loop do
    case current_token.type
      when *CharacterClass.closing_types
        # Closing token: fold everything back to the matching opener, then
        # add a union if another class token follows inside an open group.
        open_count -= 1
        build_until_open(operator_stack, operand_stack)
        add_implicit_union(operator_stack, open_count)

      when *CharacterClass.opening_types
        # Opening token: remember it so build_until_open can stop at it.
        open_count += 1
        operator_stack.push(current_token)

      when *(BINARY_OPERATORS + UNARY_OPERATORS)
        operator_stack.push(current_token)

      else
        # Operand: dispatch to the method named after the token type
        # (e.g. string, character_set) to build its node.
        add_implicit_union(operator_stack, open_count)
        operand_stack.push(
          send(current_token.type, current_token)
        )
    end

    next_token(current_token.type)
    # Finished once nothing is pending and all groups are closed.
    break if operator_stack.empty? && open_count == 0
  end

  CharacterClass.new(operand_stack.pop)
end
character_range(token) click to toggle source

The token is already a CharacterRange object (ranges are built during preprocessing), so it is returned unchanged.

# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 192
# Handler for tokens of type :character_range.
#
# The token is returned unchanged; no conversion is performed here.
def character_range(token)
  token
end
character_set(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 149
# Builds a CharacterSet from a character-set token.
#
# The token text is reduced to the bare set name by dropping a leading
# "\p" and every brace, square bracket and colon, e.g. "\p{Lu}" and
# "[:Lu:]" both become "Lu".
def character_set(token)
  without_prefix = token.value.gsub(/^\\p/, "")
  set_name = without_prefix.gsub(/[\{\}\[\]:]/, "")
  CharacterSet.new(set_name)
end
do_parse(options) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 131
# Walks the token stream and collects the parsed elements into an Array.
#
# * :open_bracket starts a character class (parsed by #character_class,
#   which advances the stream itself);
# * :union tokens outside a class are skipped;
# * any other token is converted by the method named after its type and
#   then consumed.
#
# The options argument is not used by this implementation.
def do_parse(options)
  elements = []

  while current_token
    type = current_token.type

    if type == :open_bracket
      elements << character_class
    elsif type == :union
      next_token(:union)
    else
      elements << send(type, current_token)
      next_token(current_token.type)
    end
  end

  elements
end
escaped_character(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 179
# Wraps the text of an escaped-character token in a Literal.
def escaped_character(token)
  text = token.value
  Literal.new(text)
end
get_non_range_dash_node(operator, operand_stack) click to toggle source

Most regular expression engines allow character classes to contain a literal hyphen character as the first character. For example, [-abc] is a legal expression: it denotes a character class that contains the characters '-', 'a', 'b', and 'c'. In Ruby, /[-abc]*/ =~ '-ba' returns 0, i.e. the match starts at index 0.

# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 262
# Treats a dash as a literal '-' character instead of an operator.
#
# Pops one operand and unions it with a string node for '-'; the literal
# dash is passed as the left operand and the popped operand as the right.
# The operator argument is accepted but not used.
def get_non_range_dash_node(operator, operand_stack)
  operand = operand_stack.pop
  literal_dash = string(make_token(:string, '-'))
  binary_operator_node(:union, operand, literal_dash)
end
get_operator_node(operator, operand_stack) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 243
# Turns an operator token into a node, taking operands off operand_stack.
#
# * a :dash with fewer than two operands available is a literal dash;
# * a unary operator consumes one operand;
# * anything else consumes two operands -- the first one popped is the
#   right-hand side, the second the left-hand side.
def get_operator_node(operator, operand_stack)
  if operator.type == :dash && operand_stack.size < 2
    get_non_range_dash_node(operator, operand_stack)
  elsif unary_operator?(operator)
    unary_operator_node(operator.type, operand_stack.pop)
  else
    right = operand_stack.pop
    left = operand_stack.pop
    binary_operator_node(operator.type, right, left)
  end
end
make_character_range(initial, final) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 107
# Creates a CharacterRange spanning from initial to final.
def make_character_range(initial, final)
  bounds = [initial, final]
  CharacterRange.new(*bounds)
end
make_token(type, value = nil) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 52
# Creates a tokenizer Token with the given type and optional value
# (value defaults to nil). The attributes are passed as a single Hash.
def make_token(type, value = nil)
  attributes = { type: type, value: value }
  TwitterCldr::Tokenizers::Token.new(attributes)
end
multichar_string(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 173
# Builds a UnicodeString from a multi-character string token.
#
# Curly braces are removed from the token text and the remainder is
# decoded into an array of UTF-8 code points.
def multichar_string(token)
  text = token.value.gsub(/[\{\}]/, "")
  UnicodeString.new(text.unpack("U*"))
end
negate(token)
Alias for: special_char
negated_character_set(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 155
# Builds a CharacterSet from a negated character-set token.
#
# A leading "\p" or "\P" is dropped, then every brace, square bracket,
# colon and caret is removed, e.g. "\P{Lu}" and "[:^Lu:]" both become "Lu".
# Only the set name is kept here; this method does not record the negation.
def negated_character_set(token)
  without_prefix = token.value.gsub(/^\\[pP]/, "")
  set_name = without_prefix.gsub(/[\{\}\[\]:^]/, "")
  CharacterSet.new(set_name)
end
negated_token?(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 111
# Tells whether token's type is listed in NEGATED_TOKEN_TYPES.
#
# A missing token (nil/false) is returned as-is, so the result is falsy.
def negated_token?(token)
  return token unless token

  NEGATED_TOKEN_TYPES.include?(token.type)
end
peek(array) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 276
# Returns the top (last) element of a stack without removing it;
# nil for an empty Array.
def peek(array)
  top = array.last
  top
end
pipe(token)
Alias for: special_char
preprocess(tokens) click to toggle source

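Identifies regex ranges (two rangeable tokens separated by a dash, e.g. a-z) and replaces each with a single character range object. Negated tokens are additionally wrapped in an explicit bracketed negation.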

# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 60
# Rewrites the token list before parsing, in one left-to-right pass:
#
# * <rangeable token> :dash <rangeable token> is collapsed into a single
#   character range built by #make_character_range from the two converted
#   end points;
# * a token accepted by #negated_token? is replaced by the sequence
#   :open_bracket, :negate, <token>, :close_bracket;
# * every other token is copied through unchanged.
#
# tokens - indexable token list (responds to #size and #[]).
#
# Returns a new Array; the given list is only read.
def preprocess(tokens)
  result = []
  i = 0

  while i < tokens.size
    first, dash, last = tokens[i], tokens[i + 1], tokens[i + 2]

    is_range = valid_ranged_character_class_token?(first) &&
      valid_ranged_character_class_token?(last) &&
      dash.type == :dash

    if is_range
      # Each end point is converted by the method named after its type.
      initial = send(first.type, first)
      final = send(last.type, last)
      result << make_character_range(initial, final)
      i += 3 # start, dash and end were all consumed
    elsif negated_token?(first)
      # Append in place; `result += [...]` would copy the whole array for
      # every negated token.
      result.push(
        make_token(:open_bracket),
        make_token(:negate),
        first,
        make_token(:close_bracket)
      )
      i += 1
    else
      result << first
      i += 1
    end
  end

  result
end
special_char(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 183
# Wraps the text of a special-character token in a Literal.
def special_char(token)
  text = token.value
  Literal.new(text)
end
Also aliased as: negate, pipe, ampersand
string(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 167
# Builds a UnicodeString from a string token by decoding the token text
# into an array of UTF-8 code points.
def string(token)
  code_points = token.value.unpack("U*")
  UnicodeString.new(code_points)
end
substitute_variables(tokens, symbol_table) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 93
# Expands :variable tokens using the given symbol table.
#
# tokens       - enumerable of tokens responding to #type and #value.
# symbol_table - object whose #fetch(name) yields the replacement token
#                list for a variable; nil/false disables substitution.
#
# Replacement lists are expanded recursively, because a variable's
# definition can itself contain references to other variables.
#
# Returns tokens itself when no symbol table is given, otherwise a new flat
# Array of tokens.
def substitute_variables(tokens, symbol_table)
  return tokens unless symbol_table

  tokens.each_with_object([]) do |token, expanded|
    replacement = symbol_table.fetch(token.value) if token.type == :variable

    if replacement
      # Appended in place rather than rebuilding the accumulator with +=.
      # note: this could be cached somehow
      expanded.concat(substitute_variables(replacement, symbol_table))
    else
      expanded << token
    end
  end
end
unary_operator?(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 123
# Tells whether token's type is listed in UNARY_OPERATORS.
#
# A missing token (nil/false) is returned as-is, so the result is falsy.
def unary_operator?(token)
  return token unless token

  UNARY_OPERATORS.include?(token.type)
end
unary_operator_node(operator, child) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 286
# Builds a CharacterClass::UnaryOperator node applying operator to child.
def unary_operator_node(operator, child)
  node_class = CharacterClass::UnaryOperator
  node_class.new(operator, child)
end
unicode_char(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 161
# Builds a one-code-point UnicodeString from a unicode escape token.
#
# A leading "\u" and any curly braces are removed and the remaining text is
# read as a hexadecimal number, e.g. "\u0041" and "\u{41}" both give 65.
def unicode_char(token)
  hex_digits = token.value.gsub(/^\\u/, "").gsub(/[\{\}]/, "")
  code_point = hex_digits.to_i(16)
  UnicodeString.new([code_point])
end
valid_character_class_token?(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 115
# Tells whether token's type is listed in CHARACTER_CLASS_TOKEN_TYPES.
#
# A missing token (nil/false) is returned as-is, so the result is falsy.
def valid_character_class_token?(token)
  return token unless token

  CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
end
valid_ranged_character_class_token?(token) click to toggle source
# File lib/twitter_cldr/parsers/unicode_regex_parser.rb, line 119
# Tells whether token's type is listed in
# RANGED_CHARACTER_CLASS_TOKEN_TYPES, i.e. may be an end point of a range.
#
# A missing token (nil/false) is returned as-is, so the result is falsy.
def valid_ranged_character_class_token?(token)
  return token unless token

  RANGED_CHARACTER_CLASS_TOKEN_TYPES.include?(token.type)
end