class TwitterCldr::Collation::Collator

Collator uses fractional collation elements table form CLDR to generate sort keys for Unicode strings as well as compare and sort such strings by generated sort keys.

Attributes

locale[RW]

Public Class Methods

new(locale = nil) click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 18
def initialize(locale = nil)
  @locale  = TwitterCldr.convert_locale(locale) if locale
  @options = tailoring_options
  @trie    = load_trie
end

Private Class Methods

default_trie() click to toggle source

Loads and memoizes the default fractional collation elements trie.

# File lib/twitter_cldr/collation/collator.rb, line 154
def default_trie
  @default_trie ||= TwitterCldr::Collation::TrieLoader.load_default_trie.lock
end
load_tailored_trie(locale) click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 168
def load_tailored_trie(locale)
  TwitterCldr::Collation::TrieLoader.load_tailored_trie(locale, default_trie).lock
end
tailored_trie(locale) click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 158
def tailored_trie(locale)
  tailored_tries_cache[locale]
end
tailored_tries_cache() click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 164
def tailored_tries_cache
  @tailored_tries_cache ||= Hash.new { |hash, locale| hash[locale] = load_tailored_trie(locale) }
end

Public Instance Methods

compare(string_a, string_b) click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 33
def compare(string_a, string_b)
  string_a == string_b ? 0 : get_sort_key(string_a) <=> get_sort_key(string_b)
end
get_collation_elements(string_or_code_points) click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 43
def get_collation_elements(string_or_code_points)
  code_points = get_normalized_code_points(string_or_code_points)

  result = []
  result.concat(code_point_collation_elements(code_points)) until code_points.empty?
  result
end
get_sort_key(string_or_code_points, method_options = {}) click to toggle source

Second arg options, supports an option :maximum_level, to pass on to SortKeyBuilder :maximum_level.

# File lib/twitter_cldr/collation/collator.rb, line 39
def get_sort_key(string_or_code_points, method_options = {})
  TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points), case_first: @options[:case_first], maximum_level: method_options[:maximum_level])
end
sort(strings) click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 24
def sort(strings)
  strings.map{ |s| [s, get_sort_key(s)] }.sort{ |a, b| a[1] <=> b[1] }.map(&:first)
end
sort!(strings) click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 28
def sort!(strings)
  sort_keys = Hash.new { |hash, string| hash[string] = get_sort_key(string) }
  strings.replace(strings.sort_by { |s| sort_keys[s] })
end

Private Instance Methods

code_point_collation_elements(code_points) click to toggle source

Returns the first sequence of fractional collation elements for an array of integer code points. Returned value is an array of well formed (including weights for all significant levels) integer arrays.

NOTE (side-effect): all used code points are removed from the input array.

# File lib/twitter_cldr/collation/collator.rb, line 84
def code_point_collation_elements(code_points)
  explicit_collation_elements(code_points) || implicit_collation_elements(code_points)
end
explicit_collation_elements(code_points) click to toggle source

Tries to build explicit collation elements array for the longest code points prefix in the given sequence. When possible, combines this prefix with (not necessarily subsequent) non-starters that follow it in the sequence. That's necessary because canonical ordering (that is performed during normalization) can break contractions that existed in the original, de-normalized string.

NOTE (side-effect): all used code points are removed from the input array.

For more information see section '4.2 Produce Array' of the main algorithm at www.unicode.org/reports/tr10/#Main_Algorithm

# File lib/twitter_cldr/collation/collator.rb, line 97
def explicit_collation_elements(code_points)
  # find the longest prefix in the trie
  collation_elements, prefix_size, suffixes = @trie.find_prefix(code_points)

  return unless collation_elements

  # remove prefix from the code points sequence
  code_points.shift(prefix_size)

  non_starter_pos = 0

  used_combining_classes = {}

  while non_starter_pos < code_points.size && !suffixes.empty?
    # get next code point (possibly non-starter)
    non_starter_code_point = code_points[non_starter_pos]
    non_starter_metadata   = TwitterCldr::Shared::CodePoint.get(non_starter_code_point)

    unless non_starter_metadata
      raise UnexpectedCodePointError, "'#{non_starter_code_point}' does not appear to be a valid Unicode code point"
    end

    combining_class = non_starter_metadata.combining_class.to_i

    # code point is a starter or combining class has been already used (non-starter is 'blocked' from the prefix)
    break if combining_class == 0 || used_combining_classes[combining_class]

    used_combining_classes[combining_class] = true

    # Try to find collation elements for [prefix + non-starter] code points sequence. As the subtrie contains
    # suffixes (without prefix) we pass only non-starter itself.
    new_collation_elements, _, new_suffixes = suffixes.find_prefix([non_starter_code_point])

    if new_collation_elements
      # non-starter with a collation elements sequence corresponding to [prefix + non-starter] accepted
      collation_elements = new_collation_elements
      suffixes           = new_suffixes

      # Remove non-starter from its position in the sequence. Then we can move further from the same position.
      code_points.delete_at(non_starter_pos)
    else
      # move to the next code point
      non_starter_pos += 1
    end
  end

  collation_elements
end
get_normalized_code_points(str_or_code_points) click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 61
def get_normalized_code_points(str_or_code_points)
  str = if str_or_code_points.is_a?(String)
    str_or_code_points
  else
    TwitterCldr::Utils::CodePoints.to_string(str_or_code_points)
  end

  # Normalization makes the collation process significantly slower (like seven times slower on the UCA
  # non-ignorable test from CollationTest_NON_IGNORABLE.txt). ICU uses some optimizations to apply normalization
  # only in special, rare cases. We need to investigate possible solutions and do normalization cleverly too.
  TwitterCldr::Utils::CodePoints.from_string(
    TwitterCldr::Normalization.normalize(str)
  )
rescue ArgumentError
  # Meant to rescue encoding errors. Some tests do not provide valid UTF-8 sequences.
  TwitterCldr::Utils::CodePoints.from_string(str)
end
implicit_collation_elements(code_points) click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 146
def implicit_collation_elements(code_points)
  TwitterCldr::Collation::ImplicitCollationElements.for_code_point(code_points.shift)
end
load_trie() click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 57
def load_trie
  @locale ? self.class.tailored_trie(@locale) : self.class.default_trie
end
tailoring_options() click to toggle source
# File lib/twitter_cldr/collation/collator.rb, line 53
def tailoring_options
  @locale ? TwitterCldr::Collation::TrieBuilder.tailoring_data(@locale)[:collator_options] : {}
end