class TwitterCldr::Collation::Collator
Collator
uses fractional collation elements table form CLDR to generate sort keys for Unicode strings as well as compare and sort such strings by generated sort keys.
Attributes
Public Class Methods
# File lib/twitter_cldr/collation/collator.rb, line 18 def initialize(locale = nil) @locale = TwitterCldr.convert_locale(locale) if locale @options = tailoring_options @trie = load_trie end
Private Class Methods
Loads and memoizes the default fractional collation elements trie.
# File lib/twitter_cldr/collation/collator.rb, line 154 def default_trie @default_trie ||= TwitterCldr::Collation::TrieLoader.load_default_trie.lock end
# File lib/twitter_cldr/collation/collator.rb, line 168 def load_tailored_trie(locale) TwitterCldr::Collation::TrieLoader.load_tailored_trie(locale, default_trie).lock end
# File lib/twitter_cldr/collation/collator.rb, line 158 def tailored_trie(locale) tailored_tries_cache[locale] end
# File lib/twitter_cldr/collation/collator.rb, line 164 def tailored_tries_cache @tailored_tries_cache ||= Hash.new { |hash, locale| hash[locale] = load_tailored_trie(locale) } end
Public Instance Methods
# File lib/twitter_cldr/collation/collator.rb, line 33 def compare(string_a, string_b) string_a == string_b ? 0 : get_sort_key(string_a) <=> get_sort_key(string_b) end
# File lib/twitter_cldr/collation/collator.rb, line 43 def get_collation_elements(string_or_code_points) code_points = get_normalized_code_points(string_or_code_points) result = [] result.concat(code_point_collation_elements(code_points)) until code_points.empty? result end
Second arg options, supports an option :maximum_level, to pass on to SortKeyBuilder
:maximum_level.
# File lib/twitter_cldr/collation/collator.rb, line 39 def get_sort_key(string_or_code_points, method_options = {}) TwitterCldr::Collation::SortKeyBuilder.build(get_collation_elements(string_or_code_points), case_first: @options[:case_first], maximum_level: method_options[:maximum_level]) end
# File lib/twitter_cldr/collation/collator.rb, line 24 def sort(strings) strings.map{ |s| [s, get_sort_key(s)] }.sort{ |a, b| a[1] <=> b[1] }.map(&:first) end
# File lib/twitter_cldr/collation/collator.rb, line 28 def sort!(strings) sort_keys = Hash.new { |hash, string| hash[string] = get_sort_key(string) } strings.replace(strings.sort_by { |s| sort_keys[s] }) end
Private Instance Methods
Returns the first sequence of fractional collation elements for an array of integer code points. Returned value is an array of well formed (including weights for all significant levels) integer arrays.
NOTE (side-effect): all used code points are removed from the input array.
# File lib/twitter_cldr/collation/collator.rb, line 84 def code_point_collation_elements(code_points) explicit_collation_elements(code_points) || implicit_collation_elements(code_points) end
Tries to build explicit collation elements array for the longest code points prefix in the given sequence. When possible, combines this prefix with (not necessarily subsequent) non-starters that follow it in the sequence. That's necessary because canonical ordering (that is performed during normalization) can break contractions that existed in the original, de-normalized string.
NOTE (side-effect): all used code points are removed from the input array.
For more information see section '4.2 Produce Array' of the main algorithm at www.unicode.org/reports/tr10/#Main_Algorithm
# File lib/twitter_cldr/collation/collator.rb, line 97 def explicit_collation_elements(code_points) # find the longest prefix in the trie collation_elements, prefix_size, suffixes = @trie.find_prefix(code_points) return unless collation_elements # remove prefix from the code points sequence code_points.shift(prefix_size) non_starter_pos = 0 used_combining_classes = {} while non_starter_pos < code_points.size && !suffixes.empty? # get next code point (possibly non-starter) non_starter_code_point = code_points[non_starter_pos] non_starter_metadata = TwitterCldr::Shared::CodePoint.get(non_starter_code_point) unless non_starter_metadata raise UnexpectedCodePointError, "'#{non_starter_code_point}' does not appear to be a valid Unicode code point" end combining_class = non_starter_metadata.combining_class.to_i # code point is a starter or combining class has been already used (non-starter is 'blocked' from the prefix) break if combining_class == 0 || used_combining_classes[combining_class] used_combining_classes[combining_class] = true # Try to find collation elements for [prefix + non-starter] code points sequence. As the subtrie contains # suffixes (without prefix) we pass only non-starter itself. new_collation_elements, _, new_suffixes = suffixes.find_prefix([non_starter_code_point]) if new_collation_elements # non-starter with a collation elements sequence corresponding to [prefix + non-starter] accepted collation_elements = new_collation_elements suffixes = new_suffixes # Remove non-starter from its position in the sequence. Then we can move further from the same position. code_points.delete_at(non_starter_pos) else # move to the next code point non_starter_pos += 1 end end collation_elements end
# File lib/twitter_cldr/collation/collator.rb, line 61 def get_normalized_code_points(str_or_code_points) str = if str_or_code_points.is_a?(String) str_or_code_points else TwitterCldr::Utils::CodePoints.to_string(str_or_code_points) end # Normalization makes the collation process significantly slower (like seven times slower on the UCA # non-ignorable test from CollationTest_NON_IGNORABLE.txt). ICU uses some optimizations to apply normalization # only in special, rare cases. We need to investigate possible solutions and do normalization cleverly too. TwitterCldr::Utils::CodePoints.from_string( TwitterCldr::Normalization.normalize(str) ) rescue ArgumentError # Meant to rescue encoding errors. Some tests do not provide valid UTF-8 sequences. TwitterCldr::Utils::CodePoints.from_string(str) end
# File lib/twitter_cldr/collation/collator.rb, line 146 def implicit_collation_elements(code_points) TwitterCldr::Collation::ImplicitCollationElements.for_code_point(code_points.shift) end
# File lib/twitter_cldr/collation/collator.rb, line 57 def load_trie @locale ? self.class.tailored_trie(@locale) : self.class.default_trie end
# File lib/twitter_cldr/collation/collator.rb, line 53 def tailoring_options @locale ? TwitterCldr::Collation::TrieBuilder.tailoring_data(@locale)[:collator_options] : {} end