class TwitterCldr::Collation::SortKeyBuilder
SortKeyBuilder
builds a collation sort key from an array of collation elements.
The weight compression algorithms used for each level are described in the ICU collation design document: source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
Constants
- CASE_BITS_MASK
- CASE_SWITCH
- KEEP_CASE_MASK
- LEVEL_SEPARATOR
- MAX_REGULAR_PRIMARY
- MIN_NON_LATIN_PRIMARY
- NO_CASE_SWITCH
- PRIMARY_BYTE_MAX
- PRIMARY_BYTE_MIN
Primary level compression constants
- REMOVE_CASE_MASK
Tertiary level compression constants
- SECONDARY_BOTTOM
Secondary level compression constants
- SECONDARY_BOTTOM_COUNT
- SECONDARY_COMMON
- SECONDARY_COMMON_SPACE
- SECONDARY_PROPORTION
- SECONDARY_TOP
- SECONDARY_TOP_COUNT
- SECONDARY_TOTAL_COUNT
- TERTIARY_ADDITION_CASE_FIRST
- TERTIARY_ADDITION_NORMAL
- TERTIARY_BOTTOM_LOWER_FIRST
Lower first
- TERTIARY_BOTTOM_NORMAL
Normal (case-first disabled)
- TERTIARY_BOTTOM_UPPER_FIRST
Upper first
- TERTIARY_COMMON_LOWER_FIRST
- TERTIARY_COMMON_NORMAL
- TERTIARY_COMMON_UPPER_FIRST
- TERTIARY_LEVEL
- TERTIARY_PROPORTION
- TERTIARY_TOP_LOWER_FIRST
- TERTIARY_TOP_NORMAL
- TERTIARY_TOP_UPPER_FIRST
- VALID_CASE_FIRST_OPTIONS
- VALID_MAXIMUM_LEVEL_OPTIONS
Attributes
case_first[R]
collation_elements[R]
Public Class Methods
build(collation_elements, options = nil)
click to toggle source
Returns a sort key as an array of bytes.
Arguments:
collation_elements - an array of collation elements, represented as arrays of integer weights.
options - hash of options:
case_first - optional case-first sorting order setting: :upper, :lower, or nil (discard case bits).
maximum_level - optional; only append weights up to the specified maximum level (1 or 2). This can be useful for searching/matching applications.
An instance of the class is created only to avoid passing @collation_elements and @bytes_array from one method to another while forming the sort key.
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 37
#
# Builds a sort key for the given collation elements and returns it as an
# array of bytes.
#
# collation_elements - forwarded unchanged to `new`.
# options            - options hash forwarded to `new`. When it is omitted
#                      (or nil) an empty hash is forwarded instead, because
#                      `new` raises ArgumentError for a non-Hash second
#                      argument and the default here is nil.
#
# Returns the `bytes_array` of the newly created instance.
def self.build(collation_elements, options = nil)
  new(collation_elements, options.nil? ? {} : options).bytes_array
end
new(collation_elements, options = {})
click to toggle source
Arguments:
collation_elements - an array of collation elements, represented as arrays of integer weights. options - hash of options: case_first - optional case-first sorting order setting: :upper, :lower, nil (discard case bits). maximum_level - only append weights to maximum level specified (1 or 2), can be useful for searching/matching applications
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 48
#
# Validates the options and stores the state needed to build a sort key.
#
# collation_elements - an array of collation elements, represented as arrays
#                      of integer weights.
# options            - hash of options:
#                      :case_first    - must be included in VALID_CASE_FIRST_OPTIONS.
#                      :maximum_level - must be included in VALID_MAXIMUM_LEVEL_OPTIONS.
#
# Raises ArgumentError when options is not a Hash or when either option holds
# a value outside its list of valid values.
def initialize(collation_elements, options = {})
  unless options.kind_of?(Hash)
    raise ArgumentError, "second argument should be an options hash, not `#{options}`. Do you mean `:case_first => #{options}`?"
  end

  case_first = options[:case_first]
  unless VALID_CASE_FIRST_OPTIONS.include?(case_first)
    raise ArgumentError, "invalid case-first options '#{case_first.inspect}'"
  end

  maximum_level = options[:maximum_level]
  unless VALID_MAXIMUM_LEVEL_OPTIONS.include?(maximum_level)
    # Report the rejected value itself (same format as the case-first message).
    raise ArgumentError, "invalid maximum_level option '#{maximum_level.inspect}'"
  end

  @collation_elements = collation_elements
  @case_first         = case_first
  @maximum_level      = maximum_level

  init_tertiary_constants
end
Public Instance Methods
bytes_array()
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 64
#
# Returns the sort key bytes. The array is built on first access via
# build_bytes_array and the cached value is returned afterwards.
def bytes_array
  return @bytes_array if @bytes_array

  @bytes_array = build_bytes_array
end
Private Instance Methods
append_common_bytes(boundary, count_limit, top)
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 176
#
# Flushes the pending run of @common_count common weights into @bytes_array
# as compressed bytes, then resets @common_count to 0.
#
# boundary    - byte value the compressed bytes are offset from.
# count_limit - largest run length encoded by a single "full" byte.
# top         - when truthy, offsets are subtracted from boundary;
#               otherwise they are added.
def append_common_bytes(boundary, count_limit, top)
  direction  = top ? -1 : 1
  full_chunk = boundary + direction * count_limit

  # Emit one full-chunk byte for every count_limit weights beyond the last chunk.
  until @common_count <= count_limit
    @bytes_array.push(full_chunk)
    @common_count -= count_limit
  end

  # The remainder is encoded as an offset of (count - 1) from the boundary.
  @bytes_array.push(boundary + direction * (@common_count - 1))
  @common_count = 0
end
append_primary_bytes()
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 80
#
# Appends the primary-level bytes of every collation element to @bytes_array.
#
# Consecutive weights that share a compressible leading byte emit that byte
# only once. When the leading byte changes while such a run is active, a
# PRIMARY_BYTE_MIN or PRIMARY_BYTE_MAX marker is written first, depending on
# whether the new leading byte is lower or higher than the previous one.
def append_primary_bytes
  @last_leading_byte = nil

  @collation_elements.each do |element|
    weight_bytes = integer_to_bytes_array(level_weight(element, PRIMARY_LEVEL))
    next if weight_bytes.empty?

    lead, *trailing = weight_bytes

    unless lead == @last_leading_byte
      if @last_leading_byte
        marker = lead < @last_leading_byte ? PRIMARY_BYTE_MIN : PRIMARY_BYTE_MAX
        @bytes_array.push(marker)
      end

      @bytes_array.push(lead)

      # Only multi-byte weights with a compressible lead start (or continue) a run.
      @last_leading_byte = !trailing.empty? && compressible_primary?(lead) ? lead : nil
    end

    @bytes_array.concat(trailing)
  end
end
append_secondary_byte(secondary)
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 142
#
# Handles a single secondary byte: a byte equal to SECONDARY_COMMON only
# increments the pending common counter; any other byte is appended via
# append_with_common_bytes using SECONDARY_COMMON_SPACE.
def append_secondary_byte(secondary)
  return @common_count += 1 if secondary == SECONDARY_COMMON

  append_with_common_bytes(secondary, SECONDARY_COMMON_SPACE)
end
append_secondary_bytes()
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 105
#
# Appends LEVEL_SEPARATOR followed by the secondary-level bytes of every
# collation element, feeding each byte through append_secondary_byte.
def append_secondary_bytes
  @bytes_array.push(LEVEL_SEPARATOR)
  @common_count = 0

  @collation_elements.each do |element|
    weight = level_weight(element, SECONDARY_LEVEL)
    integer_to_bytes_array(weight).each { |byte| append_secondary_byte(byte) }
  end

  # append compressed trailing common bytes
  return unless @common_count > 0

  append_common_bytes(SECONDARY_BOTTOM, SECONDARY_BOTTOM_COUNT, false)
end
append_tertiary_byte(tertiary)
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 150
#
# Handles a single tertiary byte: a byte equal to @tertiary_common only
# increments the pending common counter. Any other byte is shifted by
# @tertiary_addition when it falls on the relevant side of the common value,
# then appended via append_with_common_bytes.
def append_tertiary_byte(tertiary)
  return @common_count += 1 if tertiary == @tertiary_common

  adjusted =
    if @tertiary_common == TERTIARY_COMMON_NORMAL && @tertiary_common < tertiary
      tertiary + @tertiary_addition
    elsif @tertiary_common == TERTIARY_COMMON_UPPER_FIRST && tertiary <= @tertiary_common
      tertiary - @tertiary_addition
    else
      tertiary
    end

  append_with_common_bytes(adjusted, @tertiary_common_space)
end
append_tertiary_bytes()
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 120
#
# Appends LEVEL_SEPARATOR followed by the tertiary-level bytes of every
# collation element, feeding each byte through append_tertiary_byte.
def append_tertiary_bytes
  @bytes_array.push(LEVEL_SEPARATOR)
  @common_count = 0

  @collation_elements.each do |element|
    integer_to_bytes_array(tertiary_weight(element)).each { |byte| append_tertiary_byte(byte) }
  end

  # append compressed trailing common bytes
  return unless @common_count > 0

  if @tertiary_common == TERTIARY_BOTTOM_NORMAL
    return append_common_bytes(@tertiary_bottom, @tertiary_bottom_count, false)
  end

  append_common_bytes(@tertiary_top, @tertiary_top_count, true)
  # make @bytes_array[-1] = boundary - @common_count (for compatibility with ICU)
  @bytes_array[-1] = @bytes_array[-1] - 1
end
append_with_common_bytes(byte, options)
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 164
#
# Appends byte to @bytes_array, first flushing any pending run of common
# weights.
#
# byte    - the weight byte to append.
# options - hash with :common, :bottom, :bottom_count, :top and :top_count.
#           A byte below options[:common] flushes the pending run against the
#           bottom boundary; any other byte flushes it against the top one.
#
# Returns @bytes_array.
def append_with_common_bytes(byte, options)
  if @common_count > 0
    below_common = byte < options[:common]
    boundary_key, limit_key = below_common ? [:bottom, :bottom_count] : [:top, :top_count]

    append_common_bytes(options[boundary_key], options[limit_key], !below_common)
  end

  @bytes_array.push(byte)
end
build_bytes_array()
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 70
#
# Builds the sort key from scratch: primary bytes always, secondary bytes
# unless @maximum_level is below 2, tertiary bytes unless @maximum_level is
# below 3. An unset @maximum_level means all three levels are appended.
#
# Returns the resulting @bytes_array.
def build_bytes_array
  @bytes_array = []

  append_primary_bytes
  append_secondary_bytes if !@maximum_level || @maximum_level >= 2
  append_tertiary_bytes if !@maximum_level || @maximum_level >= 3

  @bytes_array
end
compressible_primary?(leading_byte)
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 101
#
# Returns true when leading_byte lies between MIN_NON_LATIN_PRIMARY and
# MAX_REGULAR_PRIMARY (both inclusive), false otherwise.
def compressible_primary?(leading_byte)
  compressible_range = MIN_NON_LATIN_PRIMARY..MAX_REGULAR_PRIMARY
  compressible_range.cover?(leading_byte)
end
continuation?(weight)
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 213
#
# Returns true when every bit of CASE_BITS_MASK is set in weight.
def continuation?(weight)
  case_bits = weight & CASE_BITS_MASK
  case_bits == CASE_BITS_MASK
end
init_tertiary_constants()
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 221
#
# Derives the tertiary-level settings (@case_switch, @tertiary_mask,
# @tertiary_addition, @tertiary_common, @tertiary_top, @tertiary_bottom,
# the top/bottom counts and @tertiary_common_space) from @case_first.
def init_tertiary_constants
  upper_first = @case_first == :upper
  @case_switch = upper_first ? CASE_SWITCH : NO_CASE_SWITCH

  @tertiary_mask, @tertiary_addition, @tertiary_common, @tertiary_top, @tertiary_bottom =
    if upper_first
      [KEEP_CASE_MASK, TERTIARY_ADDITION_CASE_FIRST,
       TERTIARY_COMMON_UPPER_FIRST, TERTIARY_TOP_UPPER_FIRST, TERTIARY_BOTTOM_UPPER_FIRST]
    elsif @case_first # @case_first == :lower
      [KEEP_CASE_MASK, TERTIARY_ADDITION_CASE_FIRST,
       TERTIARY_COMMON_NORMAL, TERTIARY_TOP_LOWER_FIRST, TERTIARY_BOTTOM_LOWER_FIRST]
    else
      [REMOVE_CASE_MASK, TERTIARY_ADDITION_NORMAL,
       TERTIARY_COMMON_NORMAL, TERTIARY_TOP_NORMAL, TERTIARY_BOTTOM_NORMAL]
    end

  # Split the byte values strictly between bottom and top into a top share
  # (TERTIARY_PROPORTION of them, rounded down) and a bottom share (the rest).
  available = @tertiary_top - @tertiary_bottom - 1
  @tertiary_top_count    = (TERTIARY_PROPORTION * available).to_i
  @tertiary_bottom_count = available - @tertiary_top_count

  @tertiary_common_space = {
    common:       @tertiary_common,
    bottom:       @tertiary_bottom,
    bottom_count: @tertiary_bottom_count,
    top:          @tertiary_top,
    top_count:    @tertiary_top_count
  }
end
integer_to_bytes_array(number)
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 202
#
# Splits a positive integer into its bytes, most significant byte first.
# Returns an empty array for zero or a negative number.
def integer_to_bytes_array(number)
  low_to_high = []
  remaining = number

  until remaining <= 0
    low_to_high.push(remaining & 0xFF)
    remaining >>= 8
  end

  low_to_high.reverse
end
level_weight(collation_element, level)
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 198
#
# Returns the weight stored at index `level` of the collation element, or 0
# when that entry is missing (nil).
def level_weight(collation_element, level)
  weight = collation_element[level]
  weight ? weight : 0
end
remove_continuation_bits(weight)
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 217
#
# Returns weight with only the bits of REMOVE_CASE_MASK kept.
def remove_continuation_bits(weight)
  kept_bits = REMOVE_CASE_MASK
  weight & kept_bits
end
tertiary_weight(collation_element)
click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 188
#
# Returns the tertiary weight of a collation element. Continuation weights
# go through remove_continuation_bits; all other weights are masked with
# @tertiary_mask and then XOR-ed with @case_switch.
def tertiary_weight(collation_element)
  raw_weight = level_weight(collation_element, TERTIARY_LEVEL)
  return remove_continuation_bits(raw_weight) if continuation?(raw_weight)

  masked = raw_weight & @tertiary_mask
  masked ^ @case_switch
end