class TwitterCldr::Collation::SortKeyBuilder

SortKeyBuilder builds a collation sort key from an array of collation elements.

The weight compression algorithms used for every level are described in the ICU collation design document: source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm

Constants

CASE_BITS_MASK
CASE_SWITCH
KEEP_CASE_MASK
LEVEL_SEPARATOR
MAX_REGULAR_PRIMARY
MIN_NON_LATIN_PRIMARY
NO_CASE_SWITCH
PRIMARY_BYTE_MAX
PRIMARY_BYTE_MIN

Primary level compression constants

REMOVE_CASE_MASK

Tertiary level compression constants

SECONDARY_BOTTOM

Secondary level compression constants

SECONDARY_BOTTOM_COUNT
SECONDARY_COMMON
SECONDARY_COMMON_SPACE
SECONDARY_PROPORTION
SECONDARY_TOP
SECONDARY_TOP_COUNT
SECONDARY_TOTAL_COUNT
TERTIARY_ADDITION_CASE_FIRST
TERTIARY_ADDITION_NORMAL
TERTIARY_BOTTOM_LOWER_FIRST

Lower first

TERTIARY_BOTTOM_NORMAL

Normal (case-first disabled)

TERTIARY_BOTTOM_UPPER_FIRST

Upper first

TERTIARY_COMMON_LOWER_FIRST
TERTIARY_COMMON_NORMAL
TERTIARY_COMMON_UPPER_FIRST
TERTIARY_LEVEL
TERTIARY_PROPORTION
TERTIARY_TOP_LOWER_FIRST
TERTIARY_TOP_NORMAL
TERTIARY_TOP_UPPER_FIRST
VALID_CASE_FIRST_OPTIONS
VALID_MAXIMUM_LEVEL_OPTIONS

Attributes

case_first[R]
collation_elements[R]

Public Class Methods

build(collation_elements, options = nil) click to toggle source

Returns a sort key as an array of bytes.

Arguments:

collation_elements - an array of collation elements, represented as arrays of integer weights.
options            - hash of options:
  case_first       - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
  maximum_level    - only append weights up to the specified maximum level (1 or 2); can be useful for searching/matching applications

An instance of the class is created only to prevent passing of @collation_elements and @bytes_array from one method into another while forming the sort key.

# File lib/twitter_cldr/collation/sort_key_builder.rb, line 37
# Builds a sort key for +collation_elements+ and returns it as an array of bytes.
#
# collation_elements - an array of collation elements (arrays of integer weights).
# options            - optional hash of options (:case_first, :maximum_level);
#                      nil (the default) is treated as "no options".
#
# A temporary builder instance is created and its bytes_array is returned.
def self.build(collation_elements, options = nil)
  # The initializer rejects anything that is not a Hash, so the nil default has
  # to be normalized here; otherwise a one-argument call would always raise.
  new(collation_elements, options.nil? ? {} : options).bytes_array
end
new(collation_elements, options = {}) click to toggle source

Arguments:

collation_elements - an array of collation elements, represented as arrays of integer weights.
options            - hash of options:
  case_first       - optional case-first sorting order setting: :upper, :lower, nil (discard case bits).
  maximum_level    - only append weights up to the specified maximum level (1 or 2); can be useful for searching/matching applications
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 48
# Validates the options and stores the inputs needed to build the sort key.
#
# collation_elements - an array of collation elements (arrays of integer weights).
# options            - hash of options:
#   :case_first    - must be one of VALID_CASE_FIRST_OPTIONS.
#   :maximum_level - must be one of VALID_MAXIMUM_LEVEL_OPTIONS.
#
# Raises ArgumentError when options is not a Hash or when either option holds
# a value that is not in the corresponding list of valid values.
def initialize(collation_elements, options = {})
  raise ArgumentError, "second argument should be an options hash, not `#{options}`. Do you mean `:case_first => #{options}`?" unless options.kind_of? Hash

  case_first = options[:case_first]
  raise ArgumentError, "invalid case-first options '#{case_first.inspect}'" unless VALID_CASE_FIRST_OPTIONS.include?(case_first)

  maximum_level = options[:maximum_level]
  # Interpolate the offending value so the message shows what was actually passed.
  raise ArgumentError, "invalid maximum_level option '#{maximum_level.inspect}'" unless VALID_MAXIMUM_LEVEL_OPTIONS.include?(maximum_level)

  @collation_elements = collation_elements
  @case_first         = case_first
  @maximum_level      = maximum_level

  # Derive the tertiary-level settings that depend on @case_first.
  init_tertiary_constants
end

Public Instance Methods

bytes_array() click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 64
# Returns the sort key bytes, building them on first access and reusing the
# stored array on later calls.
def bytes_array
  return @bytes_array if @bytes_array

  @bytes_array = build_bytes_array
end

Private Instance Methods

append_common_bytes(boundary, count_limit, top) click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 176
# Flushes the pending run of common weights (@common_count) into @bytes_array
# as compressed bytes, then resets the counter to zero.
#
# boundary    - byte value the encoded counts are measured from.
# count_limit - largest run length a single byte may encode.
# top         - when truthy, counts are subtracted from boundary; otherwise added.
def append_common_bytes(boundary, count_limit, top)
  direction = top ? -1 : 1
  full_run  = boundary + direction * count_limit

  # Emit one "full" byte for every complete chunk of count_limit commons.
  until @common_count <= count_limit
    @bytes_array.push(full_run)
    @common_count -= count_limit
  end

  # The remainder is encoded as an offset of (count - 1) from the boundary.
  @bytes_array.push(boundary + direction * (@common_count - 1))
  @common_count = 0
end
append_primary_bytes() click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 80
# Appends the primary-level bytes of every collation element to @bytes_array.
#
# A leading byte equal to the remembered @last_leading_byte is not repeated.
# When the leading byte changes while one is remembered, PRIMARY_BYTE_MIN or
# PRIMARY_BYTE_MAX is written first, depending on whether the new leading byte
# is smaller or larger than the remembered one.
def append_primary_bytes
  @last_leading_byte = nil

  @collation_elements.each do |element|
    primary_bytes = integer_to_bytes_array(level_weight(element, PRIMARY_LEVEL))
    next if primary_bytes.empty?

    lead, *tail = primary_bytes

    unless lead == @last_leading_byte
      if @last_leading_byte
        @bytes_array << (lead < @last_leading_byte ? PRIMARY_BYTE_MIN : PRIMARY_BYTE_MAX)
      end
      @bytes_array << lead

      # Only remember the leading byte when there are trailing bytes and the
      # leading byte is in the compressible range.
      remember = !tail.empty? && compressible_primary?(lead)
      @last_leading_byte = remember ? lead : nil
    end

    @bytes_array.concat(tail)
  end
end
append_secondary_byte(secondary) click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 142
# Handles one secondary-level byte: a byte equal to SECONDARY_COMMON only bumps
# the pending common counter; any other byte is appended through
# append_with_common_bytes using SECONDARY_COMMON_SPACE.
def append_secondary_byte(secondary)
  unless secondary == SECONDARY_COMMON
    return append_with_common_bytes(secondary, SECONDARY_COMMON_SPACE)
  end

  @common_count += 1
end
append_secondary_bytes() click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 105
# Appends LEVEL_SEPARATOR followed by the secondary-level bytes of every
# collation element, then flushes any common bytes still pending at the end.
def append_secondary_bytes
  @bytes_array << LEVEL_SEPARATOR
  @common_count = 0

  @collation_elements.each do |element|
    secondary = level_weight(element, SECONDARY_LEVEL)
    integer_to_bytes_array(secondary).each { |byte| append_secondary_byte(byte) }
  end

  # Nothing to flush when no trailing common bytes were counted.
  return unless @common_count > 0

  append_common_bytes(SECONDARY_BOTTOM, SECONDARY_BOTTOM_COUNT, false)
end
append_tertiary_byte(tertiary) click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 150
# Handles one tertiary-level byte. A byte equal to @tertiary_common only bumps
# the pending common counter. Any other byte is shifted by @tertiary_addition
# when required and appended through append_with_common_bytes.
def append_tertiary_byte(tertiary)
  return @common_count += 1 if tertiary == @tertiary_common

  adjusted =
    if @tertiary_common == TERTIARY_COMMON_NORMAL && @tertiary_common < tertiary
      # Bytes above the common value are moved up.
      tertiary + @tertiary_addition
    elsif @tertiary_common == TERTIARY_COMMON_UPPER_FIRST && tertiary <= @tertiary_common
      # Bytes at or below the common value are moved down.
      tertiary - @tertiary_addition
    else
      tertiary
    end

  append_with_common_bytes(adjusted, @tertiary_common_space)
end
append_tertiary_bytes() click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 120
def append_tertiary_bytes
  @bytes_array << LEVEL_SEPARATOR

  @common_count = 0

  @collation_elements.each do |collation_element|
    integer_to_bytes_array(tertiary_weight(collation_element)).each do |byte|
      append_tertiary_byte(byte)
    end
  end

  # append compressed trailing common bytes
  if @common_count > 0
    if @tertiary_common == TERTIARY_BOTTOM_NORMAL
      append_common_bytes(@tertiary_bottom, @tertiary_bottom_count, false)
    else
      append_common_bytes(@tertiary_top, @tertiary_top_count, true)
      @bytes_array[-1] -= 1 # make @bytes_array[-1] = boundary - @common_count (for compatibility with ICU)
    end
  end
end
append_with_common_bytes(byte, options) click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 164
# Appends +byte+ to @bytes_array, first flushing any pending common bytes.
#
# byte    - the non-common weight byte to append.
# options - hash with :common, :bottom, :bottom_count, :top and :top_count.
#
# Pending commons are flushed with the bottom settings when +byte+ is below
# options[:common], and with the top settings otherwise.
def append_with_common_bytes(byte, options)
  if @common_count > 0
    below_common = byte < options[:common]

    if below_common
      boundary, limit = options.values_at(:bottom, :bottom_count)
    else
      boundary, limit = options.values_at(:top, :top_count)
    end

    append_common_bytes(boundary, limit, !below_common)
  end

  @bytes_array << byte
end
build_bytes_array() click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 70
# Builds the sort key from scratch: primary bytes always, secondary and
# tertiary bytes only when @maximum_level is unset or high enough.
def build_bytes_array
  @bytes_array = []

  append_primary_bytes
  append_secondary_bytes if !@maximum_level || @maximum_level >= 2
  append_tertiary_bytes  if !@maximum_level || @maximum_level >= 3

  @bytes_array
end
compressible_primary?(leading_byte) click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 101
# True when +leading_byte+ lies within MIN_NON_LATIN_PRIMARY..MAX_REGULAR_PRIMARY
# (both ends inclusive).
def compressible_primary?(leading_byte)
  compressible_range = MIN_NON_LATIN_PRIMARY..MAX_REGULAR_PRIMARY
  compressible_range.cover?(leading_byte)
end
continuation?(weight) click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 213
# True when every bit of CASE_BITS_MASK is set in +weight+.
def continuation?(weight)
  case_bits = weight & CASE_BITS_MASK
  case_bits == CASE_BITS_MASK
end
init_tertiary_constants() click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 221
# Derives the tertiary-level settings from @case_first: the case switch and
# mask, the addition applied to non-common bytes, the common/top/bottom
# boundary values, the top and bottom run limits, and the options hash stored
# in @tertiary_common_space.
def init_tertiary_constants
  upper_first  = @case_first == :upper
  @case_switch = upper_first ? CASE_SWITCH : NO_CASE_SWITCH

  if !@case_first
    # Case-first disabled: case bits are masked out.
    @tertiary_mask     = REMOVE_CASE_MASK
    @tertiary_addition = TERTIARY_ADDITION_NORMAL
    @tertiary_common   = TERTIARY_COMMON_NORMAL
    @tertiary_top      = TERTIARY_TOP_NORMAL
    @tertiary_bottom   = TERTIARY_BOTTOM_NORMAL
  elsif upper_first
    @tertiary_mask     = KEEP_CASE_MASK
    @tertiary_addition = TERTIARY_ADDITION_CASE_FIRST
    @tertiary_common   = TERTIARY_COMMON_UPPER_FIRST
    @tertiary_top      = TERTIARY_TOP_UPPER_FIRST
    @tertiary_bottom   = TERTIARY_BOTTOM_UPPER_FIRST
  else
    # Any other case-first value (i.e. :lower).
    @tertiary_mask     = KEEP_CASE_MASK
    @tertiary_addition = TERTIARY_ADDITION_CASE_FIRST
    @tertiary_common   = TERTIARY_COMMON_NORMAL
    @tertiary_top      = TERTIARY_TOP_LOWER_FIRST
    @tertiary_bottom   = TERTIARY_BOTTOM_LOWER_FIRST
  end

  # Split the values strictly between bottom and top into a top share
  # (TERTIARY_PROPORTION of them, truncated) and a bottom share (the rest).
  available              = @tertiary_top - @tertiary_bottom - 1
  @tertiary_top_count    = (TERTIARY_PROPORTION * available).to_i
  @tertiary_bottom_count = available - @tertiary_top_count

  @tertiary_common_space = {
    common:       @tertiary_common,
    bottom:       @tertiary_bottom,
    bottom_count: @tertiary_bottom_count,
    top:          @tertiary_top,
    top_count:    @tertiary_top_count
  }
end
integer_to_bytes_array(number) click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 202
# Splits +number+ into its bytes, most significant first.
# Returns an empty array for zero and for negative numbers.
def integer_to_bytes_array(number)
  low_to_high = []
  remaining   = number

  # Peel bytes off the low end, then flip the order at the end.
  until remaining <= 0
    low_to_high.push(remaining & 0xFF)
    remaining >>= 8
  end

  low_to_high.reverse
end
level_weight(collation_element, level) click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 198
# Returns the weight stored at index +level+ of +collation_element+,
# or 0 when that entry is missing (nil) or false.
def level_weight(collation_element, level)
  weight = collation_element[level]
  weight ? weight : 0
end
remove_continuation_bits(weight) click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 217
# Returns +weight+ with only the bits of REMOVE_CASE_MASK kept.
def remove_continuation_bits(weight)
  masked = weight & REMOVE_CASE_MASK
  masked
end
tertiary_weight(collation_element) click to toggle source
# File lib/twitter_cldr/collation/sort_key_builder.rb, line 188
# Returns the tertiary weight of +collation_element+ ready for byte splitting.
# Continuation weights only have their continuation bits removed; all other
# weights are masked with @tertiary_mask and XOR-ed with @case_switch.
def tertiary_weight(collation_element)
  raw = level_weight(collation_element, TERTIARY_LEVEL)
  return remove_continuation_bits(raw) if continuation?(raw)

  (raw & @tertiary_mask) ^ @case_switch
end