class TwitterCldr::Segmentation::CjBreakEngine

Constants

KATAKANA_COSTS
LARGE_NUMBER

the equivalent of Java's Integer.MAX_VALUE

MAX_KATAKANA_COST
MAX_KATAKANA_GROUP_LENGTH
MAX_KATAKANA_LENGTH
MAX_SNLP

magic number pulled from ICU's source code

MAX_WORD_SIZE

Magic number pulled from ICU's source code; presumably slightly longer than the longest Chinese/Japanese/Korean word.

Public Class Methods

word_set() click to toggle source
# File lib/twitter_cldr/segmentation/cj_break_engine.rb, line 29
# Returns the set of codepoints this break engine handles, building it on
# first use and caching it in @word_set on the class.
#
# The set is assembled from the Han, Katakana (together with the \uff9e and
# \uff9f escapes) and Hiragana patterns, plus the two prolonged sound marks.
def self.word_set
  return @word_set if @word_set

  cj_chars = TwitterCldr::Shared::UnicodeSet.new

  # Patterns are applied in this fixed order.
  ['[:Han:]', '[[:Katakana:]\uff9e\uff9f]', '[:Hiragana:]'].each do |pattern|
    cj_chars.apply_pattern(pattern)
  end

  # 0xFF70: HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
  # 0x30FC: KATAKANA-HIRAGANA PROLONGED SOUND MARK
  [0xFF70, 0x30FC].each { |sound_mark| cj_chars.add(sound_mark) }

  @word_set = cj_chars.to_set
end

Private Instance Methods

dictionary() click to toggle source
# File lib/twitter_cldr/segmentation/cj_break_engine.rb, line 157
# Returns the dictionary used to look up candidate words, fetching it from
# Dictionary.cj on first use and caching it in @dictionary afterwards.
def dictionary
  return @dictionary if @dictionary

  @dictionary = Dictionary.cj
end
divide_up_dictionary_range(cursor, end_pos, &block) click to toggle source
# File lib/twitter_cldr/segmentation/cj_break_engine.rb, line 47
# Splits the text between the cursor's current position and end_pos into
# words and yields the position of each word boundary in ascending order.
#
# The split is chosen by dynamic programming. best_snlp[i] holds the lowest
# accumulated cost found so far for reaching offset i, and prev[i] holds the
# offset at which the word ending at i begins. Candidate words come from
# three places: dictionary matches, a single-character fallback costing
# MAX_SNLP, and a heuristic that treats a run of Katakana as one word.
#
# Offsets into best_snlp and prev (idx, j) are relative to the cursor's
# starting position. Yielded boundaries are absolute positions.
#
# cursor  - text cursor; read through #position and #codepoint and moved
#           forward with #advance while the range is scanned.
# end_pos - absolute position one past the last character to segment.
#
# Yields each boundary position. If no path reaches the end of the range,
# end_pos is yielded as the only boundary.
#
# Returns an Enumerator over the boundaries when no block is given.
def divide_up_dictionary_range(cursor, end_pos, &block)
  return to_enum(__method__, cursor, end_pos) unless block_given?

  input_length = end_pos - cursor.position
  best_snlp = Array.new(input_length + 1) { LARGE_NUMBER }
  prev = Array.new(input_length + 1) { -1 }

  best_snlp[0] = 0
  start_pos = cursor.position
  is_prev_katakana = false

  until cursor.position >= end_pos
    idx = cursor.position - start_pos

    # No segmentation reaches this offset, so no word can start here.
    if best_snlp[idx] == LARGE_NUMBER
      cursor.advance
      next
    end

    # Never search past the end of the range.
    max_search_length = [MAX_WORD_SIZE, end_pos - cursor.position].min

    count, values, lengths, _ = dictionary.matches(
      cursor, max_search_length, max_search_length
    )

    # If the dictionary offers no single-character word here, add one with
    # the maximum cost so the range can always be traversed. Characters in
    # the Hangul word set are excluded from this fallback.
    if (count == 0 || lengths[0] != 1) && !hangul_word_set.include?(cursor.codepoint)
      values[count] = MAX_SNLP
      lengths[count] = 1
      count += 1
    end

    # Relax every candidate word that starts at idx.
    count.times do |candidate|
      word_end = lengths[candidate] + idx
      new_snlp = best_snlp[idx] + values[candidate]

      if new_snlp < best_snlp[word_end]
        best_snlp[word_end] = new_snlp
        prev[word_end] = idx
      end
    end

    # In Japanese, single-character Katakana words are pretty rare.
    # Accordingly, we apply the following heuristic: any continuous
    # run of Katakana characters is considered a candidate word with
    # a default cost specified in the katakanaCost table according
    # to its length.
    is_katakana = is_katakana?(cursor.codepoint)

    if !is_prev_katakana && is_katakana
      # j is the offset just past the Katakana run. Like idx, it must be
      # relative to start_pos: it is subtracted from idx to get the run
      # length and used to index best_snlp/prev. Deriving it from the
      # absolute cursor position breaks both whenever start_pos is not 0.
      j = idx + 1
      cursor.advance

      while j < input_length && (j - idx) < MAX_KATAKANA_GROUP_LENGTH && is_katakana?(cursor.codepoint)
        cursor.advance
        j += 1
      end

      if (j - idx) < MAX_KATAKANA_GROUP_LENGTH
        new_snlp = best_snlp[idx] + get_katakana_cost(j - idx)

        if new_snlp < best_snlp[j]
          best_snlp[j] = new_snlp
          prev[j] = idx
        end
      end

      # NOTE(review): the cursor is left at the end of the Katakana run and
      # advanced once more below, so offsets inside the run (and offset j)
      # are never visited as word starts. Confirm this is intended.
    end

    is_prev_katakana = is_katakana

    cursor.advance
  end

  t_boundary = []

  if best_snlp[input_length] == LARGE_NUMBER
    # The end of the range was never reached; treat it as a single word.
    t_boundary << end_pos
  else
    # Walk the back-pointers from the end, collecting boundaries last-first.
    idx = end_pos - start_pos

    while idx > 0
      t_boundary << idx + start_pos
      idx = prev[idx]
    end
  end

  t_boundary.reverse_each(&block)
end
get_katakana_cost(word_length) click to toggle source
# File lib/twitter_cldr/segmentation/cj_break_engine.rb, line 149
# Returns the cost assigned to a Katakana run of word_length characters.
#
# Lengths above MAX_KATAKANA_LENGTH all get MAX_KATAKANA_COST; any other
# length is looked up in KATAKANA_COSTS by index.
def get_katakana_cost(word_length)
  return MAX_KATAKANA_COST if word_length > MAX_KATAKANA_LENGTH

  KATAKANA_COSTS[word_length]
end
hangul_word_set() click to toggle source
# File lib/twitter_cldr/segmentation/cj_break_engine.rb, line 140
# Returns KoreanBreakEngine.word_set, fetched on first use and cached in the
# class variable @@hangul_word_set.
def hangul_word_set
  return @@hangul_word_set if defined?(@@hangul_word_set) && @@hangul_word_set

  @@hangul_word_set = KoreanBreakEngine.word_set
end
is_katakana?(codepoint) click to toggle source
# File lib/twitter_cldr/segmentation/cj_break_engine.rb, line 144
# Returns true when codepoint falls in 0x30A1..0x30FE (except 0x30FB) or in
# 0xFF66..0xFF9F, and false otherwise.
def is_katakana?(codepoint)
  in_first_range = codepoint.between?(0x30A1, 0x30FE) && codepoint != 0x30FB

  in_first_range || codepoint.between?(0xFF66, 0xFF9F)
end
word_set() click to toggle source
# File lib/twitter_cldr/segmentation/cj_break_engine.rb, line 43
# Instance-level accessor that returns whatever the receiver's class
# returns from its own word_set method.
def word_set
  engine_class = self.class
  engine_class.word_set
end