class TwitterCldr::Resources::SegmentRulesImporter

Constants

BOUNDARY_TYPES

@TODO: more boundary types

CategoryTable
Locale
StateTable
StatusTable
TYPES_TO_ATTRS

Public Instance Methods

execute() click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 42
# Imports segmentation data for every locale yielded by each_locale.
#
# For each locale and each BOUNDARY_TYPES entry, the matching
# <segmentation> element is looked up and up to two YAML files are written
# beneath output_path:
#
#   rules/<locale>/<kind>.yml         (result of rule_data_for)
#   suppressions/<locale>/<kind>.yml  (result of suppressions_for)
#
# A file is only written when the corresponding data is non-empty.
def execute
  each_locale do |locale, doc|
    BOUNDARY_TYPES.each do |kind, icu_kind|
      segmentation = doc.xpath(
        "//ldml/segmentations/segmentation[@type=\"#{TYPES_TO_ATTRS[kind]}\"]"
      )

      # Writes payload to <output_path>/<category>/<locale>/<kind>.yml,
      # creating the directory first; empty payloads are skipped.
      dump_yaml = lambda do |category, payload|
        next if payload.empty?

        target_dir = File.join(output_path, category, locale)
        FileUtils.mkdir_p(target_dir)
        File.write(File.join(target_dir, "#{kind}.yml"), YAML.dump(payload))
      end

      dump_yaml.call('rules', rule_data_for(icu_kind, locale, segmentation))
      dump_yaml.call('suppressions', suppressions_for(icu_kind, locale, segmentation))
    end
  end
end

Private Instance Methods

brkiter_base_name() click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 168
# Value of the ICU_BRKITR_BASE_NAME constant read from icu_data.
# The lookup happens once; the result is cached in @brkiter_base_name.
def brkiter_base_name
  return @brkiter_base_name if @brkiter_base_name

  @brkiter_base_name = icu_data.const_get(:ICU_BRKITR_BASE_NAME)
end
brkiter_name() click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 164
# Value of the ICU_BRKITR_NAME constant read from icu_data.
# The lookup happens once; the result is cached in @brkiter_name.
def brkiter_name
  return @brkiter_name if @brkiter_name

  @brkiter_name = icu_data.const_get(:ICU_BRKITR_NAME)
end
bundle_for(locale) click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 160
# Returns the break-iterator resource bundle for +locale+.
#
# The bundle is obtained from resource_bundle.getBundleInstance using
# brkiter_base_name and locale_root, and cached per locale. The previous
# implementation memoized a single bundle in one instance variable, so
# every call after the first returned the first locale's bundle no matter
# which locale was requested.
#
# NOTE(review): the cache is keyed on the locale object itself, which
# assumes equal locales hash equally (presumably true for ULocale under
# JRuby) - confirm.
#
# @param locale the locale object to pass through to getBundleInstance
# @return the bundle returned by getBundleInstance for that locale
def bundle_for(locale)
  @bundles ||= {}
  @bundles[locale] ||= resource_bundle.getBundleInstance(brkiter_base_name, locale, locale_root)
end
each_locale() { |locale, XML(read)| ... } click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 72
# Yields each segmentation locale together with its parsed XML document.
#
# Files are discovered with the glob <common_path>/segments/*.xml, where
# common_path comes from requirements[:cldr]. The locale is the file's
# base name without the .xml extension and with underscores replaced by
# hyphens (e.g. "zh_Hant.xml" -> "zh-Hant").
#
# Files are visited in sorted order so the iteration order does not depend
# on the platform's directory ordering.
#
# @yield [locale, doc] the locale string and the Nokogiri::XML document
# @return [Enumerator] when no block is given
def each_locale
  return to_enum(__method__) unless block_given?

  pattern = File.join(requirements[:cldr].common_path, 'segments', '*.xml')

  Dir.glob(pattern).sort.each do |file|
    locale = File.basename(file, '.xml').tr('_', '-')
    yield locale, Nokogiri::XML(File.read(file))
  end
end
encode_rbbi_data(data) click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 102
# Converts an RBBI data object into a hash of serialized tables.
#
# @param data object exposing fHeader, fFTable, fRTable, fStatusTable
#   and fTrie
# @return [Hash] with the keys :metadata, :forward_table, :backward_table,
#   :status_table and :category_table, in that order
def encode_rbbi_data(data)
  encoded = { metadata: metadata_from(data.fHeader) }

  forward = data.fFTable
  encoded[:forward_table] = StateTable.new(forward.fTable.to_a, forward.fFlags).dump16

  backward = data.fRTable
  encoded[:backward_table] = StateTable.new(backward.fTable.to_a, backward.fFlags).dump16

  encoded[:status_table] = StatusTable.new(data.fStatusTable.to_a).dump

  # Despite the field name, fTrie is not really a trie.
  encoded[:category_table] = encode_trie(data.fTrie)

  encoded
end
encode_suppressions(suppressions) click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 116
# Builds two tries from the suppression strings and marshals them.
#
# Each suppression is added to the forwards trie keyed by its codepoints,
# and to the backwards trie keyed by the codepoints of the reversed string.
# Every entry stores the value true.
#
# @param suppressions [Enumerable<String>] suppression strings
# @return [Hash] :forwards_trie and :backwards_trie, each a Marshal dump
def encode_suppressions(suppressions)
  forward = TwitterCldr::Utils::Trie.new
  backward = TwitterCldr::Utils::Trie.new

  suppressions.each do |text|
    forward.add(text.codepoints, true)
    backward.add(text.reverse.codepoints, true)
  end

  [[:forwards_trie, forward], [:backwards_trie, backward]].each_with_object({}) do |(key, trie), encoded|
    encoded[key] = Marshal.dump(trie)
  end
end
encode_trie(trie) click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 131
# Serializes the ranges produced by trie.iterator into a category table.
#
# Each range is converted with range_to_a and collected in iteration
# order; the collection is then dumped via CategoryTable#dump16 and
# stripped of surrounding whitespace.
#
# @param trie object whose iterator responds to hasNext and next
# @return [String] the stripped dump16 output
def encode_trie(trie)
  unicode_max = 0x10FFFF
  ranges = []
  cursor = trie.iterator

  while cursor.hasNext
    current = cursor.next
    ranges << range_to_a(current)

    # The range ending at the Unicode max should be the final one, but ICU
    # hands back one more out-of-order range beyond it, so stop here.
    break if current.getEnd == unicode_max
  end

  # @TODO: Distinguish between the 16- and 32-bit flavors
  CategoryTable.new(ranges).dump16.strip
end
icu_binary() click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 180
# Class named 'com.ibm.icu.impl.ICUBinary', fetched through
# requirements[:icu].get_class on first use and cached in @icu_binary.
def icu_binary
  return @icu_binary if @icu_binary

  @icu_binary = requirements[:icu].get_class('com.ibm.icu.impl.ICUBinary')
end
icu_data() click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 184
# Class named 'com.ibm.icu.impl.ICUData', fetched through
# requirements[:icu].get_class on first use and cached in @icu_data.
def icu_data
  return @icu_data if @icu_data

  @icu_data = requirements[:icu].get_class('com.ibm.icu.impl.ICUData')
end
locale_root() click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 172
# Value of OpenType::LOCALE_ROOT resolved on resource_bundle.
# Looked up once and cached in @locale_root.
def locale_root
  return @locale_root if @locale_root

  open_type = resource_bundle.const_get(:OpenType)
  @locale_root = open_type.const_get(:LOCALE_ROOT)
end
metadata_from(header) click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 112
# Extracts the metadata hash from an RBBI header.
#
# @param header object responding to fCatCount
# @return [Hash] { category_count: <header.fCatCount> }
def metadata_from(header)
  category_count = header.fCatCount
  { category_count: category_count }
end
output_path() click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 196
# Returns the :output_path entry of params. execute uses it as the root
# directory beneath which the rules/ and suppressions/ trees are written.
def output_path
  params[:output_path]
end
range_to_a(range) click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 149
# Flattens a range object into a three-element array.
#
# @param range object responding to getStart, getEnd and getValue
# @return [Array] [start, end, value]
def range_to_a(range)
  lower = range.getStart
  upper = range.getEnd
  [lower, upper, range.getValue]
end
rbbi_data_for(kind, locale) click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 153
# Loads the RBBI data for a boundary kind in the given locale.
#
# Steps: wrap +locale+ with ulocale_class, ask that locale's bundle for the
# "boundaries/<kind>" string, read "<brkiter_name>/<that string>" through
# icu_binary.getData, and hand the buffer to rbbi_data_wrapper.get.
#
# @param kind boundary kind, interpolated into the "boundaries/..." key
# @param locale locale identifier passed to ulocale_class.new
# @return whatever rbbi_data_wrapper.get returns for the loaded buffer
def rbbi_data_for(kind, locale)
  ulocale = ulocale_class.new(locale)
  rules_file = bundle_for(ulocale).getStringWithFallback("boundaries/#{kind}")
  raw = icu_binary.getData("#{brkiter_name}/#{rules_file}")

  rbbi_data_wrapper.get(raw)
end
rbbi_data_wrapper() click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 176
# Class named 'com.ibm.icu.impl.RBBIDataWrapper', fetched through
# requirements[:icu].get_class on first use and cached in
# @rbbi_data_wrapper.
def rbbi_data_wrapper
  return @rbbi_data_wrapper if @rbbi_data_wrapper

  @rbbi_data_wrapper = requirements[:icu].get_class('com.ibm.icu.impl.RBBIDataWrapper')
end
resource_bundle() click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 188
# Class named 'com.ibm.icu.impl.ICUResourceBundle', fetched through
# requirements[:icu].get_class on first use and cached in @bundle_class.
def resource_bundle
  return @bundle_class if @bundle_class

  @bundle_class = requirements[:icu].get_class('com.ibm.icu.impl.ICUResourceBundle')
end
rule_data_for(kind, locale, doc) click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 83
# Returns the encoded RBBI rule data for a segmentation element.
#
# The element only contributes data when it declares at least one
# variable or one segment rule; otherwise an empty hash is returned and
# no RBBI data is loaded.
#
# @param kind boundary kind forwarded to rbbi_data_for
# @param locale locale forwarded to rbbi_data_for
# @param doc node (set) queried with the relative XPaths
#   'variables/variable' and 'segmentRules/rule'
# @return [Hash] a fresh hash holding the encode_rbbi_data entries, or {}
def rule_data_for(kind, locale, doc)
  variables = doc.xpath('variables/variable')
  rule_nodes = doc.xpath('segmentRules/rule')

  return {} if variables.empty? && rule_nodes.empty?

  {}.merge(encode_rbbi_data(rbbi_data_for(kind, locale)))
end
suppressions_for(kind, locale, doc) click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 95
# Returns the encoded suppressions for a segmentation element.
#
# Collects the text of every node matched by the relative XPath
# 'suppressions/suppression' and passes the list to encode_suppressions.
# The kind and locale arguments are accepted but not used here.
#
# @param doc node (set) to query
# @return [Hash] the encode_suppressions result, or {} when there are no
#   suppression nodes
def suppressions_for(kind, locale, doc)
  texts = doc.xpath('suppressions/suppression').map { |node| node.text }

  texts.empty? ? {} : encode_suppressions(texts)
end
ulocale_class() click to toggle source
# File lib/twitter_cldr/resources/segment_rules_importer.rb, line 192
# Class named 'com.ibm.icu.util.ULocale', fetched through
# requirements[:icu].get_class on first use and cached in @ulocale_class.
def ulocale_class
  return @ulocale_class if @ulocale_class

  @ulocale_class = requirements[:icu].get_class('com.ibm.icu.util.ULocale')
end