class TwitterCldr::Resources::SegmentDictionariesImporter

Constants

DICTIONARY_FILES
URL_TEMPLATE

Public Instance Methods

execute()
# File lib/twitter_cldr/resources/segment_dictionaries_importer.rb, line 27
# Creates the output directory (including missing parents) and then imports
# each entry of DICTIONARY_FILES in order.
def execute
  FileUtils.mkdir_p(output_path)
  DICTIONARY_FILES.each { |dictionary_file| import_dictionary_file(dictionary_file) }
end

Private Instance Methods

import_dictionary_file(dictionary_file)
# File lib/twitter_cldr/resources/segment_dictionaries_importer.rb, line 37
# Fetches one dictionary file, loads its entries into a trie, and writes the
# marshalled trie to the path returned by output_path_for.
#
# Each line is read as a word optionally followed by a tab and an integer
# frequency; a missing frequency is stored as 0. Leading characters matched
# by the '\A[[:Z:][:C:]]+' UnicodeRegex are stripped first, and lines that
# are empty or start with '#' after stripping are skipped.
#
# dictionary_file - value passed to url_for and output_path_for.
#
# Returns the result of File.binwrite (number of bytes written).
def import_dictionary_file(dictionary_file)
  # Block form so the underlying IO is closed as soon as it has been read.
  source = URI.open(url_for(dictionary_file), &:read)
  trie = TwitterCldr::Utils::Trie.new
  space_regexp = TwitterCldr::Shared::UnicodeRegex.compile('\A[[:Z:][:C:]]+').to_regexp

  # chomp: true drops both "\n" and "\r\n", so CRLF input does not leak a
  # trailing "\r" into the trie key.
  source.each_line(chomp: true) do |line|
    entry = line.sub(space_regexp, '')
    # An empty entry would make split return [] and fail on nil.unpack below.
    next if entry.empty? || entry.start_with?('#')

    characters, frequency = entry.split("\t")
    trie.add(characters.unpack('U*'), frequency ? frequency.to_i : 0)
  end

  # Marshal output is binary data; binwrite avoids newline/encoding translation.
  File.binwrite(output_path_for(dictionary_file), Marshal.dump(trie))
end
output_path()
# File lib/twitter_cldr/resources/segment_dictionaries_importer.rb, line 70
# Looks up the :output_path entry in params via fetch.
# NOTE(review): params is defined outside this view; if it is a Hash, a
# missing key raises KeyError — confirm against its definition.
def output_path
  key = :output_path
  params.fetch(key)
end
output_path_for(dictionary_file)
# File lib/twitter_cldr/resources/segment_dictionaries_importer.rb, line 65
# Builds the dump file path for a dictionary file: the file's base name with
# its extension (as reported by File.extname) removed, suffixed with ".dump",
# joined onto output_path.
def output_path_for(dictionary_file)
  extension = File.extname(dictionary_file)
  stem = File.basename(dictionary_file).chomp(extension)
  dump_name = "#{stem}.dump"
  File.join(output_path, dump_name)
end
url_for(dictionary_file)
# File lib/twitter_cldr/resources/segment_dictionaries_importer.rb, line 58
# Fills URL_TEMPLATE with the dictionary path and an ICU release tag of the
# form "release-X-Y", derived from Versions.icu_version with dots replaced
# by dashes.
def url_for(dictionary_file)
  release_tag = "release-#{Versions.icu_version.tr('.', '-')}"
  format(URL_TEMPLATE, icu_version: release_tag, path: dictionary_file)
end