class TwitterCldr::Resources::LanguageCodesImporter

Constants

INPUT_DATA
ISO_639_COLUMNS
ISO_639_FILE
KEYS_TO_STANDARDS
STANDARDS_TO_KEYS

Private Instance Methods

build_table(language_codes_map) click to toggle source
# File lib/twitter_cldr/resources/language_codes_importer.rb, line 181
# Builds the lookup table that gets marshalled to disk.
#
# The table has one index per standard (plus a :name index). Every index maps
# a code (or a language name) to the very same entry hash, so an entry reached
# through any index is the identical object.
#
# language_codes_map - Hash of language name => { standard => code }.
#
# Returns the table Hash with every index sorted by key.
def build_table(language_codes_map)
  # A Hash with a default proc can't be marshalled, so every index is created
  # explicitly up front.
  index_names = [:name] + KEYS_TO_STANDARDS.values.uniq.sort_by(&:to_s)

  table = {}
  index_names.each { |index| table[index] = {} }

  language_codes_map.each do |name, codes|
    table[:name][name] = { name: name }.merge(codes)
  end

  # Register each entry under every standard it has a code for.
  table[:name].each_pair do |_name, entry|
    STANDARDS_TO_KEYS.each_key do |standard|
      code = entry[standard]
      next unless code

      table[standard.to_sym][code.to_sym] = entry
    end
  end

  # Re-assigning an existing key while iterating is safe: no keys are added.
  table.each_key do |index|
    table[index] = Hash[table[index].sort]
  end

  table
end
execute() click to toggle source
# File lib/twitter_cldr/resources/language_codes_importer.rb, line 38
# Entry point of the importer: runs the two phases in order and returns
# whatever the import phase returns.
def execute
  # Phase 1: prepare the input data (see prepare_data).
  prepare_data

  # Phase 2: import it (see import_data).
  import_data
end
import_bcp_47(result = {}) click to toggle source

Generates codes in the following format (codes are stored as Symbols):

{
  :Bangka => {
    :bcp_47     => :mfb,     # preferred code
    :bcp_47_alt => :"ms-mfb" # alternative code (optional)
  }
}

# File lib/twitter_cldr/resources/language_codes_importer.rb, line 110
# Reads the BCP 47 registry file and merges its codes into +result+.
#
# The file is a sequence of records separated by "%%" lines. Each record is a
# list of "Field: value" lines; a long value may be folded onto following
# lines, which then start with whitespace.
#
# result - Hash of language name => codes to merge into (default: new Hash).
#
# Returns +result+.
def import_bcp_47(result = {})
  File.open(source_path_for(BCP_47_FILE)) do |file|
    # Skip the header line. Unlike Enumerator#next, IO#gets simply returns nil
    # on an empty file instead of raising StopIteration.
    file.gets

    data  = {}
    entry = ''

    file.each_line do |line|
      line.chomp!

      if line == '%%'
        # Record separator: flush the pending field, then the whole record.
        process_bcp_47_entry(entry, data)
        process_bcp_47_data(data, result)
      elsif line.include?(':') && !line.start_with?(' ', "\t")
        # A new field starts: flush the previous one and start collecting.
        process_bcp_47_entry(entry, data)
        entry = line
      else
        # Folded continuation of the current field. It may itself contain a
        # colon, which must not be mistaken for a new field.
        entry += line
      end
    end

    # The last record is not followed by a separator.
    process_bcp_47_entry(entry, data)
    process_bcp_47_data(data, result)
  end

  result
end
import_data() click to toggle source
# File lib/twitter_cldr/resources/language_codes_importer.rb, line 57
# Collects the codes from both sources, orders them deterministically, builds
# the lookup table and writes it out as a Marshal dump.
def import_data
  # The ISO 639 codes are read first; the BCP 47 import merges into them.
  codes_by_name = import_bcp_47(import_iso_639)

  # Sort the codes inside each language entry, then the languages themselves.
  sorted_pairs   = codes_by_name.map { |name, codes| [name, Hash[codes.sort]] }.sort
  language_codes = Hash[sorted_pairs]

  table = build_table(language_codes)
  write('language_codes_table.dump', Marshal.dump(table))
end
import_iso_639(result = {}) click to toggle source

Generates codes in the following format (codes are stored as Symbols):

{
  :Albanian => {
    :iso_639_1      => :sq,
    :iso_639_2      => :alb, # default (bibliographic) code
    :iso_639_2_term => :sqi, # terminology code (optional)
    :iso_639_3      => :sqi
  }
}

# File lib/twitter_cldr/resources/language_codes_importer.rb, line 82
# Reads the tab-separated ISO 639 file and records, per language name, the
# code it has for each known standard.
#
# result - Hash of language name => codes to merge into (default: new Hash).
#
# Returns +result+.
def import_iso_639(result = {})
  File.open(source_path_for(ISO_639_FILE)) do |file|
    rows = file.each_line
    rows.next # skip header

    rows.each do |row|
      # Drop the quotes around a quoted stretch together with any tabs inside
      # it, so those tabs are not taken for column separators.
      cleaned = row.chomp.gsub(/"(.*)"/) { Regexp.last_match(1).delete("\t") }
      values  = cleaned.split("\t")
      fields  = Hash[ISO_639_COLUMNS.zip(values)]

      codes = (result[fields[:Ref_Name].to_sym] ||= {})

      STANDARDS_TO_KEYS.each do |standard_key, column|
        raw = fields[column]
        next if raw.nil? || raw.empty?

        codes[standard_key] = raw.to_sym
      end
    end
  end

  result
end
prepare_data() click to toggle source
# File lib/twitter_cldr/resources/language_codes_importer.rb, line 43
# Makes sure every input file listed in INPUT_DATA is available locally,
# downloading the ones that are missing.
#
# INPUT_DATA maps a file name (resolved through source_path_for) to the URL
# the file is fetched from. Files that already exist are left untouched.
def prepare_data
  INPUT_DATA.each do |file, url|
    source_path = source_path_for(file)
    next if File.exist?(source_path)

    # Kernel#open stopped handling URLs in Ruby 3.0, so go through URI.open
    # (open-uri, Ruby >= 2.5). The block form also closes the remote stream.
    require 'open-uri'
    contents = URI.open(url, &:read)

    # Write only after the download succeeded, so a failed download does not
    # leave an empty file behind that would be mistaken for valid input.
    File.binwrite(source_path, contents)
  end
end
process_bcp_47_data(data, result) click to toggle source
# File lib/twitter_cldr/resources/language_codes_importer.rb, line 155
# Merges one parsed BCP 47 record into +result+ and then empties +data+ so the
# same hash can collect the next record.
#
# Only records of type "language" or "extlang" are imported; the "Private use"
# range and records with scope "collection" are skipped.
#
# data   - Hash of lower-cased field name => String value, plus 'names', the
#          list of description names collected for the record.
# result - Hash of language name => codes, updated in place.
#
# Returns the (now empty) +data+ hash.
def process_bcp_47_data(data, result)
  # A record without any description has no names; treat that as an empty list
  # instead of failing on nil.
  names = Array(data['names'])

  # Names are collected as Symbols, so compare on the text: a plain String
  # comparison never matches and lets the private-use range through.
  private_use = names.any? { |name| name.to_s == 'Private use' }

  importable = !data.empty? &&
               %w[language extlang].include?(data['type']) &&
               !private_use &&
               data['scope'] != 'collection'

  if importable
    existing_names = names.select { |name| result.key?(name) }

    preferred   = data['preferred-value']
    alternative = [data['prefix'], data['subtag']].compact.join('-')

    # With a preferred value the prefix-subtag form is only an alternative.
    bcp_47 = { bcp_47: (preferred || alternative).to_sym }
    bcp_47[:bcp_47_alt] = alternative.to_sym if preferred

    existing_names.each { |name| result[name.to_sym].merge!(bcp_47) }

    # Names new to +result+ inherit the codes of the first already-known name
    # of the same record.
    bcp_47.merge!(result[existing_names.first]) unless existing_names.empty?

    (names - existing_names).each { |name| result[name.to_sym] = bcp_47.dup }
  end

  data.clear
end
process_bcp_47_entry(entry, data) click to toggle source
# File lib/twitter_cldr/resources/language_codes_importer.rb, line 141
# Parses one "Field: value" line and stores it in +data+, then empties
# +entry+ in place so the caller's buffer is ready for the next field.
#
# "Description" values accumulate as Symbols under data['names']; every other
# field is stored under its lower-cased name, a later value replacing an
# earlier one.
#
# Returns nil when +entry+ is nil or empty, otherwise the emptied +entry+.
def process_bcp_47_entry(entry, data)
  return if entry.nil? || entry.empty?

  # Split on the first colon only: the value itself may contain colons.
  field, content = entry.chomp.split(':', 2).map { |part| part.strip }

  case field
  when 'Description'
    data['names'] ||= []
    data['names'] << content.to_sym
  else
    data[field.downcase] = content
  end

  entry.clear
end
source_path_for(file) click to toggle source
# File lib/twitter_cldr/resources/language_codes_importer.rb, line 53
# Returns the path of +file+ inside the vendor directory
# (TwitterCldr::VENDOR_DIR).
def source_path_for(file)
  vendor_dir = TwitterCldr::VENDOR_DIR
  File.join(vendor_dir, file)
end
write(file, data) click to toggle source
# File lib/twitter_cldr/resources/language_codes_importer.rb, line 67
# Writes +data+ to +file+ inside the directory given by params[:output_path].
#
# The data is written in binary mode: the payload produced in this class is a
# Marshal dump, and text mode would apply newline translation on platforms
# that have it, corrupting the dump.
#
# Raises KeyError (from Hash#fetch) if params has no :output_path, assuming
# params is a Hash.
# Returns the number of bytes written.
def write(file, data)
  File.binwrite(File.join(params.fetch(:output_path), file), data)
end