module ZhongwenTools::Romanization

Public: Romanization converts, detects and splits different romanizations.

Public: Romanization converts to pinyin and pyn.

Constants

PYN_PY
PYN_SYLLABIC_NASALS
ROMANIZATIONS_TABLE

TODO: remove excess values, i.e. keys whose value == :pyn
TODO: en.wikipedia.org/wiki/Jyutping
TODO: en.wikipedia.org/wiki/Simplified_Wade

ROMANIZATION_TYPES

Public Class Methods

convert(str, to, from) click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 14
# Public: Converts a romanized string from one romanization system to another.
#
# str  - the String to convert.
# to   - a Symbol naming the target romanization (:py and :pyn have
#        dedicated converters; anything else goes to convert_to_other).
# from - a Symbol naming the romanization str is currently written in.
#
# Returns str itself, untouched, when to == from; otherwise whatever the
# selected converter returns.
def self.convert(str, to, from)
  # Nothing to do when the source and target systems are the same.
  return str if to == from

  case to
  when :py  then convert_to_py(str, from)
  when :pyn then convert_to_pyn(str, from)
  else convert_to_other(str, from, to)
  end
end
convert_to_py(str, from) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 8
# Public: Converts a romanized string to pinyin (:py).
#
# str  - the String to convert.
# from - a Symbol naming the romanization str is written in.
#
# Anything that is not already :pyn is first converted to :pyn via
# convert_romanization, then handed to Pinyin.convert_pyn_to_pinyin.
#
# Returns the result of Pinyin.convert_pyn_to_pinyin.
def self.convert_to_py(str, from)
  numbered = from == :pyn ? str : convert_romanization(str, from, :pyn)
  Pinyin.convert_pyn_to_pinyin(numbered)
end
convert_to_pyn(str, from) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 13
# Public: Converts a romanized string to numbered pinyin (:pyn).
#
# str  - the String to convert.
# from - a Symbol naming the romanization str is written in.
#
# When the input contained a hyphen, the converted string is passed through
# Romanization::Pinyin.add_hyphens_to_pyn before being returned.
#
# Returns the converted string.
def self.convert_to_pyn(str, from)
  # Snapshot the input so the hyphen check runs against the original text.
  original = str.dup

  numbered =
    if from == :py
      Romanization::Pinyin.convert_pinyin_to_pyn(str)
    else
      convert_romanization(str, from, :pyn)
    end

  hyphenated?(original) ? Romanization::Pinyin.add_hyphens_to_pyn(numbered) : numbered
end
romanization?(str) click to toggle source

Public: Checks the romanization type for the string.

Romanization types are like ducks. If it walks, talks, and acts
like a duck, it is a duck. Therefore, where a String is both
pinyin and another romanization system, it will be identified
as pinyin. If you need to determine whether a py/pyn string
also belongs to another romanization system, use that
romanization module's specific function.

Zhuyin Fuhao, Tongyong Pinyin, Wade Giles, MPS2 or Yale.
http://en.wikipedia.org/wiki/Tongyong_Pinyin
http://pinyin.info/romanization/tongyong/
http://en.wikipedia.org/wiki/Wade%E2%80%93Giles
http://en.wikipedia.org/wiki/Bopomofo
http://pinyin.info/romanization/bopomofo/index.html

str - a String to test.

Examples

romanization?('hao3') #=> :pyn
romanization?('zzzz')   #=> nil

Returns a Symbol for the romanization system (e.g. :pyn) or nil if the string is not a romanization.

# File lib/zhongwen_tools/romanization.rb, line 49
# Public: Checks the romanization type for the string.
#
# str - a String to test.
#
# The detectors are tried in a fixed order and the first one that matches
# wins, so a string accepted by several systems is reported as the earliest
# of them (numbered pinyin, then pinyin, then the others).
#
# Returns a Symbol (:pyn, :py, :bpmf, :wg, :typy, :yale or :mps2), or nil
# when no detector matches.
def self.romanization?(str)
  return :pyn  if ZhongwenTools::Romanization::Pinyin.pyn?(str)
  return :py   if ZhongwenTools::Romanization::Pinyin.py?(str)
  return :bpmf if ZhongwenTools::Romanization::ZhuyinFuhao.bpmf?(str)
  return :wg   if ZhongwenTools::Romanization::WadeGiles.wg?(str)
  return :typy if ZhongwenTools::Romanization::TongyongPinyin.typy?(str)
  return :yale if ZhongwenTools::Romanization::Yale.yale?(str)
  return :mps2 if ZhongwenTools::Romanization::MPS2.mps2?(str)

  nil
end
split(str, type = nil) click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 67
# Public: Splits a romanized string using the splitter for its romanization type.
#
# str  - the String to split.
# type - a Symbol naming the romanization type (default: nil, in which case
#        the type is detected with romanization?(str)).
#
# Returns whatever the selected module's split method returns, or nil when
# the type is not one of :py, :pyn, :bpmf, :wg, :typy, :yale, :mps2.
def self.split(str, type = nil)
  case type || romanization?(str)
  when :py   then ZhongwenTools::Romanization::Pinyin.split_py(str)
  when :pyn  then ZhongwenTools::Romanization::Pinyin.split_pyn(str)
  when :bpmf then ZhongwenTools::Romanization::ZhuyinFuhao.split(str)
  when :wg   then ZhongwenTools::Romanization::WadeGiles.split(str)
  when :typy then ZhongwenTools::Romanization::TongyongPinyin.split(str)
  when :yale then ZhongwenTools::Romanization::Yale.split(str)
  when :mps2 then ZhongwenTools::Romanization::MPS2.split(str)
  end
end

Private Class Methods

convert_romanization(str, from, to) click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 105
# Internal: Converts a string between romanizations token by token.
#
# str  - the String to convert.
# from - a Symbol naming the romanization str is written in.
# to   - a Symbol naming the target romanization.
#
# The string is split into unique tokens; for each token a search/replace
# pair is looked up and applied to the whole (progressively converted)
# string with gsub.
#
# Returns the converted String (str itself when there are no tokens).
def self.convert_romanization(str, from, to)
  # NOTE: extract/refactor tokens cause tests to fail.
  if from == :pyn
    tokens = ZhongwenTools::Romanization::Pinyin.split_pyn(str).uniq
  else
    tokens = romanization_module(from).send(:split, str).uniq
  end

  # Thread the partially converted string through every token replacement.
  tokens.reduce(str) do |converted, token|
    search, replace = find_token_replacement(token, converted, to, from)
    converted.gsub(search, replace)
  end
end
convert_to_other(str, from, to) click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 121
# Internal: Converts a string to a romanization other than :py / :pyn.
#
# str  - the String to convert.
# from - a Symbol naming the romanization str is written in.
# to   - a Symbol naming the target romanization.
#
# Pinyin (:py) input is first turned into numbered pinyin (:pyn) and then
# converted from there. Hyphens are stripped from :bpmf output.
#
# Returns the converted String.
def self.convert_to_other(str, from, to)
  source = str
  source_type = from

  if from == :py
    source = ZhongwenTools::Romanization::Pinyin.convert_pinyin_to_pyn(str)
    source_type = :pyn
  end

  converted = convert_romanization(source, source_type, to)

  to == :bpmf ? converted.delete('-') : converted
end
detect_regex(type) click to toggle source

Internal: Produces a Regexp for a romanization type.

type - a Symbol for the romanization type.

Examples:

detect_regex(:typy) #=> <Regexp>

Returns a Regexp.

# File lib/zhongwen_tools/romanization.rb, line 174
# Internal: Produces a Regexp for a romanization type.
#
# type - a Symbol for the romanization type.
#
# The Regexp is an alternation of romanization_values(type), longest value
# first, and is cached per type in @memoized_detect_regex.
#
# Returns a Regexp.
def self.detect_regex(type)
  @memoized_detect_regex ||= {}
  @memoized_detect_regex[type] ||= begin
    shortest_first = romanization_values(type).sort { |a, b| a.size <=> b.size }
    /#{shortest_first.reverse.join('|')}/
  end
end
detect_romanization(str, regex) click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 89
# Internal: Checks whether a string consists entirely of matches of regex.
#
# str   - the String to test.
# regex - a Regexp that matches the syllables of a romanization.
#
# The string is downcased and stripped of punctuation
# (ZhongwenTools::Regex.punc), the digits 1-5, whitespace, hyphens and
# apostrophes before matching.
#
# Returns true when the concatenated matches equal the normalized string,
# false otherwise.
def self.detect_romanization(str, regex)
  without_punctuation = str.downcase.gsub(ZhongwenTools::Regex.punc, '')
  bare = without_punctuation.gsub(/[1-5\s\-']/, '')
  # TODO: ignore tonal marks from other systems wade giles, tongyong etc.

  bare.scan(regex).join == bare
end
find_token_replacement(token, str, to, from) click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 136
# Internal: Builds the search/replace pair used to convert one token.
#
# token - a String token (syllable), possibly ending in a tone number.
# str   - the String being converted (passed on to fix_capitalization).
# to    - a Symbol naming the target romanization.
# from  - a Symbol naming the source romanization.
#
# The search term is the token with everything from its first 1-5 digit
# onwards removed. When the lookup has no entry for `to`, the search term
# itself is used as the replacement.
#
# Returns a two-element Array: [search, replace].
def self.find_token_replacement(token, str, to, from)
  toneless = token.gsub(/[1-5].*/,'')
  candidate = token_replacement(token, from).fetch(to) { toneless }

  [toneless, fix_capitalization(str, token, candidate)]
end
fix_capitalization(str, token, replace) click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 145
# Internal: Capitalizes the replacement when the original token was not
# all lowercase.
#
# str     - unused by this method.
# token   - the original String token.
# replace - the replacement String.
#
# Returns replace unchanged when token is all lowercase, otherwise
# replace.capitalize.
def self.fix_capitalization(str, token, replace)
  return replace if token.downcase == token

  replace.capitalize
end
hyphenated?(str) click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 206
# Internal: Checks whether a string contains a hyphen.
#
# str - the String to test.
#
# Returns true when str contains at least one "-", false otherwise.
def self.hyphenated?(str)
  str[/\-/] ? true : false
end
romanization_module(type = :py) click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 200
# Internal: Looks up the romanization module that handles a romanization type.
#
# type - a Symbol (or anything responding to to_s) naming the romanization
#        type (default: :py).
#
# The module name is the first element of the first ROMANIZATION_TYPES entry
# whose second element includes type.to_s.
#
# Returns the constant of that name under ZhongwenTools::Romanization.
# Raises ArgumentError if no ROMANIZATION_TYPES entry lists the type
# (previously this surfaced as a NoMethodError on nil).
def self.romanization_module(type = :py)
  type_name = type.to_s
  entry = ROMANIZATION_TYPES.find { |_name, types| types.include?(type_name) }
  raise ArgumentError, "unknown romanization type: #{type.inspect}" if entry.nil?

  ZhongwenTools::Romanization.const_get(entry.first)
end
romanization_values(type) click to toggle source

Internal: Selects the romanization values for a particular romanization type.

type - a Symbol for the romanization type.

Examples:

romanization_values(:typy) #=> ['a', ..., 'r']

Returns an Array that contains the romanization's values.

# File lib/zhongwen_tools/romanization.rb, line 190
# Internal: Selects the romanization values for a particular romanization type.
#
# type - a Symbol for the romanization type.
#
# Each value is turned into a regex fragment whose first character matches
# in either case, e.g. "chih" becomes "[cC]hih". Rows of ROMANIZATIONS_TABLE
# that have no value for the type contribute their :pyn value as-is.
#
# The result is cached per type in @memoized_romanization_values, so repeated
# calls return the same Array object.
#
# Examples:
#
#   romanization_values(:typy) #=> ['[aA]', ..., '[rR]']
#
# Returns an Array that contains the romanization's values.
def self.romanization_values(type)
  # `||=` keeps the cache between calls; plain `=` would wipe it every time.
  cache = (@memoized_romanization_values ||= {})

  cache[type] ||= begin
    patterns = ZhongwenTools::Romanization::ROMANIZATIONS_TABLE.map do |row|
      value = row[type]

      if value.nil?
        # The fallback must be decided before interpolating: an interpolated
        # String is always truthy, so `"..." || row[:pyn]` could never fall back.
        row[:pyn]
      else
        "[#{ value[0] }#{ value[0].upcase }]#{ value[1..-1] }"
      end
    end

    patterns.flatten
  end
end
split_romanization(str, regex) click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 96
# Internal: Splits a romanized string into syllables using a regex.
#
# str   - the String to split.
# regex - a Regexp used with String#scan; the first element of each scan
#         result is taken as the syllable.
#
# Each syllable is stripped of surrounding whitespace and of hyphens; empty
# results are dropped.
#
# Returns an Array of Strings.
def self.split_romanization(str, regex)
  # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
  str.scan(regex).each_with_object([]) do |captures, syllables|
    syllable = captures[0].strip.gsub('-','')
    syllables << syllable unless syllable.empty?
  end
end
token_replacement(token, from = nil) click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 151
# Internal: Finds the ROMANIZATIONS_TABLE row that matches a token.
#
# token - a String token; it is downcased and everything from its first
#         1-5 digit onwards is removed before the lookup.
# from  - a Symbol naming the romanization to match against (default: nil,
#         in which case a row matches if any of its values equals the token).
#
# Returns the first matching row, or an empty Hash when nothing matches.
def self.token_replacement(token, from = nil)
  key = token.downcase.gsub(/[1-5].*/,'')

  match = ROMANIZATIONS_TABLE.find do |row|
    from.nil? ? row.values.include?(key) : row[from] == key
  end

  match || {}
end