module ZhongwenTools::Romanization
Public: Romanization
converts, detects and splits different romanizations.
Public: Romanization
converts to pinyin and pyn.
Constants
- PYN_PY
- PYN_SYLLABIC_NASALS
- ROMANIZATIONS_TABLE
TODO: remove excess values, i.e. keys whose value == :pyn. TODO: see en.wikipedia.org/wiki/Jyutping. TODO: see en.wikipedia.org/wiki/Simplified_Wade.
- ROMANIZATION_TYPES
Public Class Methods
convert(str, to, from)
click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 14
# Public: Converts a romanized String from one romanization type to another.
#
# str  - the String to convert.
# to   - a Symbol for the target romanization type.
# from - a Symbol for the romanization type str is written in.
#
# Returns str unchanged when to == from, otherwise the result of the
# converter selected by to.
def self.convert(str, to, from)
  # NOTE: don't convert if it already is converted.
  return str if to == from

  case to
  when :py  then convert_to_py(str, from)
  when :pyn then convert_to_pyn(str, from)
  else convert_to_other(str, from, to)
  end
end
convert_to_py(str, from)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 8
# Converts a romanized String to pinyin.
#
# str  - the String to convert.
# from - a Symbol for the romanization type str is written in.
#
# Anything that is not already :pyn is first converted to :pyn, and the
# :pyn form is then handed to Pinyin.convert_pyn_to_pinyin.
#
# Returns whatever Pinyin.convert_pyn_to_pinyin returns.
def self.convert_to_py(str, from)
  pyn = from == :pyn ? str : convert_romanization(str, from, :pyn)
  Pinyin.convert_pyn_to_pinyin(pyn)
end
convert_to_pyn(str, from)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 13
# Converts a romanized String to pyn.
#
# str  - the String to convert.
# from - a Symbol for the romanization type str is written in.
#
# Returns the converted String; hyphens are re-added (via
# Romanization::Pinyin.add_hyphens_to_pyn) when the input contained any.
def self.convert_to_pyn(str, from)
  # Copy the input up front so the hyphen check below looks at the text as
  # it was before conversion.
  snapshot = str.dup

  converted =
    if from == :py
      Romanization::Pinyin.convert_pinyin_to_pyn(str)
    else
      convert_romanization(str, from, :pyn)
    end

  return converted unless hyphenated?(snapshot)

  Romanization::Pinyin.add_hyphens_to_pyn(converted)
end
romanization?(str)
click to toggle source
Public: Checks the romanization type for the string.
Romanization types are like ducks. If it walks, talks, and acts like a duck, it is a duck. Therefore, where a String is both pinyin and another romanization system, it will be identified as pinyin. If you need to determine whether a py/pyn string also belongs to another romanization system (Zhuyin Fuhao, Tongyong Pinyin, Wade Giles, MPS2 or Yale), use that romanization module's specific function. http://en.wikipedia.org/wiki/Tongyong_Pinyin http://pinyin.info/romanization/tongyong/ http://en.wikipedia.org/wiki/Wade%E2%80%93Giles http://en.wikipedia.org/wiki/Bopomofo http://pinyin.info/romanization/bopomofo/index.html str - a String to test.
Examples
romanization?('hao3') #=> :pyn romanization?('zzzz') #=> nil
Returns a Symbol
for the romanization system or nil if the string is not a romanization.
# File lib/zhongwen_tools/romanization.rb, line 49
# Public: Checks the romanization type for the string.
#
# The detectors are tried in a fixed order and the first one that accepts
# the string wins, so a string valid in several systems is reported as the
# earliest of them (pyn, then py, bpmf, wg, typy, yale, mps2).
#
# str - a String to test.
#
# Examples
#
#   romanization?('hao3') #=> :pyn
#   romanization?('zzzz') #=> nil
#
# Returns a Symbol for the romanization system, or nil if no detector
# accepts the string.
def self.romanization?(str)
  return :pyn  if ZhongwenTools::Romanization::Pinyin.pyn?(str)
  return :py   if ZhongwenTools::Romanization::Pinyin.py?(str)
  return :bpmf if ZhongwenTools::Romanization::ZhuyinFuhao.bpmf?(str)
  return :wg   if ZhongwenTools::Romanization::WadeGiles.wg?(str)
  return :typy if ZhongwenTools::Romanization::TongyongPinyin.typy?(str)
  return :yale if ZhongwenTools::Romanization::Yale.yale?(str)
  return :mps2 if ZhongwenTools::Romanization::MPS2.mps2?(str)

  nil
end
split(str, type = nil)
click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 67
# Public: Splits a romanized String using the splitter of its romanization
# type.
#
# str  - the String to split.
# type - a Symbol for the romanization type (default: nil, in which case
#        the type is detected with romanization?).
#
# Returns the result of the type's splitter, or nil when the type is not
# one of the known romanization types.
def self.split(str, type = nil)
  type ||= romanization?(str)

  case type
  when :py   then ZhongwenTools::Romanization::Pinyin.split_py(str)
  when :pyn  then ZhongwenTools::Romanization::Pinyin.split_pyn(str)
  when :bpmf then ZhongwenTools::Romanization::ZhuyinFuhao.split(str)
  when :wg   then ZhongwenTools::Romanization::WadeGiles.split(str)
  when :typy then ZhongwenTools::Romanization::TongyongPinyin.split(str)
  when :yale then ZhongwenTools::Romanization::Yale.split(str)
  when :mps2 then ZhongwenTools::Romanization::MPS2.split(str)
  end
end
Private Class Methods
convert_romanization(str, from, to)
click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 105
# Internal: Converts a String between romanization types token by token.
#
# str  - the String to convert.
# from - a Symbol for the romanization type str is written in.
# to   - a Symbol for the target romanization type.
#
# The string is split into its distinct tokens; each token's search text is
# then replaced throughout the string, one token after another, so later
# replacements operate on the output of earlier ones.
#
# Returns the converted String.
def self.convert_romanization(str, from, to)
  # NOTE: extract/refactor tokens cause tests to fail.
  pieces =
    if from == :pyn
      ZhongwenTools::Romanization::Pinyin.split_pyn(str)
    else
      romanization_module(from).send(:split, str)
    end

  pieces.uniq.each do |piece|
    search, replace = find_token_replacement(piece, str, to, from)
    str = str.gsub(search, replace)
  end

  str
end
convert_to_other(str, from, to)
click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 121
# Internal: Converts a String to a romanization type other than :py/:pyn.
#
# str  - the String to convert.
# from - a Symbol for the romanization type str is written in.
# to   - a Symbol for the target romanization type.
#
# Pinyin input is first rewritten as :pyn, then converted from there.
#
# Returns the converted String, with every '-' removed when to is :bpmf.
def self.convert_to_other(str, from, to)
  str, from = ZhongwenTools::Romanization::Pinyin.convert_pinyin_to_pyn(str), :pyn if from == :py

  converted = convert_romanization(str, from, to)
  to == :bpmf ? converted.gsub('-', '') : converted
end
detect_regex(type)
click to toggle source
Internal: Produces a Regexp for a romanization type.
type - a Symbol for the romanization type.
Examples:
detect_regex(:typy) #=> <Regexp>
Returns a Regexp.
# File lib/zhongwen_tools/romanization.rb, line 174
# Internal: Produces a Regexp for a romanization type.
#
# type - a Symbol for the romanization type.
#
# The Regexp is an alternation of the type's romanization values ordered
# from longest to shortest, so longer values are tried before shorter ones.
# Each built Regexp is cached per type in @memoized_detect_regex.
#
# Returns a Regexp.
def self.detect_regex(type)
  @memoized_detect_regex ||= {}

  cached = @memoized_detect_regex[type]
  return cached if cached

  shortest_first = romanization_values(type).sort{|x,y| x.size <=> y.size}
  @memoized_detect_regex[type] = /#{shortest_first.reverse.join('|')}/
end
detect_romanization(str, regex)
click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 89
# Internal: Tests whether a String consists entirely of matches of regex.
#
# str   - the String to test.
# regex - a Regexp matching the syllables of a romanization type.
#
# The string is lower-cased and stripped of punctuation
# (ZhongwenTools::Regex.punc), the digits 1-5, whitespace, hyphens and
# apostrophes before matching.
#
# Returns true when the matches, joined together, reproduce the whole
# normalized string; false otherwise.
def self.detect_romanization(str, regex)
  without_punc = str.downcase.gsub(ZhongwenTools::Regex.punc, '')
  normalized_str = without_punc.gsub(/[1-5\s\-']/, '')

  # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
  matched_text = normalized_str.scan(regex).join
  matched_text == normalized_str
end
find_token_replacement(token, str, to, from)
click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 136
# Internal: Works out what to search for and what to substitute for a token.
#
# token - the String token being converted.
# str   - the String the token came from (passed on to fix_capitalization).
# to    - a Symbol for the target romanization type.
# from  - a Symbol for the romanization type the token is written in.
#
# The search text is the token with everything from its first 1-5 digit
# onwards removed. When no replacement exists for to, the search text itself
# is used as the replacement.
#
# Returns a two-element Array: [search, replace].
def self.find_token_replacement(token, str, to, from)
  search = token.gsub(/[1-5].*/,'')
  candidates = token_replacement(token, from)
  replacement = candidates.fetch(to){ search }

  [search, fix_capitalization(str, token, replacement)]
end
fix_capitalization(str, token, replace)
click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 145
# Internal: Carries a token's capitalization over to its replacement.
#
# str     - the String the token came from (not used here).
# token   - the original String token.
# replace - the replacement String for the token.
#
# Returns replace unchanged when token is entirely lower-case, otherwise
# replace.capitalize.
def self.fix_capitalization(str, token, replace)
  return replace if token.downcase == token

  replace.capitalize
end
hyphenated?(str)
click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 206
# Internal: Checks whether a String contains a hyphen.
#
# str - the String to check.
#
# Returns true if str contains at least one '-', false otherwise.
def self.hyphenated?(str)
  str[/\-/] ? true : false
end
romanization_module(type = :py)
click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 200
# Internal: Looks up the module that implements a romanization type.
#
# type - a Symbol (or anything responding to to_s) naming the romanization
#        type (default: :py).
#
# ROMANIZATION_TYPES is searched for the first entry whose value includes
# the type's name; that entry's key is resolved as a constant under
# ZhongwenTools::Romanization.
#
# Returns the constant found under ZhongwenTools::Romanization.
# Raises ArgumentError if no entry of ROMANIZATION_TYPES lists the type
# (previously this surfaced as a NoMethodError on nil).
def self.romanization_module(type = :py)
  type_name = type.to_s
  entry = ROMANIZATION_TYPES.find { |_k, v| v.include?(type_name) }
  raise ArgumentError, "unknown romanization type: #{type.inspect}" if entry.nil?

  ZhongwenTools::Romanization.const_get(entry.first)
end
romanization_values(type)
click to toggle source
Internal: Selects the romanization values for a particular romanization type.
type - a Symbol for the romanization type.
Examples:
romanization_values(:typy) #=> ['[aA]', ..., '[rR]']
Returns an Array of regex-fragment Strings, one per romanization value, each matching the value with its first letter in either case.
# File lib/zhongwen_tools/romanization.rb, line 190
# Internal: Selects the romanization values for a particular romanization
# type, as regex fragments.
#
# type - a Symbol for the romanization type.
#
# Each ROMANIZATIONS_TABLE entry contributes one fragment of the form
# "[xX]rest", i.e. its value with the first character matched in either
# case. An entry with no value for type falls back to its :pyn value;
# an entry with neither (or an empty value) is skipped.
#
# Examples:
#
#   romanization_values(:typy) #=> ['[aA]', ..., '[rR]']
#
# Returns an Array of Strings. The Array is cached per type and the same
# object is returned on later calls, so callers should not mutate it.
def self.romanization_values(type)
  # `||=` keeps the cache between calls; plain `=` here used to discard it
  # every time, defeating the memoization.
  @memoized_romanization_values ||= {}
  @memoized_romanization_values[type] ||= begin
    fragments = ZhongwenTools::Romanization::ROMANIZATIONS_TABLE.map do |r|
      # Choose the fallback before building the fragment: an interpolated
      # String is always truthy, so `"..." || r[:pyn]` could never fall back.
      value = (r[type] || r[:pyn]).to_s
      next if value.empty?

      "[#{ value[0] }#{ value[0].upcase }]#{ value[1..-1] }"
    end
    fragments.compact
  end
end
split_romanization(str, regex)
click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 96
# Internal: Splits a String into tokens using a Regexp.
#
# str   - the String to split.
# regex - a Regexp to scan str with; the first element of each scan result
#         is taken as the token (presumably the first capture group).
#
# Each token is stripped of surrounding whitespace and of all hyphens;
# tokens that end up empty are dropped.
#
# Returns an Array of Strings.
def self.split_romanization(str, regex)
  # TODO: ignore tonal marks from other systems wade giles, tongyong etc.
  tokens = str.scan(regex).map { |captures| captures[0].strip.gsub('-','') }

  tokens.flatten.reject(&:empty?)
end
token_replacement(token, from = nil)
click to toggle source
# File lib/zhongwen_tools/romanization.rb, line 151
# Internal: Finds the ROMANIZATIONS_TABLE entry for a token.
#
# token - the String token to look up; it is lower-cased and cut off at its
#         first 1-5 digit before the lookup.
# from  - a Symbol for the romanization type of the token (default: nil).
#         When nil, an entry matches if any of its values equals the token;
#         otherwise only the entry's value for from is compared.
#
# Returns the first matching entry, or an empty Hash when none matches.
def self.token_replacement(token, from = nil)
  bare = token.downcase.gsub(/[1-5].*/,'')

  matches =
    if from.nil?
      ->(row) { row.values.include?(bare) }
    else
      ->(row) { row[from] == bare }
    end

  ROMANIZATIONS_TABLE.find { |row| matches.call(row) } || {}
end