module ZhongwenTools::Romanization::Pinyin
Public: methods to convert, detect and split pinyin or
pyn (pinyin with numbers, e.g. hao3).
Public Class Methods
add_hyphens_to_pyn(str)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 108
#
# Public: hyphenates every space-separated word of a string.
#
# str - A String; each space-separated word is handed to split_pyn.
#
# Returns a String in which the pieces split_pyn returned for each word are
# joined with '-' and the words are joined with single spaces.
def self.add_hyphens_to_pyn(str)
  hyphenated_words = str.split(' ').map { |word| split_pyn(word).join('-') }
  hyphenated_words.join(' ')
end
are_all_pyn_syllables_complete?(pyn_arr)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 128
#
# Internal: checks that every element, once its tone digits (1-5) are
# removed, is a known syllable.
#
# pyn_arr - An Array of Strings.
#
# Known syllables are the :pyn entries of ROMANIZATIONS_TABLE plus
# PYN_SYLLABIC_NASALS.
#
# Returns true when every element is known, false otherwise.
def self.are_all_pyn_syllables_complete?(pyn_arr)
  known_syllables = ROMANIZATIONS_TABLE.map { |row| row[:pyn] } + PYN_SYLLABIC_NASALS
  recognized = pyn_arr.count do |syllable|
    known_syllables.include?(syllable.gsub(/[1-5]/, ''))
  end

  recognized == pyn_arr.size
end
capitalized?(str)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 203
#
# Internal: checks whether the first character matched by
# ZhongwenTools::Regex.pinyin_caps differs from its downcased form.
#
# str - A String.
#
# Returns true when that character changes under Caps.downcase, false
# otherwise. Also returns false when the regex finds nothing in str (for
# example an empty string) instead of raising NoMethodError on nil.
def self.capitalized?(str)
  matched = str[ZhongwenTools::Regex.pinyin_caps]
  # No match (nil) or an empty match leaves nothing to inspect.
  return false if matched.nil? || matched.empty?

  first_letter = matched[0]
  first_letter != Caps.downcase(first_letter)
end
convert_pinyin_to_pyn(pinyin)
click to toggle source
Internal: converts real pinyin to pinyin number string.
pinyin - A String
for the pinyin.
Examples
convert_pinyin_to_pyn('Nǐ hǎo ma?') #=> 'Ni3 hao3 ma5?'
Returns a String
in pinyin number format.
# File lib/zhongwen_tools/romanization/pinyin.rb, line 188
#
# Internal: converts tone-marked pinyin into numbered pinyin, one
# space-separated word at a time.
#
# pinyin - A String of pinyin.
#
# Returns a String with the converted words joined by single spaces.
def self.convert_pinyin_to_pyn(pinyin)
  converted_words = pinyin.split(' ').map do |raw_word|
    # NOTE: if a word is upcase, then it will be converted the same
    #       as a word that is only capitalized.
    lowered, was_capitalized = normalize_pinyin(raw_word)
    syllables = split_py(lowered)
    numbered = current_pyn(lowered, syllables)
    recapitalize(numbered, was_capitalized)
  end

  converted_words.join(' ')
end
convert_pyn_to_pinyin(str)
click to toggle source
Internal: Replaces numbered pinyin with actual pinyin. Pinyin syllables
separated by hyphens are combined into one word.
str - A String
to replace with actual pinyin
Examples
convert_pyn_to_pinyin 'Ni3 hao3 ma5?' # => "Nǐ hǎo ma?"
Returns a string with actual pinyin
# File lib/zhongwen_tools/romanization/pinyin.rb, line 245 def self.convert_pyn_to_pinyin(str) regex = Regex.pinyin_num # NOTE: Using gsub is ~8x faster than using scan and each. # NOTE: if it's pinyin without vowels, e.g. m, ng, then convert, # otherwise, check if it needs an apostrophe (http://www.pinyin.info/romanization/hanyu/apostrophes.html). # If it does, add it and then convert. Otherwise, just convert it. # Oh, and if it has double hyphens, replace with one hyphen. # And finally, correct those apostrophes at the very end. # It's like magic. str.gsub(regex) do ($3.nil? ? "#{ PYN_PY[$1] }" : ($2 == '' && %w(a e o).include?($3[0, 1])) ? "'#{ PYN_PY["#{ $3 }#{ $6 }"]}#{ $4 }#{ $5 }" : "#{ $2 }#{ PYN_PY["#{ $3 }#{ $6 }"] }#{ $4 }#{ $5 }") + (($7.to_s.length > 1) ? '-' : '') end.gsub("-'", '-').sub(/^'/, '').gsub(" '", ' ') end
current_pyn(pyn, pinyin_arr)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 209
#
# Internal: replaces each listed syllable found in pyn with the value
# pinyin_replacement returns for it, then removes doubled apostrophes.
#
# pyn        - A String to convert.
# pinyin_arr - An Array of String syllables to look for in pyn.
#
# Returns a new String.
def self.current_pyn(pyn, pinyin_arr)
  replacements = pinyin_arr.each_with_object({}) do |pinyin, table|
    table[pinyin] = pinyin_replacement(pinyin)
  end

  # Regexp.union escapes each syllable, so a regex metacharacter inside one
  # is matched literally. It keeps the alternation order of pinyin_arr and
  # matches nothing for an empty Array.
  pyn.gsub(Regexp.union(pinyin_arr), replacements).gsub("''", '')
end
find_py(str)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 164
#
# Internal: scans str with ZhongwenTools::Regex.find_py_regex and keeps the
# first non-nil capture of every match.
#
# str - A String.
#
# Returns an Array with one entry per match.
def self.find_py(str)
  all_captures = str.scan(ZhongwenTools::Regex.find_py_regex)
  all_captures.map { |groups| groups.compact[0] }
end
normalize_n(pinyin)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 153
#
# Internal: inserts a hyphen between the first and second capture of every
# match, so an "n" followed by a vowel can be split off as the start of the
# next syllable.
#
# pinyin - A String.
#
# Returns a new String.
def self.normalize_n(pinyin)
  # Special Case: split_py("yìnián") # => ["yì", "nián"]
  #               split_py("Xīní")   # => ["Xī", "ní"]
  regex = /#{Regex.only_tones}(n(#{Regex.py_tones['v']}|#{Regex.py_tones['i']}|[iu]|#{Regex.py_tones['e']}|[#{Regex.py_tones['a']}]))/
  pinyin.gsub(regex) do
    found = Regexp.last_match
    "#{found[1]}-#{found[2]}"
  end
end
normalize_n_g(pinyin)
click to toggle source
NOTE: Special case: split_py("fǎnguāng") # => ["fǎn", "guāng"]
In pinyin, sāngēng == sān gēng and sāng'ēng == sāng ēng.
# File lib/zhongwen_tools/romanization/pinyin.rb, line 146
#
# Internal: inserts a hyphen between "n" and a following "g" + vowel
# (o, u, a or e tone classes from Regex.py_tones).
#
# pinyin - A String.
#
# Returns a new String.
def self.normalize_n_g(pinyin)
  regex = /(?<n_part>n)(?<g_part>g(#{Regex.py_tones['o']}|#{Regex.py_tones['u']}|#{Regex.py_tones['a']}|#{Regex.py_tones['e']}))/
  pinyin.gsub(regex) do
    found = Regexp.last_match
    [found[:n_part], found[:g_part]].join('-')
  end
end
normalize_pinyin(pinyin)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 160
#
# Internal: pairs the downcased form of a string with its capitalization
# flag.
#
# pinyin - A String.
#
# Returns a two-element Array: [Caps.downcase(pinyin), capitalized?(pinyin)].
def self.normalize_pinyin(pinyin)
  lowered = Caps.downcase(pinyin)
  was_capitalized = capitalized?(pinyin)
  [lowered, was_capitalized]
end
not_hyphen_regex()
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 120
#
# Internal: memoized Regexp matching a (possibly empty) run of characters
# that are not hyphens.
#
# Returns the same Regexp object on every call.
def self.not_hyphen_regex
  return @not_hyphen_regex if @not_hyphen_regex

  @not_hyphen_regex = /[^\-]*/
end
pinyin_replacement(py)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 216
#
# Internal: swaps the PYN_PY value found inside py for its key and moves the
# first digit of each digit-containing run to the end of that run.
#
# py - A String syllable.
#
# Returns a new String.
def self.pinyin_replacement(py)
  candidates = PYN_PY.values.select { |value| py.include?(value) }
  match = select_pinyin_match(candidates)

  # First PYN_PY entry whose value equals the chosen match; its key is the
  # replacement text.
  entry = PYN_PY.find { |key, value| key if value == match }
  replace = entry[0]

  py.gsub(match, replace).gsub(/([^\d ]*)(\d)([^\d ]*)/) do
    Regexp.last_match(1) + Regexp.last_match(3) + Regexp.last_match(2)
  end
end
py?(str)
click to toggle source
Public: checks if a string is pinyin.
http://en.wikipedia.org/wiki/Pinyin
Examples
py?('nǐ hǎo') # => true
Returns Boolean.
# File lib/zhongwen_tools/romanization/pinyin.rb, line 77
#
# Public: checks if a string is pinyin.
#
# str - A String.
#
# When str has neither a match for Regex.only_tones nor a digit 1-5, the
# decision is delegated to pyn?. Otherwise str is pinyin when nothing but
# whitespace is left after removing punctuation, pinyin syllables, syllabic
# nasals, whitespace and hyphens.
#
# Returns Boolean.
def self.py?(str)
  return pyn?(str) if str[Regex.only_tones].nil? && str[/[1-5]/].nil?

  # TODO: py regex does not include capitals with tones.
  # NOTE: Special Case "fǎnguāng" should be "fǎn" + "guāng"
  removable = /(#{ Regex.punc }|#{ Regex.py }|#{ Regex.py_syllabic_nasals }|[\s\-])/
  separated = str.gsub('ngu', 'n-gu')
  leftover = Caps.downcase(separated).gsub(removable, '').strip
  leftover == ''
end
py_type(romanization)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 138
#
# Internal: maps a romanization name to its canonical symbol.
#
# romanization - Any object; it is compared case-insensitively through
#                to_s.downcase.to_sym.
#
# Returns :pyn for "pyn", :py for "py" or "pinyin", and nil for anything
# else.
def self.py_type(romanization)
  case romanization.to_s.downcase.to_sym
  when :pyn then :pyn
  when :py, :pinyin then :py
  end
end
pyn?(str)
click to toggle source
Public: checks if a string is numbered pinyin (pyn).
Examples
pyn?('pin1-yin1') # => true
Returns Boolean.
# File lib/zhongwen_tools/romanization/pinyin.rb, line 97
#
# Public: checks if a string is numbered pinyin (pyn).
#
# str - A String.
#
# Strings containing the same vowel (a, e, i, o, u) twice in a row are
# rejected at once. Otherwise punctuation, whitespace and hyphens are
# removed, the rest is downcased and split with split_pyn, and the result
# must both rebuild the normalized string and consist of known syllables.
#
# Returns Boolean.
def self.pyn?(str)
  return false if str =~ /a{2,}|e{2,}|i{2,}|o{2,}|u{2,}/

  # FIXME: use strip_punctuation method, e.g. gsub(/\p{Punct}/, '')
  without_punctuation = str.gsub(Regex.punc, '')
  normalized_str = Caps.downcase(without_punctuation.gsub(/[\s\-]/, ''))

  pyn_arr = split_pyn(normalized_str).dup
  # A lone syllabic nasal is not returned by split_pyn; accept it whole.
  if pyn_arr.size == 0 && PYN_SYLLABIC_NASALS.include?(normalized_str.gsub(/[1-5]/, ''))
    pyn_arr << normalized_str
  end

  return false unless pyn_matches_properly?(pyn_arr, normalized_str)

  are_all_pyn_syllables_complete?(pyn_arr)
end
pyn_matches_properly?(pyn_arr, normalized_str)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 124
#
# Internal: checks that the split syllables rebuild the normalized string
# exactly.
#
# pyn_arr        - An Array of String syllables.
# normalized_str - The String the syllables were split from.
#
# Returns true when the syllables concatenated without a separator equal
# normalized_str, false otherwise.
def self.pyn_matches_properly?(pyn_arr, normalized_str)
  rebuilt = pyn_arr.join('')
  rebuilt == normalized_str
end
recapitalize(obj, capitalized)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 169
#
# Internal: re-applies capitalization to a String or to the first element of
# an Array.
#
# obj         - A String or an Array of Strings.
# capitalized - Truthy when capitalization should be applied.
#
# Returns obj untouched when capitalized is falsy. Returns the result of
# Caps.capitalize for a String. Returns a flattened Array with its first
# element capitalized for an Array; an empty Array is returned as is,
# because it has no first element to capitalize. Returns nil for any other
# type when capitalized is truthy.
def self.recapitalize(obj, capitalized)
  return obj unless capitalized

  case obj
  when String
    Caps.capitalize(obj)
  when Array
    return obj if obj.empty?

    [Caps.capitalize(obj[0]), obj[1..-1]].flatten
  end
end
select_pinyin_match(matches)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 227
#
# Internal: picks the candidate with the most bytes.
#
# matches - An Array of candidate Strings.
#
# Returns the longest candidate, or only its first character when the
# candidate starts with an "e" vowel (plain or tone-marked).
def self.select_pinyin_match(matches)
  # take the longest pinyin match. Use bytes because 'è' is prefered over
  # 'n' or 'r' or 'm'
  by_byte_length = matches.sort { |x, y| x.bytesize <=> y.bytesize }
  match = by_byte_length.last

  # Edge case.. en/eng pyn -> py conversion is one way only.
  if match[/^(ē|é|ě|è|e)n?g?/]
    match.chars[0]
  else
    match
  end
end
simple_tone_numbers()
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 116
#
# Internal: memoized Regexp matching a single tone digit (1-5).
#
# Returns the same Regexp object on every call.
def self.simple_tone_numbers
  return @simple_tone_numbers if @simple_tone_numbers

  @simple_tone_numbers = /[1-5]/
end
split_py(str)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 51
#
# Public: splits tone-marked pinyin into syllables.
#
# str - A String of space-separated pinyin words.
#
# Each word is downcased, gets hyphens inserted by normalize_n_g and
# normalize_n, is cut at apostrophes and hyphens, and each piece is searched
# with find_py. Capitalization is restored on the first syllable of a word.
#
# Returns a flat Array of syllables for all words.
def self.split_py(str)
  str.split(' ').flat_map do |raw_word|
    word, is_capitalized = normalize_pinyin(raw_word)
    word = normalize_n(normalize_n_g(word))

    syllables = word.split(/['\-]/).map { |piece| find_py(piece) }

    # NOTE: Special Case split_py('wányìr') # => ['wán', 'yì', 'r']
    trailing_r = word[/(.*[^#{ Regex.py_tones['e'] }.])(r)$/]
    syllables << 'r' if trailing_r

    recapitalize(syllables.flatten, is_capitalized)
  end
end
split_pyn(str)
click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 39
#
# Public: splits numbered (or toneless) pinyin into syllables.
#
# str - A String.
#
# Returns an Array of Strings: for every match, the first capture is
# stripped and cut at its first hyphen.
def self.split_pyn(str)
  # NOTE: This methods is called quite frequently. Unfortunately, it was
  #       slower than it needed to be. After looking into several
  #       optimizations, I ended up settling on one that cached the Regexp
  #       creation.
  # FIXME: ignore punctuation
  regex = if str[simple_tone_numbers].nil?
            Regex.capture_pinyin_toneless
          else
            Regex.pyn_and_pynt
          end

  # NOTE: Fast Ruby: p[/[^\-]*/].to_s is 25% faster than gsub('-', '')
  strip_regex = not_hyphen_regex
  str.scan(regex).map do |captures|
    captures[0].strip[strip_regex].to_s
  end
end