module ZhongwenTools::Romanization::Pinyin

Public: methods to convert, detect and split pinyin or pyn (pinyin with numbers, e.g. hao3).

Public Class Methods

add_hyphens_to_pyn(str) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 108
# Inserts hyphens between the syllables of every space-separated word.
#
# str - A String of words separated by whitespace; each word is handed to
#       split_pyn to obtain its syllables.
#
# Returns a String in which each word's syllables are joined with '-' and
# the words are joined with a single space.
def self.add_hyphens_to_pyn(str)
  hyphenated_words = str.split(' ').map { |word| split_pyn(word).join('-') }
  hyphenated_words.join(' ')
end
are_all_pyn_syllables_complete?(pyn_arr) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 128
# Checks that every element of pyn_arr is a known pyn syllable.
#
# pyn_arr - An Array of Strings; tone digits 1-5 are removed from each
#           element before it is looked up.
#
# The lookup list is built from the :pyn entries of ROMANIZATIONS_TABLE
# plus PYN_SYLLABIC_NASALS.
#
# Returns true when every element is found (an empty Array yields true),
# false otherwise.
def self.are_all_pyn_syllables_complete?(pyn_arr)
  known_syllables = ROMANIZATIONS_TABLE.map { |row| row[:pyn] } + PYN_SYLLABIC_NASALS

  recognized = pyn_arr.count do |syllable|
    known_syllables.include?(syllable.gsub(/[1-5]/, ''))
  end

  recognized == pyn_arr.size
end
capitalized?(str) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 203
# Reports whether the first character matched by Regex.pinyin_caps is
# upper case, i.e. differs from its Caps.downcase form.
#
# str - A String to inspect.
#
# When the regex matches nothing in str (presumably a word with no pinyin
# letters, such as lone punctuation) there is no letter to test, so the
# answer is false instead of a NoMethodError on nil.
#
# Returns true or false.
def self.capitalized?(str)
  matched = str[ZhongwenTools::Regex.pinyin_caps]
  return false if matched.nil? || matched.empty?

  first_letter = matched[0]
  first_letter != Caps.downcase(first_letter)
end
convert_pinyin_to_pyn(pinyin) click to toggle source

Internal: converts real pinyin to pinyin number string.

pinyin - A String for the pinyin.

Examples

convert_pinyin_to_pyn('Nǐ hǎo ma?') #=> 'Ni3 hao3 ma5?'

Returns a String in pinyin number format.

# File lib/zhongwen_tools/romanization/pinyin.rb, line 188
# Converts tone-marked pinyin into numbered pinyin, word by word.
#
# pinyin - A String of whitespace-separated pinyin words.
#
# Each word is lower-cased (normalize_pinyin), split into syllables
# (split_py), rewritten with tone numbers (current_pyn) and finally
# re-capitalized if the original word was capitalized.
#
# Returns a String with the converted words joined by single spaces.
def self.convert_pinyin_to_pyn(pinyin)
  converted_words = pinyin.split(' ').map do |raw_word|
    # NOTE: a fully upper-case word is treated the same as a word that is
    #       merely capitalized.
    lowered, was_capitalized = normalize_pinyin(raw_word)
    syllables = split_py(lowered)
    numbered = current_pyn(lowered, syllables)

    recapitalize(numbered, was_capitalized)
  end

  converted_words.join(' ')
end
convert_pyn_to_pinyin(str) click to toggle source

Internal: Replaces numbered pinyin with actual pinyin. Pinyin syllables separated by hyphens are combined into one word.

str - A String to replace with actual pinyin

Examples

convert_pyn_to_pinyin 'Ni3 hao3 ma5?' # => "Nǐ hǎo ma?"

Returns a string with actual pinyin

# File lib/zhongwen_tools/romanization/pinyin.rb, line 245
# Replaces numbered pinyin with tone-marked pinyin.
#
# str - A String containing numbered pinyin.
#
# Every match of Regex.pinyin_num (defined elsewhere; seven capture groups
# are read here) is rewritten through the PYN_PY lookup table.
#
# Returns a String.
def self.convert_pyn_to_pinyin(str)
  pyn_regex = Regex.pinyin_num

  # NOTE: Using gsub is ~8x faster than using scan and each.
  converted = str.gsub(pyn_regex) do
    m = Regexp.last_match

    syllable =
      if m[3].nil?
        # Per the original note: pinyin without vowels (e.g. m, ng) is
        # looked up as a whole.
        PYN_PY[m[1]].to_s
      elsif m[2] == '' && %w(a e o).include?(m[3][0, 1])
        # Group 2 is empty and group 3 starts with a, e or o: prefix an
        # apostrophe
        # (http://www.pinyin.info/romanization/hanyu/apostrophes.html).
        "'#{ PYN_PY["#{ m[3] }#{ m[6] }"]}#{ m[4] }#{ m[5] }"
      else
        "#{ m[2] }#{ PYN_PY["#{ m[3] }#{ m[6] }"] }#{ m[4] }#{ m[5] }"
      end

    # A group 7 longer than one character is reduced to a single hyphen
    # (per the original note: double hyphens become one hyphen).
    hyphen = m[7].to_s.length > 1 ? '-' : ''
    syllable + hyphen
  end

  # Finally remove the apostrophes that are not wanted: after a hyphen, at
  # the first line start that has one, and after a space.
  converted.gsub("-'", '-').sub(/^'/, '').gsub(" '", ' ')
end
current_pyn(pyn, pinyin_arr) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 209
# Rewrites the given syllables inside pyn with their pinyin_replacement.
#
# pyn        - The String to rewrite.
# pinyin_arr - An Array of syllable Strings occurring in pyn.
#
# A replacement table (syllable => pinyin_replacement(syllable)) is built
# first; every occurrence of any syllable is then substituted in one pass,
# and any doubled apostrophes ('') are removed from the result.
#
# Returns the rewritten String.
def self.current_pyn(pyn, pinyin_arr)
  replacements = pinyin_arr.each_with_object({}) do |syllable, table|
    table[syllable] = pinyin_replacement(syllable)
  end

  substituted = pyn.gsub(/#{pinyin_arr.join('|')}/, replacements)
  substituted.gsub("''", '')
end
find_py(str) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 164
# Scans str with Regex.find_py_regex and collects one value per match.
#
# str - A String to scan.
#
# For every match, the first non-nil capture group is taken.
#
# Returns an Array with one entry per match.
def self.find_py(str)
  py_regex = ZhongwenTools::Regex.find_py_regex

  str.scan(py_regex).map { |captures| captures.compact.first }
end
normalize_n(pinyin) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 153
# Inserts a hyphen between a match of Regex.only_tones and a following
# "n + vowel" sequence, so the n starts the next syllable.
#
# pinyin - A String of pinyin.
#
# Special cases this is meant to support (from the original notes):
#   split_py("yìnián")   # => ["yì", "nián"]
#   split_py("Xīní")     # => ["Xī", "ní"]
#
# The replacement reads capture groups 1 and 2 of the combined regex;
# NOTE(review): this presumes Regex.only_tones contributes exactly one
# capture group -- confirm against its definition.
#
# Returns the String with "-" inserted at every such boundary.
def self.normalize_n(pinyin)
  n_onset = /#{Regex.only_tones}(n(#{Regex.py_tones['v']}|#{Regex.py_tones['i']}|[iu]|#{Regex.py_tones['e']}|[#{Regex.py_tones['a']}]))/

  pinyin.gsub(n_onset) do
    before = Regexp.last_match(1)
    after = Regexp.last_match(2)
    "#{before}-#{after}"
  end
end
normalize_n_g(pinyin) click to toggle source

NOTE: Special Case split_py("fǎnguāng") # => ["fǎn", "guāng"]

In pinyin, sāngēng == sān gēng and sāng'ēng == sāng ēng
# File lib/zhongwen_tools/romanization/pinyin.rb, line 146
# Inserts a hyphen between "n" and a following "g + vowel" sequence, so
# the g starts the next syllable (e.g. "fǎnguāng" is read as fǎn + guāng).
#
# pinyin - A String of pinyin.
#
# The vowel alternatives come from Regex.py_tones for o, u, a and e.
#
# Returns the String with "n-g" substituted at every such boundary.
def self.normalize_n_g(pinyin)
  n_before_g = /(?<n_part>n)(?<g_part>g(#{Regex.py_tones['o']}|#{Regex.py_tones['u']}|#{Regex.py_tones['a']}|#{Regex.py_tones['e']}))/

  # \k<name> in the replacement string refers back to the named groups.
  pinyin.gsub(n_before_g, '\k<n_part>-\k<g_part>')
end
normalize_pinyin(pinyin) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 160
# Lower-cases pinyin and records whether it was capitalized.
#
# pinyin - A String of pinyin.
#
# Returns a two-element Array: the Caps.downcase form of pinyin, followed
# by the result of capitalized?(pinyin).
def self.normalize_pinyin(pinyin)
  lowered = Caps.downcase(pinyin)
  was_capitalized = capitalized?(pinyin)

  [lowered, was_capitalized]
end
not_hyphen_regex() click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 120
# Memoized Regexp matching a (possibly empty) run of non-hyphen characters.
#
# Returns the same Regexp object on every call.
def self.not_hyphen_regex
  return @not_hyphen_regex if @not_hyphen_regex

  @not_hyphen_regex = /[^\-]*/
end
pinyin_replacement(py) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 216
# Rewrites one tone-marked syllable into its numbered form.
#
# py - A String holding a single pinyin syllable.
#
# The PYN_PY values contained in py are collected, select_pinyin_match
# picks one of them, and the first PYN_PY key mapping to that value is
# substituted for it. Afterwards any digit is moved behind the run of
# non-digit, non-space characters that follows it.
#
# Returns the rewritten String.
def self.pinyin_replacement(py)
  candidates = PYN_PY.values.select { |toned| py.include?(toned) }
  match = select_pinyin_match(candidates)
  numbered = PYN_PY.find { |_key, value| value == match }.first

  py.gsub(match, numbered).gsub(/([^\d ]*)(\d)([^\d ]*)/) do
    m = Regexp.last_match
    m[1] + m[3] + m[2]
  end
end
py?(str) click to toggle source

Public: checks if a string is pinyin.

http://en.wikipedia.org/wiki/Pinyin

Examples

py?('nǐ hǎo')
# => true

Returns Boolean.

# File lib/zhongwen_tools/romanization/pinyin.rb, line 77
# Checks whether str is pinyin.
#
# str - A String to test.
#
# A string with neither a Regex.only_tones match nor a digit 1-5 is
# delegated to pyn?. Otherwise the string counts as pinyin when nothing
# but whitespace is left after removing punctuation, pinyin syllables,
# syllabic nasals, whitespace and hyphens from its lower-cased form.
#
# Returns true or false.
def self.py?(str)
  return pyn?(str) if str[Regex.only_tones].nil? && str[/[1-5]/].nil?

  # TODO: py regex does not include capitals with tones.
  allowed = /(#{ Regex.punc }|#{ Regex.py }|#{ Regex.py_syllabic_nasals }|[\s\-])/

  # NOTE: Special Case "fǎnguāng" should be "fǎn" + "guāng", so the g is
  #       separated from the preceding n before matching.
  separated = str.gsub('ngu', 'n-gu')
  leftover = Caps.downcase(separated).gsub(allowed, '')

  leftover.strip.empty?
end
py_type(romanization) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 138
# Maps a romanization name onto the pinyin type it denotes.
#
# romanization - Any object; its to_s form is compared case-insensitively.
#
# Returns :pyn for "pyn", :py for "py" or "pinyin", and nil for anything
# else.
def self.py_type(romanization)
  case romanization.to_s.downcase.to_sym
  when :pyn then :pyn
  when :py, :pinyin then :py
  end
end
pyn?(str) click to toggle source

Public: checks if a string is pyn (pinyin with tone numbers).

Examples

pyn?('pin1-yin1')
# => true

Returns Boolean.

# File lib/zhongwen_tools/romanization/pinyin.rb, line 97
# Checks whether str is numbered pinyin (pyn).
#
# str - A String to test.
#
# A string with a doubled a, e, i, o or u is rejected outright. Otherwise
# punctuation, whitespace and hyphens are removed, the rest is lower-cased
# and split into syllables; the string passes when the syllables rebuild
# the normalized string exactly and every syllable is a known one.
#
# Returns true or false.
def self.pyn?(str)
  doubled_vowel = /a{2,}|e{2,}|i{2,}|o{2,}|u{2,}/
  return false if doubled_vowel.match?(str)

  # FIXME: use strip_punctuation method, e.g. gsub(/\p{Punct}/, '')
  without_punctuation = str.gsub(Regex.punc, '')
  normalized_str = Caps.downcase(without_punctuation.gsub(/[\s\-]/, ''))

  # Work on a copy of the split result because it may be appended to below.
  pyn_arr = split_pyn(normalized_str).map(&:itself)

  # A lone syllabic nasal yields no syllables from split_pyn; in that case
  # the whole string is taken as the single syllable.
  if pyn_arr.empty? && PYN_SYLLABIC_NASALS.include?(normalized_str.gsub(/[1-5]/, ''))
    pyn_arr << normalized_str
  end

  pyn_matches_properly?(pyn_arr, normalized_str) &&
    are_all_pyn_syllables_complete?(pyn_arr)
end
pyn_matches_properly?(pyn_arr, normalized_str) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 124
# Checks that the syllables, concatenated, reproduce the normalized string.
#
# pyn_arr        - An Array of syllable Strings.
# normalized_str - The String the syllables were split from.
#
# Returns true when joining pyn_arr without a separator equals
# normalized_str, false otherwise.
def self.pyn_matches_properly?(pyn_arr, normalized_str)
  rejoined = pyn_arr.join('')
  rejoined == normalized_str
end
recapitalize(obj, capitalized) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 169
# Restores capitalization on a String or on the first element of an Array.
#
# obj         - A String, or an Array of Strings.
# capitalized - Truthy when capitalization should be applied.
#
# Returns obj itself when capitalized is falsy or obj is an empty Array;
# a capitalized String for a String; a new Array whose first element is
# capitalized for a non-empty Array; nil for any other kind of object.
def self.recapitalize(obj, capitalized)
  return obj unless capitalized

  case obj
  when String
    Caps.capitalize(obj)
  when Array
    # An empty Array has no first element: obj[0] would be nil and
    # obj[1..-1] would be nil as well, so nil would be passed to
    # Caps.capitalize and a nil tail appended. Hand it back untouched.
    return obj if obj.empty?

    [Caps.capitalize(obj[0]), obj[1..-1]].flatten
  end
end
select_pinyin_match(matches) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 227
# Picks the candidate to use from a list of matching pinyin fragments.
#
# matches - An Array of Strings.
#
# The candidate with the most bytes wins; bytes rather than characters are
# compared so that a tone-marked vowel such as 'è' is preferred over a
# plain 'n', 'r' or 'm'.
#
# Returns the chosen String, or only its first character when it starts
# with an e (any tone) optionally followed by n and g -- the en/eng
# pyn -> py conversion is one way only.
def self.select_pinyin_match(matches)
  longest = matches.sort { |left, right| left.bytesize <=> right.bytesize }.last
  return longest unless longest.match?(/^(ē|é|ě|è|e)n?g?/)

  longest.chars.first
end
simple_tone_numbers() click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 116
# Memoized Regexp matching a single tone digit, 1 through 5.
#
# Returns the same Regexp object on every call.
def self.simple_tone_numbers
  return @simple_tone_numbers if @simple_tone_numbers

  @simple_tone_numbers = /[1-5]/
end
split_py(str) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 51
# Splits tone-marked pinyin into its syllables.
#
# str - A String of whitespace-separated pinyin words.
#
# Every word is lower-cased, has hyphens inserted at n/g and n boundaries
# (normalize_n_g, normalize_n), is cut at apostrophes and hyphens, and each
# piece is scanned with find_py. Capitalization of the word's first
# syllable is restored afterwards.
#
# Returns a flat Array of syllable Strings for all words.
def self.split_py(str)
  str.split(' ').flat_map do |raw_word|
    lowered, was_capitalized = normalize_pinyin(raw_word)
    hyphenated = normalize_n(normalize_n_g(lowered))

    syllables = hyphenated.split(/['\-]/).flatten.map { |piece| find_py(piece) }

    # NOTE: Special Case split_py('wányìr')   # => ['wán', 'yì', 'r']
    trailing_r = /(.*[^#{ Regex.py_tones['e'] }.])(r)$/
    syllables << 'r' if hyphenated[trailing_r]

    recapitalize(syllables.flatten, was_capitalized)
  end
end
split_pyn(str) click to toggle source
# File lib/zhongwen_tools/romanization/pinyin.rb, line 39
# Splits numbered (or toneless) pinyin into its syllables.
#
# str - A String of pinyin.
#
# Regex.pyn_and_pynt is used when str contains a tone digit, otherwise
# Regex.capture_pinyin_toneless. From every match the first capture is
# stripped and cut off at its first hyphen.
#
# Returns an Array of Strings.
def self.split_pyn(str)
  # NOTE: This method is called quite frequently. It used to be slower
  #       than it needed to be; of several optimizations considered, the
  #       one kept caches the Regexp creation.
  # FIXME: ignore punctuation
  has_tone_digit = !str[simple_tone_numbers].nil?
  syllable_regex = has_tone_digit ? Regex.pyn_and_pynt : Regex.capture_pinyin_toneless

  # NOTE: Fast Ruby: p[/[^\-]*/].to_s is 25% faster than gsub('-', '')
  before_hyphen = not_hyphen_regex

  str.scan(syllable_regex).map do |captures|
    captures[0].strip[before_hyphen].to_s
  end
end