module HangulTools

Courtesy of algorithms described at: gernot-katzers-spice-pages.com/var/korean_hangul_unicode.html

Constants

BLENDS
LEADS
TAILS
VOWELS

Public Class Methods

decompose(text) click to toggle source

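Assumes that `text` contains only precomposed hangul syllable codepoints (U+AC00–U+D7A3); no validation is performed, so any other codepoint produces meaningless indices.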

# File lib/hangul_tools.rb, line 22
# Breaks every codepoint of +text+ into positional jamo indices.
#
# Each codepoint is treated as a precomposed hangul syllable: its offset
# from 44032 (U+AC00) is split into blocks of 588 (21 * 28) per lead
# consonant and then blocks of 28 per vowel.
#
# text - String to decompose. Nothing is validated, so codepoints outside
#        the hangul syllable range simply yield out-of-range indices.
#
# Returns an Array of [lead, vowel, tail] Integer triples. lead and vowel
# are 1-based; tail is the 0-based remainder modulo 28.
def self.decompose(text)
  text.each_codepoint.map do |codepoint|
    offset = codepoint - 44032
    lead_index, within_lead = offset.divmod(588)
    vowel_index, tail = within_lead.divmod(28)

    [lead_index + 1, vowel_index + 1, tail]
  end
end
matrices() click to toggle source
# File lib/hangul_tools.rb, line 71
# Loads (once) the romanization tables embedded after the "__END__" line of
# this source file.
#
# The data section is a sequence of "name:" header lines, each followed by
# the rows of that table. Blank lines are skipped; every other line is
# collected and handed to parse_matrix when the next header (or the end of
# the file) is reached.
#
# Returns a Hash of Symbol table name => parsed matrix. The result is
# memoized in @matrices, so the file is only read on the first call.
def self.matrices
  @matrices ||= begin
    file_lines = File.read(__FILE__).lines
    marker = file_lines.index("__END__\n")

    tables = {}
    section = nil
    buffer = nil

    file_lines.drop(marker + 1).each do |row|
      case row
      when /^(\w+):$/
        # A new header closes the previous section, if there was one.
        tables[section.to_sym] = parse_matrix(buffer) if buffer
        section = Regexp.last_match(1)
        buffer = []
      when /^$/
        next
      else
        buffer << row
      end
    end

    # The last section has no following header to flush it.
    tables[section.to_sym] = parse_matrix(buffer) if buffer
    tables
  end
end
parse_matrix(lines) click to toggle source
# File lib/hangul_tools.rb, line 91
# Converts the text rows of one table into a nested lookup Hash.
#
# lines - Array of Strings. The first line is the header: it is split on
#         whitespace, its first field is discarded, and the remaining
#         fields become the column keys. Every following line is a row
#         whose first field is the row key and whose other fields are the
#         cells, matched to the columns by position.
#
# The placeholder "_" becomes nil wherever it appears. The column label
# "final" becomes :final, and the row labels "initial" and "voiced" become
# :initial and :voiced. All other labels and cells stay Strings.
#
# Returns a Hash of row key => { column key => cell }.
def self.parse_matrix(lines)
  header, *rows = lines

  columns = header.split(/\s+/)[1..-1].map do |label|
    case label
    when '_' then nil
    when 'final' then :final
    else label
    end
  end

  row_aliases = { 'initial' => :initial, 'voiced' => :voiced, '_' => nil }

  rows.each_with_object({}) do |row, table|
    label, *cells = row.split(/\s+/)
    values = cells.map { |cell| cell unless cell == '_' }

    table[row_aliases.fetch(label, label)] = Hash[columns.zip(values)]
  end
end
romanize(text, system: :revised, initial: :initial) click to toggle source
# File lib/hangul_tools.rb, line 5
# Romanizes every run of hangul syllables in +text+ and passes all other
# characters through untouched.
#
# text    - String that may mix hangul syllables (U+AC00..U+D7A3) with any
#           other characters.
# system  - Symbol naming the romanization system; forwarded as-is to
#           romanize_with_system (default: :revised).
# initial - Context handed to romanize_with_system for the first chunk of
#           +text+ when that chunk is hangul (default: :initial).
#
# Returns a new String.
def self.romanize(text, system: :revised, initial: :initial)
  # No table lookups happen here: romanize_with_system resolves the tables
  # for +system+ itself, so text without hangul never triggers loading them.
  text.scan(/[\uAC00-\uD7a3]+|[^\uAC00-\uD7a3]+/).map.with_index do |chunk, idx|
    next chunk unless chunk =~ /[\uAC00-\uD7a3]/

    # Only the very first chunk of the text uses the caller-supplied
    # context; every later hangul run is romanized as :voiced.
    romanize_with_system(chunk, system, idx.zero? ? initial : :voiced)
  end.join
end
romanize_with_system(text, system, voiced) click to toggle source
# File lib/hangul_tools.rb, line 32
# Romanizes a run of hangul syllables with the tables of one system.
#
# text   - String passed to decompose (expected to be hangul only).
# system - key used to pick the entry from matrices, VOWELS and BLENDS.
# voiced - row key of the matrix used for the lead consonant of the first
#          syllable (callers in this file pass :initial or :voiced).
#
# For every syllable the lead consonant is looked up in the matrix row
# keyed by the previous syllable's tail (or by +voiced+ for the first
# syllable), followed by the vowel. Only the last syllable emits its own
# tail, via the :final column. Missing rows or cells contribute nothing.
# Finally each BLENDS pattern/replacement pair is applied in order.
#
# Returns the romanized String.
def self.romanize_with_system(text, system, voiced)
  table = matrices[system]
  vowel_sounds = VOWELS[system]
  substitutions = BLENDS[system]

  syllables = decompose(text)
  sounds = []

  syllables.each_with_index do |(lead, vowel, tail), position|
    context = position.zero? ? voiced : TAILS[syllables[position - 1][2].to_i]

    sounds.push((table[context] || {})[LEADS[lead]], vowel_sounds[vowel])

    # Non-final tails are rendered together with the next syllable's lead.
    next if syllables[position + 1]

    sounds.push((table[TAILS[tail]] || {})[:final])
  end

  substitutions.reduce(sounds.compact.join) do |romanized, (pattern, replacement)|
    romanized.gsub(pattern, replacement)
  end
end