class JapaneseNames::Splitter
Provides methods to split a full Japanese name strings into surname and given name.
Public Instance Methods
split(kanji, kana)
click to toggle source
Given a kanji and kana representation of a name splits into to family/given names.
The choice to prioritize family name is arbitrary. Further analysis is needed for whether given or family name should be prioritized.
Returns Array [[kanji_fam, kanji_giv], [kana_fam, kana_giv]] if there was a match. Returns nil if there was no match.
# File lib/japanese_names/splitter.rb, line 13 def split(kanji, kana) return nil unless kanji && kana kanji = kanji.strip kana = kana.strip # Short-circuit: Return last name if it can match the full string if kanji.size <= 3 && kana.size <= 4 full_match = finder.find(kanji).detect { |d| d[0] == kanji && d[1] =~ /\A#{hk kana}\z/ } return [[kanji, nil], [kana, nil]] if full_match end # Partition kanji into candidate n-grams kanji_ngrams = Util::Ngram.ngram_partition(kanji) # Find all possible matches of all kanji n-grams in dictionary dict = finder.find(kanji_ngrams.flatten.uniq) first_lhs_match = nil first_rhs_match = nil kanji_ngrams.each do |kanji_pair| lhs_dict = dict.select { |d| d[0] == kanji_pair[0] } rhs_dict = dict.select { |d| d[0] == kanji_pair[1] } lhs_match = detect_lhs(lhs_dict, kanji, kana) rhs_match = detect_rhs(rhs_dict, kanji, kana) return lhs_match if lhs_match && lhs_match == rhs_match first_lhs_match ||= lhs_match first_rhs_match ||= rhs_match end # As a fallback, return single-sided match prioritizing surname match first first_lhs_match || first_rhs_match end
Private Instance Methods
detect_lhs(dict, kanji, kana)
click to toggle source
# File lib/japanese_names/splitter.rb, line 51 def detect_lhs(dict, kanji, kana) dict_match = dict.select { |d| match_kana_lhs(d, kana) }.sort_by { |m| m[1].size * -1 }.first if dict_match kana_match = match_kana_lhs(dict_match, kana) return [[dict_match[0], Util::Ngram.mask_left(kanji, dict_match[0])], [kana_match, Util::Ngram.mask_left(kana, kana_match)]] end end
detect_rhs(dict, kanji, kana)
click to toggle source
# File lib/japanese_names/splitter.rb, line 60 def detect_rhs(dict, kanji, kana) dict_match = dict.select { |d| match_kana_rhs(d, kana) }.sort_by { |m| m[1].size * -1 }.first if dict_match kana_match = match_kana_rhs(dict_match, kana) return [[Util::Ngram.mask_right(kanji, dict_match[0]), dict_match[0]], [Util::Ngram.mask_right(kana, kana_match), kana_match]] end end
finder()
click to toggle source
# File lib/japanese_names/splitter.rb, line 82 def finder @finder ||= Finder.new end
hk(str)
click to toggle source
Returns a regex string which matches both hiragana and katakana variations of a String.
# File lib/japanese_names/splitter.rb, line 78 def hk(str) "(?:#{Moji.kata_to_hira(str)}|#{Moji.hira_to_kata(str)})" end
match_kana_lhs(dict, kana)
click to toggle source
# File lib/japanese_names/splitter.rb, line 69 def match_kana_lhs(dict, kana) kana[/\A#{hk dict[1]}/] end
match_kana_rhs(dict, kana)
click to toggle source
# File lib/japanese_names/splitter.rb, line 73 def match_kana_rhs(dict, kana) kana[/#{hk dict[1]}\z/] end