class Ve::Parse::MecabIpadic
Constants
- BA
- DAIMEISHI
- DE
- DOUSHI
- DOUSHIHIJIRITSUTEKI
- FIRAA
- FUHENKAGATA
- FUKUSHI
- FUKUSHIKA
- FUKUSHIKANOU
- HIJIRITSU
Pos2 and Inflection types
- JINMEI
- JODOUSHI
- JODOUSHIGOKAN
- JOSHI
- KAKARIJOSHI
- KANDOUSHI
- KAZU
- KEIYOUDOUSHIGOKAN
- KEIYOUSHI
- KIGOU
- KOYUUMEISHI
- MEIREI_I
- MEISHI
PoS
- NA
Etc
- NAIKEIYOUSHIGOKAN
- NI
- NN
- PARSER
- RENTAIKA
- RENTAISHI
- SA
- SAHENSETSUZOKU
- SAHEN_SURU
- SETSUBI
- SETSUZOKUJOSHI
- SETSUZOKUSHI
- SETSUZOKUSHITEKI
- SETTOUSHI
- SONOTA
- TAIGENSETSUZOKU
- TE
- TOKUSHU
- TOKUSHU_DA
- TOKUSHU_DESU
- TOKUSHU_MASU
- TOKUSHU_NAI
- TOKUSHU_NU
- TOKUSHU_TA
- TOKUSHU_TAI
Attributes
text[R]
tokens[R]
Public Class Methods
new(text, output)
click to toggle source
# File lib/providers/mecab_ipadic.rb, line 68
#
# Builds the token list for +text+ from the lines of parser output.
#
# text   - the String that was analysed. It is searched to locate every
#          parsed literal and to recover stretches of text that the parser
#          output does not account for.
# output - a collection of output lines that responds to #each_with_index
#          and #length. A line is either "EOS" or is expected to match
#          PARSER (capture 1 is the literal, capture 2 the comma-separated
#          feature list).
#
# Sets @text and @tokens. Each token is a Hash:
#   :sentence_split - produced by an "EOS" line; :literal is ''.
#   :parsed         - :literal plus :pos, :pos2, :pos3, :pos4,
#                     :inflection_type, :inflection_form, :lemma, :reading
#                     and :hatsuon (nil where the feature list is shorter),
#                     and :characters, the character range inside +text+.
#   :unparsed       - text found between or after parsed literals.
# A line that is neither "EOS" nor matched by PARSER yields a token that
# carries only :raw (no :type and no :literal).
def initialize(text, output)
  @tokens = []
  @text = text
  position = 0

  output.each_with_index do |raw_line, index|
    # Strip a copy instead of calling rstrip!: the in-place variant modifies
    # the caller's strings and raises on frozen ones.
    line = raw_line.rstrip
    token = {:raw => line}

    # Anything unparsed at the end of the text
    # This must happen before sentence splits are detected to avoid funny ordering
    if output.length > 1 && output.length == index + 1
      unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)

      if unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1], :raw => ''}
        unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
        @tokens << unparsed_token
      end
    end

    if line =~ %r{^ EOS $}x
      token[:type] = :sentence_split
      token[:literal] = ''
    elsif md = PARSER.match(line)
      # The parsed token
      token[:type] = :parsed
      token[:literal] = md[1]
      info = md[2].split(',')

      [:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
        token[attr] = info[i]
      end

      # Anything unparsed preceding this token
      # NOTE(review): match returns nil when the literal cannot be found in
      # text at or after position, which makes the next line raise
      # NoMethodError. Decide how such drift should be reported.
      unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
      if unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
        unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
        @tokens << unparsed_token
        position += unparsed_token[:literal].length
      end

      token[:characters] = (position..(position+token[:literal].length-1))
      position += token[:literal].length
    else
      # Disaster case: the line is neither EOS nor matched by PARSER.
      # The token is still appended below, carrying only :raw.
    end

    @tokens << token
  end
end
Public Instance Methods
sentences()
click to toggle source
# File lib/providers/mecab_ipadic.rb, line 353
#
# Concatenates token literals from @tokens into sentence strings.
#
# A sentence ends at a :sentence_split token or right after a token whose
# literal is '。' (the '。' stays in the sentence). A :sentence_split always
# appends the text collected so far, even when it is empty. Text left over
# after the last token is appended only when it is non-empty.
#
# Tokens without a :literal (the constructor stores such tokens for output
# lines it cannot parse) are skipped; appending nil to a String would
# raise TypeError.
#
# Returns an Array of Strings.
def sentences
  # TODO: Sentence objects that keep track of the sentence's tokens
  result = []
  current = ''

  @tokens.each do |token|
    if token[:type] == :sentence_split
      result << current
      current = ''
      next
    end

    literal = token[:literal]
    next if literal.nil?

    current << literal
    if literal == '。'
      result << current
      current = ''
    end
  end

  # In case there is no :sentence_split at the end
  result << current if current.length > 0

  result
end
words()
click to toggle source
# File lib/providers/mecab_ipadic.rb, line 173
#
# Groups the :parsed tokens of @tokens into Ve::Word objects.
#
# Each token is classified from its :pos / :pos2 / :pos3 / :inflection_type
# fields and is then handled in one of three ways:
#   * appended to the previous word (attach_to_previous), optionally
#     extending that word's lemma and/or replacing its part of speech;
#   * turned into a new word on its own;
#   * turned into a new word that also swallows the next token (eat_next),
#     optionally extending the lemma with it (eat_lemma).
# Tokens whose part of speech is not recognised become words tagged
# Ve::PartOfSpeech::TBD.
#
# Returns an Array of Ve::Word.
def words
  words = []
  tokens = @tokens.find_all { |t| t[:type] == :parsed }
  tokens = tokens.to_enum
  previous = nil

  # This is becoming very big
  # Kernel#loop ends quietly on StopIteration, which tokens.next raises once
  # the enumerator is exhausted.
  loop do
    token = tokens.next
    pos = nil
    grammar = nil
    eat_next = false
    eat_lemma = true
    attach_to_previous = false
    also_attach_to_lemma = false
    update_pos = false

    case token[:pos]
    when MEISHI
      pos = Ve::PartOfSpeech::Noun

      case token[:pos2]
      when KOYUUMEISHI
        pos = Ve::PartOfSpeech::ProperNoun
      when DAIMEISHI
        pos = Ve::PartOfSpeech::Pronoun
      when FUKUSHIKANOU, SAHENSETSUZOKU, KEIYOUDOUSHIGOKAN, NAIKEIYOUSHIGOKAN
        # NOTE(review): more? is not a core Enumerator method; presumably a
        # project extension provides it - confirm.
        if tokens.more?
          following = tokens.peek
          if following[:inflection_type] == SAHEN_SURU
            pos = Ve::PartOfSpeech::Verb
            eat_next = true
          elsif following[:inflection_type] == TOKUSHU_DA
            pos = Ve::PartOfSpeech::Adjective
            if following[:inflection_form] == TAIGENSETSUZOKU
              eat_next = true
              eat_lemma = false
            end
          elsif following[:inflection_type] == TOKUSHU_NAI
            pos = Ve::PartOfSpeech::Adjective
            eat_next = true
          elsif following[:pos] == JOSHI && following[:literal] == NI
            pos = Ve::PartOfSpeech::Adverb
            eat_next = false
          end
        end
      when HIJIRITSU, TOKUSHU
        if tokens.more?
          following = tokens.peek
          case token[:pos3]
          when FUKUSHIKANOU
            if following[:pos] == JOSHI && following[:literal] == NI
              pos = Ve::PartOfSpeech::Adverb
              eat_next = true
            end
          when JODOUSHIGOKAN
            if following[:inflection_type] == TOKUSHU_DA
              pos = Ve::PartOfSpeech::Verb
              grammar = :auxillary
              if following[:inflection_form] == TAIGENSETSUZOKU
                eat_next = true
              end
            elsif following[:pos] == JOSHI && following[:pos2] == FUKUSHIKA
              pos = Ve::PartOfSpeech::Adverb
              eat_next = true
            end
          when KEIYOUDOUSHIGOKAN
            pos = Ve::PartOfSpeech::Adjective
            if (following[:inflection_type] == TOKUSHU_DA && following[:inflection_form] == TAIGENSETSUZOKU) || following[:pos2] == RENTAIKA
              eat_next = true
            end
          end
        end
      when KAZU
        # TODO: recurse and find following numbers and add to this word.
        # Except non-numbers like 幾
        pos = Ve::PartOfSpeech::Number
        if words.length > 0 && words[-1].part_of_speech == Ve::PartOfSpeech::Number
          attach_to_previous = true
          also_attach_to_lemma = true
        end
      when SETSUBI
        if token[:pos3] == JINMEI
          pos = Ve::PartOfSpeech::Suffix
        else
          if token[:pos3] == TOKUSHU && token[:lemma] == SA
            update_pos = true
            pos = Ve::PartOfSpeech::Noun
          else
            also_attach_to_lemma = true
          end
          attach_to_previous = true
        end
      when SETSUZOKUSHITEKI
        pos = Ve::PartOfSpeech::Conjunction
      when DOUSHIHIJIRITSUTEKI
        pos = Ve::PartOfSpeech::Verb
        grammar = :nominal
      end
    when SETTOUSHI
      # TODO: elaborate this when we have the "main part" feature for words?
      pos = Ve::PartOfSpeech::Prefix
    when JODOUSHI
      pos = Ve::PartOfSpeech::Postposition

      if (previous.nil? || previous[:pos2] != KAKARIJOSHI) &&
         [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU, TOKUSHU_NU].include?(token[:inflection_type])
        attach_to_previous = true
      elsif token[:inflection_type] == FUHENKAGATA && token[:lemma] == NN
        attach_to_previous = true
      elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
        pos = Ve::PartOfSpeech::Verb
      end
    when DOUSHI
      pos = Ve::PartOfSpeech::Verb
      if token[:pos2] == SETSUBI
        attach_to_previous = true
      elsif token[:pos2] == HIJIRITSU && token[:inflection_form] != MEIREI_I
        attach_to_previous = true
      end
    when KEIYOUSHI
      pos = Ve::PartOfSpeech::Adjective
    when JOSHI
      pos = Ve::PartOfSpeech::Postposition
      if token[:pos2] == SETSUZOKUJOSHI && [TE, DE, BA].include?(token[:literal])
        attach_to_previous = true
      end
    when RENTAISHI
      pos = Ve::PartOfSpeech::Determiner
    when SETSUZOKUSHI
      pos = Ve::PartOfSpeech::Conjunction
    when FUKUSHI
      pos = Ve::PartOfSpeech::Adverb
    when KIGOU
      pos = Ve::PartOfSpeech::Symbol
    when FIRAA, KANDOUSHI
      pos = Ve::PartOfSpeech::Interjection
    when SONOTA
      pos = Ve::PartOfSpeech::Other
    else
      # Disaster case: unrecognised :pos. pos stays nil here and becomes
      # Ve::PartOfSpeech::TBD below if a new word is created.
    end

    if attach_to_previous && words.length > 0
      words[-1].tokens << token
      words[-1].word << token[:literal]
      words[-1].extra[:reading] << (token[:reading] || '')
      words[-1].extra[:transcription] << (token[:hatsuon] || '')
      words[-1].lemma << token[:lemma] if also_attach_to_lemma
      words[-1].part_of_speech = pos if update_pos
    else
      pos = Ve::PartOfSpeech::TBD if pos.nil?
      # The reading/transcription strings are extended with << later on, so
      # hand over copies; otherwise those appends could write through to the
      # token's own :reading / :hatsuon strings.
      word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
        :reading => (token[:reading] || '').dup,
        :transcription => (token[:hatsuon] || '').dup,
        :grammar => grammar
      }, {
        :reading_script => :kata,
        :transcription_script => :kata
      })

      if eat_next
        following = tokens.next
        word.tokens << following
        word.word << following[:literal]
        # Same nil fallback as the attach-to-previous path: :reading and
        # :hatsuon are nil when the feature list is short.
        word.extra[:reading] << (following[:reading] || '')
        word.extra[:transcription] << (following[:hatsuon] || '')
        word.lemma << following[:lemma] if eat_lemma
      end

      words << word
    end

    previous = token
  end

  return words
end