class Ve::Parse::MecabIpadic

Constants

BA
DAIMEISHI
DE
DOUSHI
DOUSHIHIJIRITSUTEKI
FIRAA
FUHENKAGATA
FUKUSHI
FUKUSHIKA
FUKUSHIKANOU
HIJIRITSU

Pos2 and Inflection types

JINMEI
JODOUSHI
JODOUSHIGOKAN
JOSHI
KAKARIJOSHI
KANDOUSHI
KAZU
KEIYOUDOUSHIGOKAN
KEIYOUSHI
KIGOU
KOYUUMEISHI
MEIREI_I
MEISHI

PoS

NA

Etc

NAIKEIYOUSHIGOKAN
NI
NN
PARSER
RENTAIKA
RENTAISHI
SA
SAHENSETSUZOKU
SAHEN_SURU
SETSUBI
SETSUZOKUJOSHI
SETSUZOKUSHI
SETSUZOKUSHITEKI
SETTOUSHI
SONOTA
TAIGENSETSUZOKU
TE
TOKUSHU
TOKUSHU_DA
TOKUSHU_DESU
TOKUSHU_MASU
TOKUSHU_NAI
TOKUSHU_NU
TOKUSHU_TA
TOKUSHU_TAI

Attributes

text[R]
tokens[R]

Public Class Methods

new(text, output)
# File lib/providers/mecab_ipadic.rb, line 68
# Builds the token list (@tokens) for +text+ from the lines in +output+.
#
# text   - the String that was analysed; token character ranges refer to it.
# output - a collection of line Strings responding to +each_with_index+ and
#          +length+. A line that is exactly "EOS" becomes a :sentence_split
#          token, a line matching PARSER becomes a :parsed token, and any
#          other line is kept as a token that carries only :raw.
#
# Stretches of +text+ that no parsed token accounts for are emitted as
# :unparsed tokens. The lines in +output+ are never modified.
def initialize(text, output)
  @tokens = []
  @text = text
  position = 0

  output.each_with_index do |raw_line, index|
    # Strip a copy: the caller's strings stay untouched and may be frozen
    line = raw_line.rstrip
    token = {:raw => line}

    # Anything unparsed at the end of the text
    # This must happen before sentence splits are detected to avoid funny ordering
    if output.length > 1 && output.length == index + 1
      unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
      # match returns nil when position lies beyond the end of the text
      if unparsed_md && unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1], :raw => ''}
        unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
        @tokens << unparsed_token
      end
    end

    if line =~ %r{^ EOS $}x
      token[:type] = :sentence_split
      token[:literal] = ''
    elsif md = PARSER.match(line)
      # The parsed token
      token[:type] = :parsed
      token[:literal] = md[1]
      info = md[2].split(',')
      # Missing trailing fields simply end up as nil
      [:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
        token[attr] = info[i]
      end

      # Anything unparsed preceding this token
      unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
      # A nil match means the literal does not occur in the remaining text;
      # carry on best-effort instead of failing on nil[1]
      if unparsed_md && unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
        unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
        @tokens << unparsed_token
        position += unparsed_token[:literal].length
      end

      token[:characters] = (position..(position+token[:literal].length-1))
      position += token[:literal].length
    else
      # Unrecognised line: the token is still recorded, with :raw only
    end

    @tokens << token
  end
end

Public Instance Methods

sentences()
# File lib/providers/mecab_ipadic.rb, line 353
# Concatenates token literals into sentence strings.
#
# A sentence is closed by a :sentence_split token or by a token whose
# literal is '。' (which is kept as the sentence's last character). A
# :sentence_split always closes the current sentence, even an empty one.
# Tokens without a :literal are skipped.
#
# Returns an Array of Strings.
def sentences
  # TODO: Sentence objects that keep track of the sentence's tokens
  sentences = []
  current = ''.dup

  @tokens.each do |token|
    if token[:type] == :sentence_split
      sentences << current
      current = ''.dup
      next
    end

    literal = token[:literal]
    # Tokens built from unrecognised lines carry only :raw; appending nil
    # would raise TypeError
    next if literal.nil?

    current << literal
    if literal == '。'
      sentences << current
      current = ''.dup
    end
  end

  # In case there is no :sentence_split at the end
  sentences << current unless current.empty?

  sentences
end
words()
# File lib/providers/mecab_ipadic.rb, line 173
# Groups the :parsed tokens into Ve::Word objects.
#
# Tokens are visited in order. Depending on its part-of-speech fields a
# token either
# * starts a new word,
# * starts a new word that also swallows the token after it (eat_next), or
# * is appended to the most recently built word (attach_to_previous).
# A token whose part of speech is not recognised gets Ve::PartOfSpeech::TBD.
#
# Returns an Array of Ve::Word.
def words
  words = []
  tokens = @tokens.select { |t| t[:type] == :parsed }.to_enum
  previous = nil

  # This is becoming very big
  # Kernel#loop finishes quietly once tokens.next raises StopIteration
  loop do
    token = tokens.next
    pos = nil
    grammar = nil
    eat_next = false              # swallow the following token into the new word
    eat_lemma = true              # ...and append its lemma as well
    attach_to_previous = false    # append this token to the last word
    also_attach_to_lemma = false  # ...including its lemma
    update_pos = false            # ...and overwrite the last word's part of speech

    case token[:pos]
    when MEISHI
      pos = Ve::PartOfSpeech::Noun

      case token[:pos2]
      when KOYUUMEISHI
        pos = Ve::PartOfSpeech::ProperNoun
      when DAIMEISHI
        pos = Ve::PartOfSpeech::Pronoun
      when FUKUSHIKANOU, SAHENSETSUZOKU, KEIYOUDOUSHIGOKAN, NAIKEIYOUSHIGOKAN
        if tokens.more?
          following = tokens.peek
          if following[:inflection_type] == SAHEN_SURU
            pos = Ve::PartOfSpeech::Verb
            eat_next = true
          elsif following[:inflection_type] == TOKUSHU_DA
            pos = Ve::PartOfSpeech::Adjective
            if following[:inflection_form] == TAIGENSETSUZOKU
              eat_next = true
              eat_lemma = false
            end
          elsif following[:inflection_type] == TOKUSHU_NAI
            pos = Ve::PartOfSpeech::Adjective
            eat_next = true
          elsif following[:pos] == JOSHI && following[:literal] == NI
            pos = Ve::PartOfSpeech::Adverb
            eat_next = false
          end
        end
      when HIJIRITSU, TOKUSHU
        if tokens.more?
          following = tokens.peek
          case token[:pos3]
          when FUKUSHIKANOU
            if following[:pos] == JOSHI && following[:literal] == NI
              pos = Ve::PartOfSpeech::Adverb
              eat_next = true
            end
          when JODOUSHIGOKAN
            if following[:inflection_type] == TOKUSHU_DA
              pos = Ve::PartOfSpeech::Verb
              # The symbol's spelling is a runtime value; left exactly as it was
              grammar = :auxillary
              if following[:inflection_form] == TAIGENSETSUZOKU
                eat_next = true
              end
            elsif following[:pos] == JOSHI && following[:pos2] == FUKUSHIKA
              pos = Ve::PartOfSpeech::Adverb
              eat_next = true
            end
          when KEIYOUDOUSHIGOKAN
            pos = Ve::PartOfSpeech::Adjective
            if (following[:inflection_type] == TOKUSHU_DA && following[:inflection_form] == TAIGENSETSUZOKU) || following[:pos2] == RENTAIKA
              eat_next = true
            end
          end
        end
      when KAZU
        # TODO: recurse and find following numbers and add to this word. Except non-numbers like 幾
        pos = Ve::PartOfSpeech::Number
        if words.length > 0 && words[-1].part_of_speech == Ve::PartOfSpeech::Number
          attach_to_previous = true
          also_attach_to_lemma = true
        end
      when SETSUBI
        if token[:pos3] == JINMEI
          pos = Ve::PartOfSpeech::Suffix
        else
          if token[:pos3] == TOKUSHU && token[:lemma] == SA
            update_pos = true
            pos = Ve::PartOfSpeech::Noun
          else
            also_attach_to_lemma = true
          end
          attach_to_previous = true
        end
      when SETSUZOKUSHITEKI
        pos = Ve::PartOfSpeech::Conjunction
      when DOUSHIHIJIRITSUTEKI
        pos = Ve::PartOfSpeech::Verb
        grammar = :nominal
      end
    when SETTOUSHI
      # TODO: elaborate this when we have the "main part" feature for words?
      pos = Ve::PartOfSpeech::Prefix
    when JODOUSHI
      pos = Ve::PartOfSpeech::Postposition

      if (previous.nil? || previous[:pos2] != KAKARIJOSHI) &&
         [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU, TOKUSHU_NU].include?(token[:inflection_type])
        attach_to_previous = true
      elsif token[:inflection_type] == FUHENKAGATA && token[:lemma] == NN
        attach_to_previous = true
      elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
        pos = Ve::PartOfSpeech::Verb
      end
    when DOUSHI
      pos = Ve::PartOfSpeech::Verb
      if token[:pos2] == SETSUBI
        attach_to_previous = true
      elsif token[:pos2] == HIJIRITSU && token[:inflection_form] != MEIREI_I
        attach_to_previous = true
      end
    when KEIYOUSHI
      pos = Ve::PartOfSpeech::Adjective
    when JOSHI
      pos = Ve::PartOfSpeech::Postposition
      if token[:pos2] == SETSUZOKUJOSHI && [TE, DE, BA].include?(token[:literal])
        attach_to_previous = true
      end
    when RENTAISHI
      pos = Ve::PartOfSpeech::Determiner
    when SETSUZOKUSHI
      pos = Ve::PartOfSpeech::Conjunction
    when FUKUSHI
      pos = Ve::PartOfSpeech::Adverb
    when KIGOU
      pos = Ve::PartOfSpeech::Symbol
    when FIRAA, KANDOUSHI
      pos = Ve::PartOfSpeech::Interjection
    when SONOTA
      pos = Ve::PartOfSpeech::Other
    else
      # Unknown part of speech: pos stays nil and becomes TBD below
    end

    if attach_to_previous && words.length > 0
      words[-1].tokens << token
      words[-1].word << token[:literal]
      words[-1].extra[:reading] << (token[:reading] || '')
      words[-1].extra[:transcription] << (token[:hatsuon] || '')
      words[-1].lemma << token[:lemma] if also_attach_to_lemma
      words[-1].part_of_speech = pos if update_pos
    else
      pos = Ve::PartOfSpeech::TBD if pos.nil?
      # Hand over copies of the reading strings: they are appended to in
      # place later, which must not alter the token's own fields
      word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
        :reading => (token[:reading] || '').dup,
        :transcription => (token[:hatsuon] || '').dup,
        :grammar => grammar
      }, {
        :reading_script => :kata,
        :transcription_script => :kata
      })

      if eat_next
        # eat_next is only ever set after tokens.more? confirmed a successor
        following = tokens.next
        word.tokens << following
        word.word << following[:literal]
        # Tokens may lack reading/pronunciation fields; treat nil as empty,
        # as the attach branch above does
        word.extra[:reading] << (following[:reading] || '')
        word.extra[:transcription] << (following[:hatsuon] || '')
        word.lemma << following[:lemma] if eat_lemma
      end

      words << word
    end

    previous = token
  end

  words
end