class Myasorubka::AOT::Dictionary

An MRD file is a text file that contains a morphological dictionary of a natural language. MRD is an abbreviation of “morphological dictionary”.

All words in an MRD file are written in UPPERCASE. An MRD file has the following sections: section of flexion and prefix models, section of accentual models, section of user sessions, section of prefix sets, section of lemmas.

Attributes

accents_offset[R]
filename[R]
language[R]
lemmas_offset[R]
lines[R]
logs_offset[R]
prefixes_offset[R]
rules_offset[R]

Public Class Methods

new(filename, language = nil, ee = nil, ie = Encoding.default_external) click to toggle source

The parser should be initialized by passing filename and language parameters.

# File lib/myasorubka/aot/dictionary.rb, line 19
# Reads the whole MRD file into memory and computes the starting offset of
# every section.
#
# filename:: path of the MRD file to read.
# language:: language of the dictionary (+nil+ by default); the +lemmas+ and
#            +rules+ parsers compare it against +:russian+.
# ee::       external encoding passed to File.readlines (+nil+ by default).
# ie::       internal encoding passed to File.readlines
#            (Encoding.default_external by default).
def initialize(filename, language = nil, ee = nil, ie = Encoding.default_external)
  encoding = { internal_encoding: ie, external_encoding: ee }
  @filename = filename
  @language = language

  # The options are splatted as keywords: since Ruby 3.0 a positional Hash
  # is no longer converted to keyword options and would be taken for the
  # +limit+ argument of File.readlines instead.
  @lines = File.readlines(filename, $/, **encoding)

  # Each offset is derived from the section that precedes it, so the order
  # of these assignments matters: every section accessor memoizes a Section
  # built from the offset that has just been stored.
  # NOTE(review): the "+ 1" presumably skips a one-line header (e.g. a
  # record count) in front of each section -- confirm against Section.
  @rules_offset = 0
  @accents_offset = rules_offset + rules.length + 1
  @logs_offset = accents_offset + accents.length + 1
  @prefixes_offset = logs_offset + logs.length + 1
  @lemmas_offset = prefixes_offset + prefixes.length + 1
end

Public Instance Methods

accents() click to toggle source

Accents section accessor.

# File lib/myasorubka/aot/dictionary.rb, line 93
# Returns the accents Section, building it from +lines+ and
# +accents_offset+ on the first call and reusing it afterwards.
def accents
  return @accents if @accents

  @accents = Section.new(lines, accents_offset)
end
lemmas() click to toggle source

Lemmas section accessor.

# File lib/myasorubka/aot/dictionary.rb, line 111
# Returns the lemmas Section, building it on the first call and reusing it
# afterwards.
#
# The block handed to Section splits a line on whitespace into six fields
# and turns them into
# [stem, rule_id, accent_id, session_id, ancode, prefix_id]:
# a stem of '#' and an ancode or prefix_id of '-' become +nil+, the ids are
# converted with +to_i+ and only the first two characters of the ancode are
# kept.
def lemmas
  @lemmas ||= Section.new(lines, lemmas_offset) do |line|
    stem, rule_id, accent_id, session_id, ancode, prefix_id = line.split

    # For Russian dictionaries the letter Ё/ё is folded into Е/е.
    stem = stem.tr('Ёё', 'Ее') if stem && language == :russian

    [
      stem == '#' ? nil : stem,
      rule_id.to_i,
      accent_id.to_i,
      session_id.to_i,
      ancode == '-' ? nil : ancode[0..1],
      prefix_id == '-' ? nil : prefix_id.to_i
    ]
  end
end
logs() click to toggle source

Logs section accessor.

# File lib/myasorubka/aot/dictionary.rb, line 99
# Returns the logs Section, building it from +lines+ and +logs_offset+ on
# the first call and reusing it afterwards.
def logs
  return @logs if @logs

  @logs = Section.new(lines, logs_offset)
end
prefixes() click to toggle source

Prefixes section accessor.

# File lib/myasorubka/aot/dictionary.rb, line 105
# Returns the prefixes Section, building it from +lines+ and
# +prefixes_offset+ on the first call and reusing it afterwards.
def prefixes
  return @prefixes if @prefixes

  @prefixes = Section.new(lines, prefixes_offset)
end
rules() click to toggle source

Rules section accessor.

# File lib/myasorubka/aot/dictionary.rb, line 73
# Returns the rules Section, building it on the first call and reusing it
# afterwards.
#
# The block handed to Section splits a line on '%' into rule entries,
# ignores the empty ones, and splits every remaining entry on '*' into
# [suffix, ancode, prefix]; only the first two characters of the ancode are
# kept.
def rules
  @rules ||= Section.new(lines, rules_offset) do |line|
    entries = line.split('%').reject(&:empty?)

    entries.map do |entry|
      suffix, ancode, prefix = entry.split('*')

      # For Russian dictionaries the letter Ё/ё is folded into Е/е.
      if language == :russian
        suffix = suffix.tr('Ёё', 'Ее') if suffix
        prefix = prefix.tr('Ёё', 'Ее') if prefix
      end

      [suffix, ancode[0..1], prefix]
    end
  end
end