class ChainPunk::Corpus

Attributes

frequency_table[R]
seeds[R]

Public Class Methods

new(text, options = {}) click to toggle source
# File lib/chain_punk/corpus.rb, line 7
# Creates a corpus and immediately trains it on the given text.
#
# @param text    the source text, handed unchanged to #train
# @param options [Hash] handed unchanged to #train, which reads the
#   :exclusions, :closures, :boundaries and :index_size keys
def initialize(text, options = {})
  # All of the work happens in #train; construction adds nothing else.
  train(text, options)
end

Public Instance Methods

train(text, options = {}) click to toggle source
# File lib/chain_punk/corpus.rb, line 11
# Runs the full training pipeline over +text+ and stores the outcome in
# @frequency_table and @seeds (replacing whatever was stored before).
#
# Pipeline stages, each driven by one option key:
#   :exclusions -> remove_exclusions
#   :closures   -> process_sets
#   :boundaries -> process_phrases
#   :index_size -> process_graphemes
#
# @param text    the source text to train on
# @param options [Hash] per-stage settings; missing keys are passed on as nil
# @return the value returned by process_graphemes (table and seeds)
def train(text, options = {})
  without_exclusions = remove_exclusions(text, options[:exclusions])
  phrase_sets = process_sets(without_exclusions, options[:closures])
  tokenized = process_phrases(phrase_sets, options[:boundaries])

  table_and_seeds = process_graphemes(tokenized, options[:index_size])
  @frequency_table, @seeds = table_and_seeds
end

Private Instance Methods

process_graphemes(grapheme_phrases, index_size = 1, frequency_table = {}, seeds = []) click to toggle source
# File lib/chain_punk/corpus.rb, line 52
# Builds the frequency table and the seed list from phrases of graphemes.
#
# For every phrase the leading +index_size+ graphemes are recorded as a seed.
# Then, for each position that still has more than +index_size+ graphemes
# remaining, the window of +index_size+ graphemes at that position becomes a
# key, and the (up to) +index_size+ graphemes following the window are
# appended to that key's list of followers.
#
# Phrases are read by offset and are never modified, so callers keep their
# arrays intact (frozen arrays are accepted too).
#
# @param grapheme_phrases [Array<Array>] phrases already split into graphemes
# @param index_size [Integer, nil] window length; nil is treated as 1
# @param frequency_table [Hash] table to add to (updated in place)
# @param seeds [Array] seed list to add to (updated in place)
# @return [Array] two elements: the frequency table and the seeds
# @raise [ArgumentError] if index_size is negative
def process_graphemes(grapheme_phrases, index_size = 1, frequency_table = {}, seeds = [])
  index_size ||= 1
  # A negative window can never be satisfied and would otherwise loop forever.
  raise ArgumentError, "index_size must not be negative (got #{index_size})" if index_size.negative?

  grapheme_phrases.each do |phrase|
    seeds << phrase[0, index_size]

    # One entry per offset that leaves at least one grapheme after the window.
    (0...(phrase.size - index_size)).each do |offset|
      key = phrase[offset, index_size]
      (frequency_table[key] ||= []) << phrase[offset + index_size, index_size]
    end
  end

  [frequency_table, seeds]
end
process_phrases(phrases, boundaries = nil) click to toggle source
# File lib/chain_punk/corpus.rb, line 36
# Splits every phrase into its graphemes via split_phrase.
#
# The input array is only read; it is not emptied or otherwise modified, so
# the caller can keep using it afterwards.
#
# @param phrases [Array] phrases to split, processed in order
# @param boundaries passed through to split_phrase for each phrase
# @return [Array] one split_phrase result per input phrase, in input order
def process_phrases(phrases, boundaries = nil)
  phrases.map { |phrase| split_phrase(phrase, boundaries) }
end
process_sets(text, closures = nil) click to toggle source
# File lib/chain_punk/corpus.rb, line 30
# Breaks +text+ into phrase sets at the given closures.
#
# @param text [String] the text to break up
# @param closures patterns accepted by Regexp.union; nil means "do not split"
# @return [Array<String>] the whole text as a single element when closures is
#   nil, otherwise the non-empty pieces found between closures
def process_sets(text, closures = nil)
  if closures.nil?
    [text]
  else
    delimiter = Regexp.union(closures)
    pieces = text.split(delimiter)
    pieces.reject { |piece| piece.empty? }
  end
end
remove_exclusions(text, exclusions = nil) click to toggle source
# File lib/chain_punk/corpus.rb, line 20
# Strips every exclusion from +text+.
#
# Exclusions are applied one after another, each to the result of the
# previous one. The original string is left untouched because gsub returns a
# new string.
#
# @param text [String] the text to clean
# @param exclusions patterns accepted by String#gsub; nil means "remove nothing"
# @return [String] the text with all matches of every exclusion removed
def remove_exclusions(text, exclusions = nil)
  return text if exclusions.nil?

  exclusions.reduce(text) { |cleaned, pattern| cleaned.gsub(pattern, '') }
end
split_phrase(phrase, boundaries = nil) click to toggle source
# File lib/chain_punk/corpus.rb, line 46
# Splits a single phrase into its graphemes.
#
# The phrase is converted with to_s on both paths, so nil and other
# non-String values behave the same whether or not boundaries are given
# (nil yields an empty array either way).
#
# @param phrase the phrase to split; converted with to_s first
# @param boundaries patterns accepted by Regexp.union; nil means "split into
#   individual characters"
# @return [Array<String>] the characters of the phrase when boundaries is nil,
#   otherwise the non-empty pieces found between boundaries
def split_phrase(phrase, boundaries = nil)
  content = phrase.to_s
  return content.chars if boundaries.nil?

  content.split(Regexp.union(boundaries)).reject(&:empty?)
end