class Ebooks::SuffixGenerator

This generator uses data similar to a Markov model, but instead of building a chain by looking up bigrams, it uses token positions to randomly replace a token-array suffix in one sentence with a matching suffix from another.

Public Class Methods

build(sentences) click to toggle source

Build a generator from a corpus of tikified sentences. “Tikis” are token indexes — a way of representing words and punctuation as their integer position in a big array of such tokens. @param sentences [Array<Array<Integer>>] @return [SuffixGenerator]

# File lib/bot_twitter_ebooks/suffix.rb, line 15
# Build a generator from a corpus of tikified sentences.
# Thin factory wrapper: forwards +sentences+ unchanged to SuffixGenerator.new.
# @param sentences [Array<Array<Integer>>]
# @return [SuffixGenerator]
def self.build(sentences)
  SuffixGenerator.new(sentences)
end
new(sentences) click to toggle source
# File lib/bot_twitter_ebooks/suffix.rb, line 19
# Index a corpus of tikified sentences for suffix recombination.
#
# Empty sentences are discarded. Two lookup tables are then built, keyed by
# the preceding tiki (INTERIM stands in for "start of sentence"):
#
# * @unigrams[prev]       => [[sentence_index, position], ...]
# * @bigrams[prev][tiki]  => [[sentence_index, position], ...]
#
# In @unigrams, +position+ is the index of the token that follows +prev+.
# In @bigrams, +position+ is the index of the token that follows the
# (prev, tiki) pair. A sentence's final token is additionally recorded with
# INTERIM as its position, marking that nothing follows it.
#
# @param sentences [Array<Array<Integer>>] tikified corpus
def initialize(sentences)
  @sentences = sentences.reject(&:empty?)
  @unigrams = {}
  @bigrams = {}
  # Report progress against the filtered corpus, which is what +i+ indexes.
  total = @sentences.length

  @sentences.each_with_index do |tikis, i|
    log("Building: sentence #{i} of #{total}") if (i % 10000).zero?

    final = tikis.length - 1
    last_tiki = INTERIM
    tikis.each_with_index do |tiki, j|
      (@unigrams[last_tiki] ||= []) << [i, j]
      followers = ((@bigrams[last_tiki] ||= {})[tiki] ||= [])

      if j == final # Mark sentence endings
        (@unigrams[tiki] ||= []) << [i, INTERIM]
        followers << [i, INTERIM]
      else
        followers << [i, j + 1]
      end

      last_tiki = tiki
    end
  end
end

Public Instance Methods

generate(passes=5, n=:unigrams) click to toggle source

Generate a recombined sequence of tikis. @param passes [Integer] number of times to recombine. @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is). @return [Array<Integer>]

# File lib/bot_twitter_ebooks/suffix.rb, line 55
# Generate a recombined sequence of tikis.
#
# Starts from a randomly chosen corpus sentence. On each pass it collects,
# for every adjacent token pair, the recorded positions in *other* sentences
# that follow the same token (:unigrams) or the same pair (:bigrams), then
# splices in one randomly chosen suffix. A splice is rejected when the result
# would contain, or be contained in, any sentence consulted so far.
#
# The lookup tables are only read here; filtering works on copies so that
# repeated calls see the same model.
#
# @param passes [Integer] maximum number of recombination passes
# @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
# @return [Array<Integer>] the recombined tikis; empty if the corpus is empty
def generate(passes=5, n=:unigrams)
  # rand(0) returns a Float, so an empty corpus has to be handled up front.
  return [] if @sentences.empty?

  index = rand(@sentences.length)
  tikis = @sentences[index]
  used = [index] # Sentences we've already used
  verbatim = [tikis] # Verbatim sentences to avoid reproducing

  passes.times do
    varsites = {} # Map bigram start site => next tiki alternatives

    tikis.each_cons(2).each_with_index do |(tiki, next_tiki), i|
      candidates = if n == :unigrams
        @unigrams[next_tiki]
      else
        (@bigrams[tiki] || {})[next_tiki]
      end
      next if candidates.nil?

      # Filter out sentence-end markers and suffixes from previous sentences.
      # Non-destructive: reject! here would permanently strip the model.
      alternatives = candidates.reject { |a| a[1] == INTERIM || used.include?(a[0]) }
      varsites[i] = alternatives unless alternatives.empty?
    end

    variant = nil
    varsites.to_a.shuffle.each do |start, alternatives|
      alternatives.shuffle.each do |alt|
        source = @sentences[alt[0]]
        verbatim << source
        potential = tikis[0..start + 1] + source[alt[1]..-1]

        # Ensure we're not just rebuilding some segment of another sentence
        next if verbatim.any? { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }

        used << alt[0]
        variant = potential
        break
      end

      break if variant
    end

    # If we failed to produce a variation from any alternative, there
    # is no use running additional passes-- they'll have the same result.
    break if variant.nil?

    tikis = variant
  end

  tikis
end