class Ebooks::Model

Attributes

keywords[RW]

@return [Array<String>] The top 200 most important keywords, in descending order

mentions[RW]

@return [Array<Array<Integer>>] Sentences derived from Twitter mentions

sentences[RW]

@return [Array<Array<Integer>>] Sentences represented by arrays of tikis

tokens[RW]

@return [Array<String>] An array of unique tokens. This is the main source of actual strings in the model. Manipulation of a token is done using its index in this array, which we call a “tiki”
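As a rough illustration of how these attributes fit together (the literal values below are hypothetical):

  model.tokens    # => ["the", "cat", "sat"]
  model.sentences # => [[0, 1, 2]]     one sentence, "the cat sat", stored as tikis
  model.keywords  # => ["cat", "sat"]  most important first, up to 200 entries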

Public Class Methods

consume(path)

Generate a new model from a corpus file
@param path [String]
@return [Ebooks::Model]

# File lib/bot_twitter_ebooks/model.rb, line 32
def self.consume(path)
  Model.new.consume(path)
end
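
A minimal usage sketch (the corpus path below is hypothetical):

  model = Ebooks::Model.consume("corpus/my_tweets.txt")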
consume_all(paths)

Generate a new model from multiple corpus files
@param paths [Array<String>]
@return [Ebooks::Model]

# File lib/bot_twitter_ebooks/model.rb, line 39
def self.consume_all(paths)
  Model.new.consume_all(paths)
end
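
Similarly, several corpora can be merged into a single model (paths are hypothetical):

  model = Ebooks::Model.consume_all(["corpus/2019.json", "corpus/2020.csv"])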
load(path)

Load a saved model
@param path [String]
@return [Ebooks::Model]

# File lib/bot_twitter_ebooks/model.rb, line 46
def self.load(path)
  model = Model.new
  model.instance_eval do
    props = Marshal.load(File.open(path, 'rb') { |f| f.read })
    @tokens = props[:tokens]
    @sentences = props[:sentences]
    @mentions = props[:mentions]
    @keywords = props[:keywords]
  end
  model
end
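
A typical load at bot startup might look like this (the path is hypothetical):

  model = Ebooks::Model.load("model/mybot.model")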
new()
# File lib/bot_twitter_ebooks/model.rb, line 101
def initialize
  @tokens = []

  # Reverse lookup tiki by token, for faster generation
  @tikis = {}
end

Public Instance Methods

append(path)

Append a generated model to an existing model file instead of overwriting it
@param path [String]

# File lib/bot_twitter_ebooks/model.rb, line 74
def append(path)
  existing = File.file?(path)
  if !existing
    log "No existing model found at #{path}"
    return
  else
    # Read in and deserialize the existing model
    props = Marshal.load(File.open(path,'rb') { |old| old.read })
    old_tokens = props[:tokens]
    old_sentences = props[:sentences]
    old_mentions = props[:mentions]
    old_keywords = props[:keywords]

    # Append the existing properties to the new ones and overwrite the file with the merged model
    File.open(path, 'wb') do |f|
      f.write(Marshal.dump({
        tokens: @tokens.concat(old_tokens),
        sentences: @sentences.concat(old_sentences),
        mentions: @mentions.concat(old_mentions),
        keywords: @keywords.concat(old_keywords)
      }))
    end
  end
  self
end
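
A sketch of the intended flow, assuming a model file already exists at the given (hypothetical) path: build a model from a new corpus, then merge it into the saved file.

  model = Ebooks::Model.consume("corpus/new_tweets.txt")
  model.append("model/mybot.model")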
consume(path)

Consume a corpus into this model
@param path [String]

# File lib/bot_twitter_ebooks/model.rb, line 139
def consume(path)
  content = File.read(path, :encoding => 'utf-8')

  if path.split('.')[-1] == "json"
    log "Reading json corpus from #{path}"
    lines = JSON.parse(content).map do |tweet|
      tweet['text']
    end
  elsif path.split('.')[-1] == "csv"
    log "Reading CSV corpus from #{path}"
    content = CSV.parse(content)
    header = content.shift
    text_col = header.index('text')
    lines = content.map do |tweet|
      tweet[text_col]
    end
  else
    log "Reading plaintext corpus from #{path} (if this is a json or csv file, please rename the file with an extension and reconsume)"
    lines = content.split("\n").reject { |l| l.start_with?('#') } # Remove commented lines
  end

  consume_lines(lines)
end
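
The corpus format is chosen purely by file extension. A sketch with a hypothetical path:

  # A .json file is parsed as an array of tweets, a .csv file needs a 'text'
  # column, and anything else is read as plaintext ('#' lines are skipped).
  model = Ebooks::Model.new.consume("archive/tweets.json")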
consume_all(paths)

Consume multiple corpuses into this model
@param paths [Array<String>]

# File lib/bot_twitter_ebooks/model.rb, line 199
def consume_all(paths)
  lines = []
  paths.each do |path|
    content = File.read(path, :encoding => 'utf-8')

    if path.split('.')[-1] == "json"
      log "Reading json corpus from #{path}"
      l = JSON.parse(content).map do |tweet|
        tweet['text']
      end
      lines.concat(l)
    elsif path.split('.')[-1] == "csv"
      log "Reading CSV corpus from #{path}"
      content = CSV.parse(content)
      header = content.shift
      text_col = header.index('text')
      l = content.map do |tweet|
        tweet[text_col]
      end
      lines.concat(l)
    else
      log "Reading plaintext corpus from #{path}"
      l = content.split("\n")
      lines.concat(l)
    end
  end
  consume_lines(lines)
end
consume_lines(lines)

Consume a sequence of lines
@param lines [Array<String>]

# File lib/bot_twitter_ebooks/model.rb, line 165
def consume_lines(lines)
  log "Removing rts and sorting mentions"

  statements = []
  mentions = []
  lines.each do |l|
    next if l.start_with?('RT @') || l.start_with?('MT @') # Remove soft retweets

    if l.include?('@')
      mentions << NLP.normalize(l)
    else
      statements << NLP.normalize(l)
    end
  end

  text = statements.join("\n").encode('UTF-8', :invalid => :replace)
  mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)

  lines = nil; statements = nil; mentions = nil # Allow garbage collection

  log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"

  @sentences = mass_tikify(text)
  @mentions = mass_tikify(mention_text)

  log "Ranking keywords"
  @keywords = NLP.keywords(text).top(200).map(&:to_s)
  log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]} #{@keywords[3]} #{@keywords[4]} #{@keywords[5]} #{@keywords[6]} #{@keywords[7]}"

  self
end
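
For illustration, a few hypothetical input lines and where they end up:

  model = Ebooks::Model.new
  model.consume_lines([
    "RT @someone: this retweet is dropped entirely",
    "@friend this line contains a mention and feeds @mentions",
    "an ordinary status, which feeds @sentences"
  ])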
find_relevant(sentences, input)

Find tokenized sentences that are relevant or slightly relevant to the input, by comparing overlaps of non-stopword tokens
@param sentences [Array<Array<Integer>>]
@param input [String]
@return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]

# File lib/bot_twitter_ebooks/model.rb, line 294
def find_relevant(sentences, input)
  relevant = []
  slightly_relevant = []

  tokenized = NLP.tokenize(input).map(&:downcase)

  sentences.each do |sent|
    tokenized.each do |token|
      if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
        relevant << sent unless NLP.stopword?(token)
        slightly_relevant << sent
      end
    end
  end

  [relevant, slightly_relevant]
end
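
A sketch of splitting the mention corpus by relevance to an incoming input (the input text is hypothetical):

  relevant, slightly_relevant = model.find_relevant(model.mentions, "tell me about ruby")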
fix(text)

Correct encoding issues in generated text
@param text [String]
@return [String]

# File lib/bot_twitter_ebooks/model.rb, line 231
def fix(text)
  NLP.htmlentities.decode text
end
make_response(input, limit=140, sentences=@mentions)

Generates a response by looking for related sentences in the corpus and building a smaller generator from these
@param input [String]
@param limit [Integer] characters available for response
@param sentences [Array<Array<Integer>>]
@return [String]

# File lib/bot_twitter_ebooks/model.rb, line 318
def make_response(input, limit=140, sentences=@mentions)
  # Prefer mentions
  relevant, slightly_relevant = find_relevant(sentences, input)

  if relevant.length >= 3
    generator = SuffixGenerator.build(relevant)
    make_statement(limit, generator)
  elsif slightly_relevant.length >= 5
    generator = SuffixGenerator.build(slightly_relevant)
    make_statement(limit, generator)
  elsif sentences.equal?(@mentions)
    make_response(input, limit, @sentences)
  else
    make_statement(limit)
  end
end
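
A usage sketch for replying to a mention (the input text and character budget are hypothetical):

  reply = model.make_response("what do you think of ruby?", 125)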
make_statement(limit=140, generator=nil, retry_limit=10)

Generate some text
@param limit [Integer] available characters
@param generator [SuffixGenerator, nil]
@param retry_limit [Integer] how many times to retry on an invalid tweet
@return [String]

# File lib/bot_twitter_ebooks/model.rb, line 248
def make_statement(limit=140, generator=nil, retry_limit=10)
  responding = !generator.nil?
  generator ||= SuffixGenerator.build(@sentences)

  retries = 0
  tweet = ""

  while (tikis = generator.generate(3, :bigrams)) do
    #log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
    break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)

    retries += 1
    break if retries >= retry_limit
  end

  if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
    #log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
    while (tikis = generator.generate(3, :unigrams)) do
      break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

      retries += 1
      break if retries >= retry_limit
    end
  end

  tweet = NLP.reconstruct(tikis, @tokens)

  if retries >= retry_limit
    log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
  end

  fix tweet
end
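
A minimal usage sketch:

  tweet = model.make_statement(140)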
mass_tikify(text)

Convert a body of text into arrays of tikis
@param text [String]
@return [Array<Array<Integer>>]

# File lib/bot_twitter_ebooks/model.rb, line 124
def mass_tikify(text)
  sentences = NLP.sentences(text)

  sentences.map do |s|
    tokens = NLP.tokenize(s).reject do |t|
      # Don't include usernames/urls as tokens
      t.start_with?('@') || t.downcase.start_with?('http')
    end

    tokens.map { |t| tikify(t) }
  end
end
save(path)

Save model to a file
@param path [String]

# File lib/bot_twitter_ebooks/model.rb, line 60
def save(path)
  File.open(path, 'wb') do |f|
    f.write(Marshal.dump({
      tokens: @tokens,
      sentences: @sentences,
      mentions: @mentions,
      keywords: @keywords
    }))
  end
  self
end
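
A sketch of building and persisting a model in one step (paths are hypothetical):

  Ebooks::Model.consume("corpus/my_tweets.txt").save("model/mybot.model")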
tikify(token)

Reverse lookup the tiki (index) for a token, registering the token in the model if it is new
@param token [String]
@return [Integer]

# File lib/bot_twitter_ebooks/model.rb, line 111
def tikify(token)
  if @tikis.has_key?(token) then
    return @tikis[token]
  else
    # Log a progress message every 1000 tokens
    (@tokens.length+1)%1000 == 0 and puts "#{@tokens.length+1} tokens"
    @tokens << token
    return @tikis[token] = @tokens.length-1
  end
end
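
For illustration, tikis are handed out in insertion order and repeated tokens reuse their existing tiki:

  model = Ebooks::Model.new
  model.tikify("hello")  # => 0
  model.tikify("world")  # => 1
  model.tikify("hello")  # => 0, already known
  model.tokens           # => ["hello", "world"]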
valid_tweet?(tikis, limit)

Check if an array of tikis comprises a valid tweet
@param tikis [Array<Integer>]
@param limit [Integer] how many chars we have left

# File lib/bot_twitter_ebooks/model.rb, line 238
def valid_tweet?(tikis, limit)
  tweet = NLP.reconstruct(tikis, @tokens)
  tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
end
verbatim?(tikis)

Test if a sentence has been copied verbatim from the original corpus
@param tikis [Array<Integer>]
@return [Boolean]

# File lib/bot_twitter_ebooks/model.rb, line 285
def verbatim?(tikis)
  @sentences.include?(tikis) || @mentions.include?(tikis)
end