class EnterRockstar::Corpus::Tokenizer

Takes the downloaded lyrics texts and tokenizes them.

Public Class Methods

new(data_dir:, name:) click to toggle source
# File lib/enter_rockstar/corpus/tokenizer.rb, line 10
# Sets up a tokenizer run.
#
# data_dir:: directory that #tokenize searches for lyrics text files
# name::     label interpolated into the two output file names
def initialize(data_dir:, name:)
  # input location
  @data_dir = data_dir

  # accumulators filled by #tokenize and written out by #save_all
  @stats, @tokens = {}, {}

  # output targets, both derived from the given name
  @output_stats = "lyrics_data/#{name}_stats.json.gz"
  @output_tokens = "lyrics_data/#{name}_tokens.json.gz"

  # language detector queried by #tokenize for every text
  @wl = WhatLanguage.new(:all)
end

Public Instance Methods

save_all() click to toggle source
# File lib/enter_rockstar/corpus/tokenizer.rb, line 55
# Serializes the collected tokens and statistics to JSON and hands each one to
# EnterRockstar::Utils.save_file together with its output path: tokens first,
# statistics second. Returns whatever the second save_file call returns.
def save_all
  outputs = [
    [@output_tokens, @tokens],
    [@output_stats, @stats]
  ]

  results = outputs.map do |path, payload|
    EnterRockstar::Utils.save_file(path, payload.to_json)
  end
  results.last
end
tokenize() click to toggle source
# File lib/enter_rockstar/corpus/tokenizer.rb, line 19
# Reads every +.txt+ file found (recursively) below <tt>@data_dir</tt> and, for
# each text the language detector reports as <tt>:english</tt>, updates two
# accumulators:
#
# * <tt>@stats</tt>  - maps each pair of consecutive tokens to a hash counting
#   how often each token followed that pair
# * <tt>@tokens</tt> - maps a word length (4 or more) to the list of distinct
#   tokens of that length, in order of first appearance
#
# Prints the number of files and advances a progress bar once per file,
# whether or not the file was used. Returns +nil+.
def tokenize
  text_files = Dir.glob("#{@data_dir}/**/*.txt")
  puts "Parsing #{text_files.count} files."
  progressbar = ProgressBar.create(title: 'Progress', total: text_files.count)

  # window size: the first n - 1 tokens form the key, the last one is the continuation
  n = 3

  text_files.each do |filename|
    # File.read never interprets the path as a command (IO.read does so for a
    # leading "|"); scrub replaces invalid byte sequences so that one badly
    # encoded download cannot abort the whole run with an ArgumentError
    text = File.read(filename).scrub

    # Rockstar doesn't really work well with languages other than English
    if @wl.language(text) == :english
      tokenized = _to_tokens(text)

      # save stats which word appears after which one
      tokenized.each_cons(n) do |*head, continuation|
        @stats[head] ||= Hash.new(0)
        @stats[head][continuation] += 1
      end

      # save the words themselves based on what length they are; going through
      # the distinct tokens only keeps the order of first appearance while
      # avoiding a list scan for every repeated word
      tokenized.uniq.each do |token|
        next if token.length < 4 # shorter words are boring anyway

        same_length = (@tokens[token.length] ||= [])
        same_length.push(token) unless same_length.include?(token)
      end
    end

    progressbar.increment
  end
  puts
end

Private Instance Methods

_to_tokens(text) click to toggle source
# File lib/enter_rockstar/corpus/tokenizer.rb, line 62
# Lowercases +text+, cuts it at every run of non-alphabetic characters and
# returns the remaining non-empty words as an array of symbols, in order.
def _to_tokens(text)
  words = text.downcase.split(/[^[[:alpha:]]]+/)

  words.each_with_object([]) do |word, symbols|
    # a leading separator leaves an empty first piece; drop it
    symbols << word.to_sym unless word.empty?
  end
end