class Lurn::Text::BernoulliVectorizer

Converts text documents into binary (Bernoulli) feature vectors: after fitting, each document is represented as an array of booleans marking which vocabulary terms it contains.

Attributes

tokenizer[RW]

  The tokenizer used to split documents into arrays of tokens.

vocabulary[RW]

  The terms learned by fit; transform emits one boolean feature per term.

Public Class Methods

new(options = {})

Builds a new vectorizer. options may include :tokenizer (defaults to a new WordTokenizer), plus :min_df and :max_df document-frequency bounds, which default to 0 and 50.

# File lib/lurn/text/bernoulli_vectorizer.rb, line 8
def initialize(options = {})
  @tokenizer = options[:tokenizer] || WordTokenizer.new
  @vocabulary = []

  options[:max_df] ||= 50
  options[:min_df] ||= 0
  @options = options
end
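
A minimal construction sketch (assuming the gem is loaded via require 'lurn' and that :min_df/:max_df are absolute document counts, as the defaults above suggest):

  require 'lurn'

  # Default tokenizer, custom document-frequency bounds.
  vectorizer = Lurn::Text::BernoulliVectorizer.new(min_df: 1, max_df: 20)

  # A tokenizer instance can also be supplied explicitly.
  custom = Lurn::Text::BernoulliVectorizer.new(tokenizer: Lurn::Text::WordTokenizer.new)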

Public Instance Methods

fit(documents)

Learns the vocabulary: every unique token across documents is collected, sorted, and then filtered by the configured document-frequency bounds.

# File lib/lurn/text/bernoulli_vectorizer.rb, line 17
def fit(documents)
  @vocabulary = []
  tokenized_docs = tokenize_documents(documents)
  @vocabulary = tokenized_docs.flatten(1).uniq.sort
  reduce_features(tokenized_docs)
end
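
For example, fitting on a small hypothetical corpus (the exact terms retained depend on how WordTokenizer normalizes text):

  vectorizer = Lurn::Text::BernoulliVectorizer.new
  vectorizer.fit([
    'the cat sat on the mat',
    'the dog sat on the log',
    'cats and dogs'
  ])

  vectorizer.vocabulary  # => a sorted array of the surviving terms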

to_h()

Returns a hash of the vectorizer's state: the tokenizer's options and the learned vocabulary.

# File lib/lurn/text/bernoulli_vectorizer.rb, line 24
def to_h
  {
    tokenizer_options: @tokenizer.to_h,
    vocabulary: @vocabulary
  }
end
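
Since the returned hash contains only the tokenizer options and the vocabulary, it can be persisted directly; a sketch, assuming the tokenizer's to_h output is JSON-serializable:

  require 'lurn'
  require 'json'

  vectorizer = Lurn::Text::BernoulliVectorizer.new
  vectorizer.fit(['a short document', 'another short document'])

  # Persist the learned state for later use.
  File.write('vectorizer.json', vectorizer.to_h.to_json)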

transform(documents)

Maps each document to an array of booleans, one per vocabulary term, indicating whether that term appears in the document. fit must be called first so the vocabulary is populated.

# File lib/lurn/text/bernoulli_vectorizer.rb, line 31
def transform(documents)
  documents.map do |document|
    tokens = @tokenizer.tokenize(document)
    @vocabulary.map do |word|
      tokens.include? word
    end
  end
end
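
A fit/transform round trip might look like this (hypothetical documents; the exact vector layout depends on the learned vocabulary order):

  vectorizer = Lurn::Text::BernoulliVectorizer.new
  vectorizer.fit(['ruby is fun', 'ruby is fast'])

  vectors = vectorizer.transform(['ruby is fun'])
  # One boolean array per input document, one element per vocabulary term.
  vectors.first.length == vectorizer.vocabulary.length  # => true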

Private Instance Methods

reduce_features(tokenized_docs)

Drops vocabulary terms whose document frequency is not strictly between the :min_df and :max_df options (both bounds are exclusive).

# File lib/lurn/text/bernoulli_vectorizer.rb, line 42
def reduce_features(tokenized_docs)
  doc_frequencies = Array.new(@vocabulary.length, 0)

  tokenized_docs.each do |tokens|
    tokens.each do |token|
      vocab_index = @vocabulary.index(token)
      doc_frequencies[vocab_index] += 1
    end
  end

  reduced_features = @vocabulary.select.with_index do |token, index|
    freq = doc_frequencies[index]
    @options[:min_df] < freq && freq < @options[:max_df]
  end

  @vocabulary = reduced_features
end
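
Because both bounds are exclusive, a term kept with max_df: 3 must appear in fewer than 3 of the fitted documents. A hypothetical illustration (assuming WordTokenizer yields these words unchanged):

  vectorizer = Lurn::Text::BernoulliVectorizer.new(min_df: 0, max_df: 3)
  vectorizer.fit(['apple banana', 'apple cherry', 'apple durian'])

  # "apple" occurs in all three documents, which is not strictly below max_df,
  # so it is dropped; the terms with document frequency 1 remain.
  vectorizer.vocabulary.include?('apple')  # => false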

tokenize_documents(documents)

Tokenizes each document with the configured tokenizer and keeps only the unique tokens within each document, so later counts reflect document frequency rather than term frequency.

# File lib/lurn/text/bernoulli_vectorizer.rb, line 60
def tokenize_documents(documents)
  documents.map { |doc| @tokenizer.tokenize(doc).uniq }
end