module TorchText::Data::Utils

Constants

PATTERNS_DICT

Public Instance Methods

ngrams_iterator(token_list, ngrams) { |x| ... }
# File lib/torchtext/data/utils.rb, line 17
# Yields every element of +token_list+ unchanged, then, for each n from 2 up
# to +ngrams+, every run of n consecutive tokens joined with a single space.
# All n-grams of one size are emitted before any of the next size.
#
# When called without a block, returns an Enumerator over the same sequence.
#
# @param token_list [Array] tokens to walk
# @param ngrams [Integer] largest n-gram size to emit; below 2 only the
#   individual tokens are yielded
# @yieldparam x [Object, String] a token, or a space-joined n-gram
def ngrams_iterator(token_list, ngrams)
  return to_enum(:ngrams_iterator, token_list, ngrams) unless block_given?

  # Unigrams are passed through as-is (not joined or copied).
  token_list.each { |token| yield token }

  2.upto(ngrams) do |width|
    # each_cons yields nothing when width exceeds the number of tokens.
    token_list.each_cons(width) { |window| yield window.join(" ") }
  end
end
tokenizer(tokenizer, language: "en")
# File lib/torchtext/data/utils.rb, line 4
# Resolves a tokenizer name to a callable Method object.
#
# * +nil+ resolves to the +split_tokenizer+ method.
# * "basic_english" resolves to the +basic_english_normalize+ method, and is
#   only accepted when +language+ is "en".
# * Any other value is not supported.
#
# @param tokenizer [String, nil] name of the tokenizer to look up
# @param language [String] language code; only checked for "basic_english"
# @return [Method] a method object taking one argument
# @raise [ArgumentError] if "basic_english" is requested for a language other than "en"
# @raise [RuntimeError] if the tokenizer name is anything else
def tokenizer(tokenizer, language: "en")
  if tokenizer.nil?
    method(:split_tokenizer)
  elsif tokenizer == "basic_english"
    unless language == "en"
      raise ArgumentError, "Basic normalization is only available for English(en)"
    end

    method(:basic_english_normalize)
  else
    raise "Not implemented yet"
  end
end

Private Instance Methods

basic_english_normalize(line)
# File lib/torchtext/data/utils.rb, line 45
# Lowercases +line+, applies every pattern/replacement pair from
# PATTERNS_DICT in the hash's iteration order, and splits the result on
# whitespace.
#
# Every occurrence of each pattern is rewritten, not just the first one, so
# repeated punctuation or markup in a line is normalized consistently.
# The caller's string is never mutated: all substitutions run on the
# downcased copy.
#
# @param line [String] raw text to normalize
# @return [Array<String>] the resulting tokens
def basic_english_normalize(line)
  normalized = line.downcase

  PATTERNS_DICT.each do |pattern_re, replaced_str|
    # gsub! rather than sub!: sub! stops after the first match, which left
    # later occurrences of the same pattern untouched.
    normalized.gsub!(pattern_re, replaced_str)
  end

  normalized.split
end
split_tokenizer(x)
# File lib/torchtext/data/utils.rb, line 36
# Tokenizes +x+ by calling +split+ on it with no arguments (for a String,
# Ruby's default whitespace split). This is the method +tokenizer+ hands
# back when it is given +nil+ as the tokenizer name.
#
# @param x [String] text to tokenize
# @return [Array<String>] the whitespace-separated pieces of +x+
def split_tokenizer(x)
  x.split
end