class SimString::NGramBuilder

Constants

SENTINAL_CHAR

Attributes

n[RW]

Public Class Methods

new(n)
# File lib/simstring_pure.rb, line 19
# Creates a builder configured with the n-gram size +n+.
#
# n - the value stored in the +n+ attribute; #features uses it as the
#     window length for +each_cons+ and pads with (n - 1) sentinel chars.
def initialize(n)
  # Assign through the attribute writer rather than the ivar directly.
  self.n = n
end

Public Instance Methods

features(string)
# File lib/simstring_pure.rb, line 23
# Builds the set of numbered character n-grams for +string+.
#
# The input is wrapped in (n - 1) copies of SENTINAL_CHAR on each side, then
# every window of n consecutive characters becomes one n-gram string.
# Repeated n-grams are kept apart by numbering them: the k-th occurrence of
# a given n-gram string yields NGram.new(ngram_string, k).
#
# string - the text to extract features from (it is not mutated).
#
# Returns a Set of NGram objects.
def features(string)
  padding = SENTINAL_CHAR * (n - 1)
  padded = padding + string + padding

  # Count how often each n-gram string occurs, in first-seen order.
  occurrences = padded.chars.each_cons(n).each_with_object(Hash.new(0)) do |window, counts|
    counts[window.join] += 1
  end

  # Emit one NGram per occurrence, numbered 1..count.
  occurrences.each_with_object(Set.new) do |(gram, count), numbered|
    1.upto(count) { |index| numbered << NGram.new(gram, index) }
  end
end