class Boilerpipe::UnicodeTokenizer

Constants

INVISIBLE_SEPARATOR
NOT_WORD_BOUNDARY
WORD_BOUNDARY

Public Class Methods

tokenize(text)

Tokenizes text in five steps: (1) replace word boundaries with the 'invisible separator'; (2) strip invisible separators from non-word boundaries; (3) replace runs of spaces or invisible separators with a single space; (4) trim; (5) split into words on single spaces.

# File lib/boilerpipe/util/unicode_tokenizer.rb, line 13
# Splits +text+ into an array of tokens.
#
# The work is done in stages: word boundaries are tagged with
# INVISIBLE_SEPARATOR, matches of NOT_WORD_BOUNDARY are reduced to their
# first capture group, runs of spaces / U+2063 are squeezed to one space,
# the result is trimmed, and finally it is split on spaces.
#
# @param text [String] the text to tokenize (anything responding to +gsub+)
# @return [Array<String>] the tokens; empty when nothing is left after trimming
def self.tokenize(text)
  # Tag every WORD_BOUNDARY match with the separator.
  tagged = text.gsub(WORD_BOUNDARY, INVISIBLE_SEPARATOR)

  # Keep only capture group 1 of each NOT_WORD_BOUNDARY match
  # (presumably this drops separators around punctuation; the pattern is
  # defined elsewhere in the class).
  untagged = tagged.gsub(NOT_WORD_BOUNDARY, '\1')

  # Any mix of spaces and remaining separators becomes exactly one space.
  squeezed = untagged.gsub(/[ \u2063]+/, ' ')

  trimmed = squeezed.strip
  trimmed.split(/[ ]+/)
end