class TfIdfSimilarity::Token
Public Instance Methods
classic_filter()
click to toggle source
Returns a copy of the token with every period removed (as in acronyms, e.g. "U.S.A." becomes "USA") and a trailing English possessive ('s) stripped.
@return [Token] a string with no English possessive or periods in acronyms
@see wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
# File lib/tf-idf-similarity/token.rb, line 49
# Builds a new token of the same class with every period deleted and a
# trailing English possessive (an apostrophe, backtick or right single
# quote followed by "s") removed.
#
# @return [Token] the filtered token
def classic_filter
  without_periods = delete('.')
  # Only a possessive at the very end of the string is stripped.
  stripped = without_periods.sub(/['`’]s\z/, '')
  self.class.new(stripped)
end
lowercase_filter()
click to toggle source
Returns a lowercase string.
@return [Token] a lowercase string
@see wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
# File lib/tf-idf-similarity/token.rb, line 40
# Builds a new token of the same class from the result of
# UnicodeUtils.downcase applied to this token.
#
# @return [Token] the lowercased token
def lowercase_filter
  lowered = UnicodeUtils.downcase(self)
  self.class.new(lowered)
end
to_s()
click to toggle source
# File lib/tf-idf-similarity/token.rb, line 53
# Returns the token lowercased, with periods deleted and a trailing
# English possessive removed.
def to_s
  # The filters are applied inline rather than through #lowercase_filter and
  # #classic_filter, so that no intermediate token objects are created.
  lowered = UnicodeUtils.downcase(self)
  lowered.delete('.').sub(/['`’]s\z/, '')
end
valid?()
click to toggle source
Returns false if the string consists entirely of digits, punctuation, whitespace or control characters, and true otherwise (including for an empty string).
@note Some implementations ignore one- and two-letter words.
@return [Boolean] whether the string is a token
# File lib/tf-idf-similarity/token.rb, line 22
# Reports whether the string is usable as a token: false when every
# character is a digit, control character, punctuation or whitespace,
# true otherwise. An empty string does not match the pattern and is
# therefore reported as valid.
#
# @return [Boolean] whether the string is a token
def valid?
  # Extended mode: whitespace and the trailing "#" comments inside the
  # pattern are ignored by the regexp engine.
  only_non_word_characters = %r{
    \A
    (
     \d           | # number
     [[:cntrl:]]  | # control character
     [[:punct:]]  | # punctuation
     [[:space:]]    # whitespace
    )+
    \z
  }x
  self !~ only_non_word_characters
end