class Boilerpipe::Filters::MinClauseWordsFilter

Public Class Methods

is_clause?(text, min_words = 5) click to toggle source
# File lib/boilerpipe/filters/min_clause_words_filter.rb, line 27
# Decides whether a text fragment is long enough to count as a clause.
#
# Length is measured by counting runs of whitespace (space, newline or
# carriage return) in +text+, not by counting words directly; a run of
# several consecutive whitespace characters counts once.
#
# @param text [String, nil] candidate fragment
# @param min_words [Integer] minimum number of whitespace runs required
# @return [Boolean] false for nil input, otherwise whether the number of
#   whitespace runs is at least +min_words+
def self.is_clause?(text, min_words = 5)
  if text.nil?
    false
  else
    gaps = text.scan(/[ \n\r]+/)
    gaps.length >= min_words
  end
end
process(doc, min_words = 5) click to toggle source
# File lib/boilerpipe/filters/min_clause_words_filter.rb, line 11
# Demotes content text blocks that contain no sufficiently long clause.
#
# For every block in +doc.text_blocks+ that is currently content, the block
# text is scanned for delimiter-terminated fragments (letters, digits, spaces
# or non-breaking spaces followed by one or more of , . : ; ! ? and then
# whitespace or end of line). If none of those fragments satisfies
# +is_clause?+ with the given +min_words+, the block's +content+ flag is set
# to false. Blocks that are already non-content are left untouched.
#
# @param doc [#text_blocks] document whose blocks are filtered in place
# @param min_words [Integer] threshold forwarded to +is_clause?+
# @return [Object] the same +doc+ that was passed in
def self.process(doc, min_words = 5)
  # Loop-invariant, so build it once rather than once per text block.
  clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/

  doc.text_blocks.each do |tb|
    next if tb.is_not_content?

    # min_words must be forwarded explicitly; otherwise is_clause? falls back
    # to its own default and the caller's threshold is silently ignored.
    has_clause = tb.text.scan(clause_delimiter).any? do |possible_clause|
      is_clause?(possible_clause, min_words)
    end

    tb.content = false unless has_clause
  end

  doc
end