class Boilerpipe::Filters::NumWordsRulesClassifier

Public Class Methods

process(doc)
# File lib/boilerpipe/filters/num_words_rules_classifier.rb, line 8
# Runs the classifier over every text block of +doc+, looking at each block
# together with its immediate predecessor and successor.
#
# The block list is padded on both sides with the same
# +TextBlock.empty_start+ sentinel so that the first and last real blocks
# also have a neighbour on each side. Each real block's +content+ is set to
# whatever +classify+ returns for that window.
#
# @param doc [#text_blocks] document whose blocks are updated in place
# @return the same +doc+ object that was passed in
def self.process(doc)
  sentinel = Boilerpipe::Document::TextBlock.empty_start
  padded = [sentinel].concat(doc.text_blocks).push(sentinel)

  # Sliding window of three: only the middle element is ever written to,
  # so the sentinel itself is never modified.
  padded.each_cons(3) do |before, block, after|
    block.content = classify(before, block, after)
  end

  doc
end

Private Class Methods

classify(prev, current, nxt)
# File lib/boilerpipe/filters/num_words_rules_classifier.rb, line 22
# Decides whether +current+ is classified as content, based on the link
# density and word counts of the block itself and of its two neighbours.
#
# Rules, in order:
# * a block whose own link density exceeds 0.333333 is rejected outright;
# * if the previous block's link density is at most 0.555556, the block is
#   accepted when it has more than 16 words, or the next block has more
#   than 15, or the previous block has more than 4;
# * otherwise it is accepted when it has more than 40 words or the next
#   block has more than 17.
#
# @param prev [#link_density, #num_words] block preceding +current+
# @param current [#link_density, #num_words] block being classified
# @param nxt [#num_words] block following +current+
# @return [Boolean] true when one of the acceptance rules matches
def self.classify(prev, current, nxt)
  return false if current.link_density > 0.333333

  if prev.link_density <= 0.555556
    current.num_words > 16 || nxt.num_words > 15 || prev.num_words > 4
  else
    current.num_words > 40 || nxt.num_words > 17
  end
end