class Boilerpipe::Filters::LargeBlockSameTagLevelToContentFilter

Public Class Methods

process(doc)
# File lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb, line 9
# Marks long non-content text blocks as content when they share the tag level
# of the first block that is already content and carries the
# :VERY_LIKELY_CONTENT label.
#
# doc - an object responding to #text_blocks; each block is expected to
#       respond to #is_content?, #has_label?, #tag_level, #num_words and
#       #content=.
#
# Returns the same doc that was passed in. The doc is returned untouched when
# no block is both content and labelled :VERY_LIKELY_CONTENT.
def self.process(doc)
  # The first matching block (in document order) supplies the reference level.
  anchor = doc.text_blocks.find do |block|
    block.is_content? && block.has_label?(:VERY_LIKELY_CONTENT)
  end

  return doc unless anchor

  anchor_level = anchor.tag_level

  doc.text_blocks.each do |block|
    # Only blocks not yet content, with at least 100 words, sitting at the
    # anchor's tag level get promoted.
    promote = !block.is_content? &&
              block.num_words >= 100 &&
              block.tag_level == anchor_level

    block.content = true if promote
  end

  doc
end