class Boilerpipe::Filters::KeepLargestBlockFilter

Constants

INSTANCE
INSTANCE_EXPAND_TO_SAME_TAGLEVEL
INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS

Public Class Methods

new(expand_to_same_level_text, min_words) click to toggle source
# File lib/boilerpipe/filters/keep_largest_block_filter.rb, line 10
# Builds a filter that keeps only the largest content block of a document.
#
# @param expand_to_same_level_text stored as-is; when truthy, #process also
#   expands the selection to neighbouring blocks on the same tag level
# @param min_words stored as-is; passed by #process to expand_tag_level as
#   the minimum word count a neighbouring block needs to be kept
def initialize(expand_to_same_level_text, min_words)
  @min_words = min_words
  @expand_to_same_level_text = expand_to_same_level_text
end

Public Instance Methods

expand_tag_level(tbs, level, min_words) click to toggle source

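Sets content to true on every block in tbs whose tag level equals level and whose word count is at least min_words. Scanning stops at the first block whose tag level is lower than level; blocks with a higher tag level are skipped.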

# File lib/boilerpipe/filters/keep_largest_block_filter.rb, line 49
# Walks +tbs+ in order and flags blocks on the requested tag level as content.
#
# The walk ends at the first block whose tag level is lower than +level+.
# Blocks with a higher tag level are skipped without being modified.
#
# @param tbs ordered collection of text blocks (each responds to
#   +tag_level+, +num_words+ and +content=+)
# @param level tag level a block must have to be flagged
# @param min_words minimum +num_words+ a block needs to be flagged
# @return +tbs+ when every block was visited, +nil+ when the walk stopped early
def expand_tag_level(tbs, level, min_words)
  tbs.each do |block|
    depth = block.tag_level
    break if depth < level

    block.content = true if depth == level && block.num_words >= min_words
  end
end
process(doc) click to toggle source
# File lib/boilerpipe/filters/keep_largest_block_filter.rb, line 19
# Keeps the content block with the most words and demotes every other block.
#
# The largest block (among those whose +is_content?+ is truthy) is set to
# content and labelled :VERY_LIKELY_CONTENT; every other block is set to
# non-content and labelled :MIGHT_BE_CONTENT. When the filter was built with
# expansion enabled, neighbouring blocks on both sides of the largest block
# are then handed to expand_tag_level together with the largest block's tag
# level and the configured minimum word count.
#
# If no block is content, every block is demoted and no expansion happens.
#
# @param doc object exposing +text_blocks+; the blocks are modified in place
# @return [Boolean] false when the document has fewer than two text blocks
#   (nothing is modified), true otherwise
def process(doc)
  tbs = doc.text_blocks
  return false if tbs.size < 2

  # Content block with the most words; nil when no block is content.
  largest_block = tbs.select(&:is_content?).max_by(&:num_words)

  # set labels for text blocks
  tbs.each do |tb|
    if tb == largest_block
      tb.content = true
      tb.add_label :VERY_LIKELY_CONTENT
    else
      tb.content = false
      tb.add_label :MIGHT_BE_CONTENT
    end
  end

  # Without a largest block there is no tag level to expand around, so the
  # level is only read once we know the block exists.
  return true unless @expand_to_same_level_text && largest_block

  level = largest_block.tag_level
  n = tbs.index(largest_block)

  # expand blocks to the left, walking outwards from the largest block
  expand_tag_level(tbs[0...n].reverse, level, @min_words)

  # expand blocks to the right
  expand_tag_level(tbs[n + 1..-1], level, @min_words)

  true
end