class Boilerpipe::Filters::KeepLargestBlockFilter
Constants
- INSTANCE
- INSTANCE_EXPAND_TO_SAME_TAGLEVEL
- INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS
Public Class Methods
new(expand_to_same_level_text, min_words)
click to toggle source
# File lib/boilerpipe/filters/keep_largest_block_filter.rb, line 10
# Builds a filter with the given expansion settings.
#
# @param expand_to_same_level_text [Boolean] stored as-is in
#   @expand_to_same_level_text
# @param min_words [Integer] stored as-is in @min_words
def initialize(expand_to_same_level_text, min_words)
  @min_words = min_words
  @expand_to_same_level_text = expand_to_same_level_text
end
Public Instance Methods
expand_tag_level(tbs, level, min_words)
click to toggle source
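Sets content to true on each block whose tag level equals the given level and whose word count is at least min_words; stops at the first block whose tag level is lower than the given level.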
# File lib/boilerpipe/filters/keep_largest_block_filter.rb, line 49
# Walks +tbs+ in order and marks qualifying blocks as content.
#
# A block qualifies when its tag_level equals +level+ and its num_words is
# at least +min_words+. Blocks with a higher tag_level are skipped; the walk
# stops at the first block whose tag_level is lower than +level+.
#
# @param tbs [Array] text blocks responding to tag_level, num_words, content=
# @param level [Integer] tag level a block must match to be marked
# @param min_words [Integer] minimum num_words a block needs to be marked
# @return [Array, nil] +tbs+ when every block was visited, nil when the walk
#   stopped early
def expand_tag_level(tbs, level, min_words)
  tbs.each do |block|
    # a lower tag level ends the scan
    break if block.tag_level < level
    # higher tag levels are passed over without being touched
    next if block.tag_level != level

    block.content = true if block.num_words >= min_words
  end
end
process(doc)
click to toggle source
# File lib/boilerpipe/filters/keep_largest_block_filter.rb, line 19
# Keeps only the content block with the most words and demotes every other
# block; optionally re-promotes neighbouring blocks on the same tag level.
#
# The block that is currently content and has the highest num_words gets
# content = true and the :VERY_LIKELY_CONTENT label. Every other block gets
# content = false and the :MIGHT_BE_CONTENT label. When no block is content
# there is no largest block, so every block is demoted and no expansion
# takes place.
#
# When @expand_to_same_level_text is truthy, the blocks before (walked
# backwards) and after the largest block are handed to expand_tag_level
# together with the largest block's tag_level and @min_words.
#
# @param doc [#text_blocks] document whose text blocks are relabelled in place
# @return [Boolean] false when the document has fewer than two text blocks
#   (nothing is touched), true otherwise
def process(doc)
  tbs = doc.text_blocks
  return false if tbs.size < 2

  # find tb with the most words; nil when no block is marked as content
  largest_block = tbs.select(&:is_content?).max_by(&:num_words)

  # set labels for text blocks
  tbs.each do |tb|
    if tb == largest_block
      tb.content = true
      tb.add_label :VERY_LIKELY_CONTENT
    else
      tb.content = false
      tb.add_label :MIGHT_BE_CONTENT
    end
  end

  # nothing to expand around when expansion is off or no block qualified;
  # tag_level is only read past this point so a nil largest_block is safe
  return true unless @expand_to_same_level_text && largest_block

  n = tbs.index(largest_block)
  return true unless n

  level = largest_block.tag_level
  # expand blocks to the left, nearest neighbour first
  expand_tag_level(tbs[0...n].reverse, level, @min_words)
  # expand blocks to the right
  expand_tag_level(tbs[(n + 1)..-1], level, @min_words)
  true
end