class Boilerpipe::Filters::BlockProximityFusion

Constants

MAX_DISTANCE_1
MAX_DISTANCE_1_CONTENT_ONLY
MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL
MAX_DISTANCE_1_SAME_TAGLEVEL

Public Class Methods

new(max_blocks_distance, content_only, same_tag_level_only)
# File lib/boilerpipe/filters/block_proximity_fusion.rb, line 6
def initialize(max_blocks_distance, content_only, same_tag_level_only)
  @max_blocks_distance = max_blocks_distance
  @content_only = content_only
  @same_tag_level_only = same_tag_level_only
end

Public Instance Methods

process(doc)
# File lib/boilerpipe/filters/block_proximity_fusion.rb, line 17
def process(doc)
  text_blocks = doc.text_blocks
  return false if text_blocks.size < 2

  prev_block = if @content_only
                 text_blocks.find { |tb| tb.is_content? }
               else
                 text_blocks.first
               end

  return false if prev_block.nil?

  offset = text_blocks.index(prev_block) + 1
  blocks = text_blocks[offset..-1]

  blocks_to_remove = []

  blocks.each do |tb|
    if tb.is_not_content?
      prev_block = tb
      next
    end

    diff_blocks = tb.offset_blocks_start - prev_block.offset_blocks_end - 1
    if diff_blocks <= @max_blocks_distance
      ok = true
      ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
      ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only

      if ok
        prev_block.merge_next(tb)
        blocks_to_remove << tb
      else
        prev_block = tb
      end
    end
  end
  doc.replace_text_blocks!(text_blocks - blocks_to_remove)
  doc
end