class Boilerpipe::Filters::BlockProximityFusion
Constants
- MAX_DISTANCE_1
- MAX_DISTANCE_1_CONTENT_ONLY
- MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL
- MAX_DISTANCE_1_SAME_TAGLEVEL
Public Class Methods
new(max_blocks_distance, content_only, same_tag_level_only)
click to toggle source
# File lib/boilerpipe/filters/block_proximity_fusion.rb, line 6
#
# Stores the three settings that #process reads when deciding whether two
# text blocks may be fused.
#
# @param max_blocks_distance [Integer] upper bound (inclusive) on the block
#   gap, compared against `offset_blocks_start - offset_blocks_end - 1`
# @param content_only [Boolean] when truthy, fusion starts at the first
#   content block and never involves a non-content block
# @param same_tag_level_only [Boolean] when truthy, blocks are fused only if
#   their `tag_level` values are equal
def initialize(max_blocks_distance, content_only, same_tag_level_only)
  @max_blocks_distance = max_blocks_distance
  @content_only = content_only
  @same_tag_level_only = same_tag_level_only
end
Public Instance Methods
process(doc)
click to toggle source
# File lib/boilerpipe/filters/block_proximity_fusion.rb, line 17
#
# Fuses each content text block into the block preceding it when the two are
# at most @max_blocks_distance blocks apart, then replaces the document's
# block list with the surviving blocks.
#
# The scan starts after the first content block when @content_only is set,
# otherwise after the first block. A non-content block is never merged; it
# only becomes the new anchor that the following block is compared against.
#
# @param doc [#text_blocks, #replace_text_blocks!] document whose blocks are
#   fused in place via `merge_next`
# @return [false] when the document has fewer than two text blocks, or when
#   @content_only is set and no block is content
# @return [Object] +doc+ itself in every other case
def process(doc)
  text_blocks = doc.text_blocks
  return false if text_blocks.size < 2

  prev_block = @content_only ? text_blocks.find(&:is_content?) : text_blocks.first
  return false if prev_block.nil?

  offset = text_blocks.index(prev_block) + 1
  blocks_to_remove = []

  text_blocks[offset..-1].each do |tb|
    if tb.is_not_content?
      prev_block = tb
      next
    end

    diff_blocks = tb.offset_blocks_start - prev_block.offset_blocks_end - 1
    fusable = diff_blocks <= @max_blocks_distance
    fusable &&= !(prev_block.is_not_content? || tb.is_not_content?) if @content_only
    fusable &&= prev_block.tag_level == tb.tag_level if @same_tag_level_only

    if fusable
      prev_block.merge_next(tb)
      blocks_to_remove << tb
    else
      # Always advance the anchor when no merge happened -- including when
      # the gap was too large -- so the next block is measured against its
      # nearest predecessor rather than a stale, far-away block.
      prev_block = tb
    end
  end

  doc.replace_text_blocks!(text_blocks - blocks_to_remove)
  doc
end