class Boilerpipe::Filters::ExpandTitleToContentFilter
Public Class Methods
no_title_with_subsequent_content?(content_start, title_idx)
click to toggle source
# File lib/boilerpipe/filters/expand_title_to_content_filter.rb, line 25
#
# Tells whether there is nothing to expand: either index is missing, or the
# title does not come strictly before the start of the content.
#
# @param content_start [Integer, nil] index of the first content block, nil when there is none
# @param title_idx [Integer, nil] index of the title block, nil when there is none
# @return [Boolean] true when no title precedes the content
def self.no_title_with_subsequent_content?(content_start, title_idx)
  # Without both positions there is no title-to-content range at all.
  return true if title_idx.nil? || content_start.nil?

  # title has to start before content
  title_idx >= content_start
end
process(doc)
click to toggle source
# File lib/boilerpipe/filters/expand_title_to_content_filter.rb, line 8
#
# Marks as content every :MIGHT_BE_CONTENT block located from the last
# :TITLE block up to (but excluding) the first content block. The document
# is left untouched when there is no title, no content, or the last title
# does not come before the first content block.
#
# @param doc [#text_blocks] document exposing an Array of text blocks; each
#   block must respond to has_label?, is_content? and content=
# @return [Object] the same doc that was passed in (its blocks are mutated in place)
def self.process(doc)
  tbs = doc.text_blocks

  # Locate the last TITLE block by position in a single backwards scan.
  # Looking the block up again with Array#index would compare with == and
  # could land on an earlier, equal block.
  title_idx = tbs.rindex { |tb| tb.has_label?(:TITLE) }
  content_start = tbs.find_index(&:is_content?)
  return doc if no_title_with_subsequent_content?(content_start, title_idx)

  # Exclusive range: the first content block itself is already content.
  tbs[title_idx...content_start]
    .select { |tb| tb.has_label?(:MIGHT_BE_CONTENT) }
    .each { |tb| tb.content = true }
  doc
end