class Boilerpipe::Filters::ExpandTitleToContentFilter

Public Class Methods

no_title_with_subsequent_content?(content_start, title_idx)
# File lib/boilerpipe/filters/expand_title_to_content_filter.rb, line 25
# Tells the caller whether there is nothing to expand: either index is
# missing, or the title does not come strictly before the first content block.
#
# @param content_start [Integer, nil] index of the first content block
# @param title_idx [Integer, nil] index of the title block
# @return [Boolean] true when no title precedes the content
def self.no_title_with_subsequent_content?(content_start, title_idx)
  # A missing index means there is no title/content pair to work with.
  return true if [title_idx, content_start].any?(&:nil?)

  # The title must sit strictly ahead of the content to be usable.
  content_start <= title_idx
end
process(doc)
# File lib/boilerpipe/filters/expand_title_to_content_filter.rb, line 8
# Marks every :MIGHT_BE_CONTENT block lying between the last :TITLE block
# (inclusive) and the first content block (exclusive) as content.
#
# The document is returned untouched when there is no :TITLE block, no
# content block, or the last :TITLE block is not positioned before the first
# content block.
#
# @param doc [#text_blocks] document whose text blocks are inspected; the
#   blocks are expected to respond to has_label?, is_content? and content=
# @return [Object] the same doc that was passed in (blocks mutated in place)
def self.process(doc)
  blocks = doc.text_blocks

  # Find the last TITLE block by position in a single reverse scan. This
  # avoids building an intermediate array and avoids looking the block up
  # again via ==, which could match an earlier, equal-looking block.
  title_idx = blocks.rindex { |tb| tb.has_label?(:TITLE) }
  content_start = blocks.find_index(&:is_content?)

  return doc if no_title_with_subsequent_content?(content_start, title_idx)

  # Exclusive range: the first content block itself is already content.
  blocks[title_idx...content_start].each do |tb|
    tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
  end

  doc
end