class Boilerpipe::Filters::DocumentTitleMatchClassifier

Attributes

potential_titles[R]

Public Class Methods

new(title) click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 12
# Builds the classifier for a document title.
#
# Starts from an empty set of candidate titles and lets
# generate_potential_titles fill it from +title+.
#
# @param title [String, nil] the document title to derive candidates from
def initialize(title)
  @potential_titles = Set[]
  generate_potential_titles title
end

Public Instance Methods

process(doc) click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 17
# Labels the first text block whose normalized text equals one of the
# potential titles with :TITLE.
#
# Each block's text is normalized (non-breaking spaces become plain spaces,
# apostrophes are dropped, surrounding whitespace is stripped, everything is
# downcased) and looked up in @potential_titles. If that fails, the
# characters ? ! . : - are removed and the lookup is tried once more.
# Iteration stops at the first block that matches.
#
# @param doc [#text_blocks] document whose blocks respond to #text and
#   #add_label
# @return [Object] the same +doc+ that was passed in
def process(doc)
  return doc if @potential_titles.empty?

  doc.text_blocks.each do |tb|
    # Double quotes are required here: in a single-quoted literal \u00a0 is
    # six literal characters, not a non-breaking space.
    text = tb.text.gsub("\u00a0", ' ')
      .gsub("'", '')
      .strip.downcase

    if @potential_titles.member? text
      tb.add_label :TITLE
      break
    end

    # The hyphen is placed last so it is a literal character. Written as
    # `.-:` it forms a range that also removes digits and slashes.
    remove_characters = /[?!.:-]+/
    text = text.gsub(remove_characters, '').strip

    if @potential_titles.member? text
      tb.add_label :TITLE
      break
    end
  end

  doc
end

Private Instance Methods

add_potential_titles(title, regex, min_words) click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 102
# Splits +title+ on +regex+ and records every resulting piece that has at
# least +min_words+ words as a potential title.
#
# Nothing is added when the split yields exactly one piece, and pieces
# containing ".com" are skipped.
#
# @param title [String] the title to split
# @param regex [Regexp] separator pattern
# @param min_words [Integer] minimum word count for a piece to be kept
def add_potential_titles(title, regex, min_words)
  segments = title.split(regex)
  return if segments.size == 1

  segments.each do |segment|
    next if segment =~ /[.]com/
    next if number_of_words(segment) < min_words

    @potential_titles << segment
  end
end
generate_potential_titles(title) click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 44
# Populates @potential_titles with variants of +title+ that a text block
# might match.
#
# The title is normalized first (non-breaking spaces become plain spaces,
# apostrophes are dropped, surrounding whitespace is stripped, everything is
# downcased). The variants added are: the normalized title itself, its
# longest part when split on the separators | » , : ( ) -, every part of at
# least four words when split on " | " or " - ", and the title with the
# text right of the last " - " or left of the first " - " removed.
#
# @param title [String, nil] raw document title; nil adds nothing
def generate_potential_titles(title)
  return if title.nil?

  # Double quotes are required here: in a single-quoted literal \u00a0 is
  # six literal characters, not a non-breaking space.
  title = title.gsub("\u00a0", ' ')
    .gsub("'", '')
    .strip
    .downcase

  @potential_titles << title

  # Splits on narrower separator sets ([|»-], [|»:], [|»:()], [|»:()-])
  # were judged unnecessary and dropped; only the widest set is used.
  # \u00a0 is not part of the set because it was already replaced above.
  longest = longest_part(title, /[ ]*[|»,:()-][ ]*/)
  @potential_titles << longest if longest

  add_potential_titles(title, /[ ]+[|][ ]+/, 4)
  add_potential_titles(title, /[ ]+[-][ ]+/, 4)

  @potential_titles << title.sub(/ - [^-]+$/, '') # remove right of -
  @potential_titles << title.sub(/^[^-]+ - /, '') # remove left of -
end
longest_part(title, regex) click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 81
# Picks the "longest" piece of +title+ after splitting it on +regex+.
#
# Pieces containing ".com" are ignored. A piece replaces the current
# candidate when it has more words than the best word count so far, or when
# it has more characters than the current candidate.
#
# @param title [String] the title to split
# @param regex [Regexp] separator pattern
# @return [String, nil] the stripped winning piece; nil when the split
#   yields exactly one piece or no candidate was chosen
def longest_part(title, regex)
  pieces = title.split(regex)
  return nil if pieces.size == 1

  best = ''
  best_word_count = 0

  pieces.reject { |piece| piece =~ /[.]com/ }.each do |piece|
    word_count = number_of_words(piece)
    next unless word_count > best_word_count || piece.size > best.size

    best_word_count = word_count
    best = piece
  end

  return nil if best.empty?

  best.strip
end
number_of_words(s) click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 115
# Counts the pieces of +s+ separated by runs of spaces and/or backspace
# characters (inside a character class, \b is a backspace, not a word
# boundary).
#
# @param s [String] text to count words in
# @return [Integer] number of pieces produced by the split
def number_of_words(s)
  tokens = s.split(/[\b ]+/)
  tokens.length
end