class Boilerpipe::Filters::DocumentTitleMatchClassifier
Attributes
potential_titles[R]
Public Class Methods
new(title)
click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 12
# Creates a classifier for the given document title.
#
# Starts with an empty set of candidate titles and hands +title+ to
# generate_potential_titles, which fills the set.
#
# @param title [String, nil] the document title to derive candidates from
def initialize(title)
  @potential_titles = Set[]
  generate_potential_titles(title)
end
Public Instance Methods
process(doc)
click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 17
# Marks the first text block whose text matches a potential title.
#
# Each block's text is normalised (non-breaking spaces become regular spaces,
# apostrophes are dropped, surrounding whitespace is stripped, everything is
# downcased) and looked up in @potential_titles. If that fails, the lookup is
# retried after removing runs of the punctuation characters ? ! . - :
# The first matching block receives the :TITLE label and scanning stops.
#
# @param doc [#text_blocks] document whose blocks respond to #text and #add_label
# @return the same +doc+ that was passed in
def process(doc)
  return doc if @potential_titles.empty?

  # The hyphen must be escaped: an unescaped ".-:" inside a bracket expression
  # is the range U+002E..U+003A, which would also delete digits and "/" while
  # leaving "-" itself untouched.
  remove_characters = /[?!.\-:]+/

  doc.text_blocks.each do |tb|
    # Double quotes are required here; in a single-quoted literal \u00a0 is
    # six literal characters rather than the non-breaking space.
    text = tb.text.gsub("\u00a0", ' ')
                  .gsub("'", '')
                  .strip.downcase

    matched = @potential_titles.member?(text) ||
              @potential_titles.member?(text.gsub(remove_characters, '').strip)
    next unless matched

    tb.add_label :TITLE
    break
  end
  doc
end
Private Instance Methods
add_potential_titles(title, regex, min_words)
click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 102
# Splits +title+ on +regex+ and records every sufficiently long piece as a
# potential title.
#
# Nothing is recorded when the split yields exactly one piece (the separator
# did not occur). Pieces containing ".com" are skipped, as are pieces with
# fewer than +min_words+ words according to number_of_words.
#
# @param title [String] normalised document title
# @param regex [Regexp] separator pattern
# @param min_words [Integer] minimum word count for a piece to be kept
def add_potential_titles(title, regex, min_words)
  pieces = title.split(regex)
  return if pieces.size == 1

  pieces.each do |piece|
    next if piece =~ /[.]com/
    next if number_of_words(piece) < min_words

    @potential_titles << piece
  end
end
generate_potential_titles(title)
click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 44
# Populates @potential_titles with candidate title strings derived from +title+.
#
# The title is normalised first (non-breaking spaces become regular spaces,
# apostrophes are dropped, surrounding whitespace is stripped, everything is
# downcased). The following candidates are then added:
# * the normalised title itself,
# * the longest part after splitting on | » , : ( ) - (via longest_part),
# * parts of at least four words after splitting on " | " or " - "
#   (via add_potential_titles),
# * the title with its last " - suffix" removed, and with its first
#   "prefix - " removed.
#
# @param title [String, nil] raw document title; nil adds nothing
def generate_potential_titles(title)
  return if title.nil?

  # Double quotes are required here; in a single-quoted literal \u00a0 is
  # six literal characters rather than the non-breaking space.
  title = title.gsub("\u00a0", ' ')
               .gsub("'", '')
               .strip
               .downcase

  @potential_titles << title

  # unnecessary
  # p = longest_part(title, /[ ]*[|»-][ ]*/)
  # @potential_titles << p if p
  # p = longest_part(title, /[ ]*[|»:][ ]*/)
  # @potential_titles << p if p
  # p = longest_part(title, /[ ]*[|»:()][ ]*/)
  # @potential_titles << p if p
  # p = longest_part(title, /[ ]*[|»:()-][ ]*/)
  # @potential_titles << p if p

  p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
  @potential_titles << p if p

  # we replace \u00a0 so why check for it?
  # p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
  # @potential_titles << p if p

  add_potential_titles(title, /[ ]+[|][ ]+/, 4)
  add_potential_titles(title, /[ ]+[-][ ]+/, 4)

  @potential_titles << title.sub(/ - [^-]+$/, '') # remove right of -
  @potential_titles << title.sub(/^[^-]+ - /, '') # remove left of -
end
longest_part(title, regex)
click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 81
# Picks the "longest" piece of +title+ after splitting it on +regex+.
#
# Pieces are visited in order; pieces containing ".com" are ignored. A piece
# replaces the current pick when it has more words (per number_of_words) than
# the current pick OR more characters than it, so the outcome depends on the
# order of the pieces.
#
# @param title [String] normalised document title
# @param regex [Regexp] separator pattern
# @return [String, nil] the stripped pick; nil when the split yields exactly
#   one piece or when no non-empty piece was picked
def longest_part(title, regex)
  segments = title.split(regex)
  return nil if segments.size == 1

  best = ''
  best_word_count = 0

  segments.each do |segment|
    next if segment =~ /[.]com/

    word_count = number_of_words(segment)
    next unless word_count > best_word_count || segment.size > best.size

    best_word_count = word_count
    best = segment
  end

  return nil if best.empty?

  best.strip
end
number_of_words(s)
click to toggle source
# File lib/boilerpipe/filters/document_title_match_classifier.rb, line 115
# Counts the words in +s+.
#
# Words are separated by runs of whitespace. Inside a bracket expression \b
# denotes the backspace character (not a word boundary); it is kept as a
# separator for backward compatibility. Empty tokens produced by a leading
# separator are not counted, so " foo bar" has two words rather than three.
#
# @param s [String] text to count words in
# @return [Integer] number of non-empty words
def number_of_words(s)
  s.split(/[\b\s]+/).count { |word| !word.empty? }
end