class WikipediaContentCleaner
Attributes
content[RW]
Public Class Methods
new(article_name)
click to toggle source
# File lib/tellmeabout/wikipedia_content_cleaner.rb, line 7
# Builds a cleaner for a single article. Nothing is downloaded here; the
# name is only remembered for later use.
#
# @param article_name [#to_s] article title; load_from_wikipedia
#   interpolates it into the page URL
def initialize(article_name)
  @article_name = article_name
end
Public Instance Methods
css_selectors_containing_readable_content()
click to toggle source
# File lib/tellmeabout/wikipedia_content_cleaner.rb, line 26
# CSS selectors whose matching elements count as readable content.
# load_from_wikipedia joins them with ', ' into one compound selector.
#
# @return [Array<String>] a freshly built array of selector strings
def css_selectors_containing_readable_content
  [
    '#firstHeading',                   # presumably the article title
    '#bodyContent #mw-content-text p', # paragraphs under the content container
    '#bodyContent #siteSub',           # presumably the site tagline
    'h2'                               # every second-level heading
  ]
end
css_selectors_to_be_ignored()
click to toggle source
# File lib/tellmeabout/wikipedia_content_cleaner.rb, line 35
# CSS selectors for elements that should be left out of the readable text.
#
# NOTE(review): the load_from_wikipedia shown alongside this method never
# calls it — confirm whether another caller applies the list.
#
# @return [Array<String>] a freshly built array of selector strings
def css_selectors_to_be_ignored
  %w[table]
end
delete_silent_substrings()
click to toggle source
# File lib/tellmeabout/wikipedia_content_cleaner.rb, line 41
# Removes bracketed markers from @content: first every "[<digits>]" (e.g.
# "[12]"), then every literal "[edit]".
#
# @return [String] the cleaned text, which is also stored back in @content
# @raise [NoMethodError] if @content is nil (nothing loaded or assigned yet)
def delete_silent_substrings
  # The two passes stay sequential on purpose: removing a "[<digits>]" marker
  # can expose a new "[edit]" for the second pass to delete.
  @content = @content
             .gsub(/\[\d+\]/, '')
             .gsub(/\[edit\]/, '')
  # A third rule was left disabled by the author. Its backslashes look garbled
  # in this copy; presumably it read as below — verify before enabling, and
  # note that "$1"/"$2" are not backreferences in a gsub replacement string:
  # @content = @content.gsub(/(\d+\.\d+)×10(\d+)/, "$1 times ten to the $2")
end
load_from_wikipedia()
click to toggle source
# File lib/tellmeabout/wikipedia_content_cleaner.rb, line 11
# Downloads the English Wikipedia page for @article_name, collects the text
# of every element matched by css_selectors_containing_readable_content and
# stores the pieces, joined by single spaces, in @content.
#
# Nothing is rescued here, so download and parsing errors propagate to the
# caller.
#
# NOTE(review): css_selectors_to_be_ignored is not applied here, so
# paragraphs nested inside tables are still collected — confirm whether that
# is intended.
#
# @return [String] the extracted text (also assigned to @content)
def load_from_wikipedia
  # A raw space makes the URL invalid; Wikipedia page URLs use "_" in its place.
  page_title = @article_name.to_s.tr(' ', '_')
  url = "http://en.wikipedia.org/wiki/#{page_title}"

  # Kernel#open no longer opens URLs (removed in Ruby 3.0); URI.open from
  # open-uri is the supported call. The block form closes the handle once the
  # document has been parsed.
  html = URI.open(url) { |io| Nokogiri::HTML(io) }

  compound_selector = css_selectors_containing_readable_content.join(', ')
  @content = html.css(compound_selector).map(&:content).join(' ')
end