class WikipediaContentCleaner

Attributes

content[RW]

Public Class Methods

new(article_name) click to toggle source
# File lib/tellmeabout/wikipedia_content_cleaner.rb, line 7
# Creates a cleaner bound to one article.
#
# article_name - stored as-is in @article_name; load_from_wikipedia later
#                interpolates it into the article URL, so it is presumably
#                expected to be a URL-safe article title (TODO confirm
#                against callers).
#
# No content is fetched here; @content is left unset.
def initialize(article_name)
  @article_name = article_name
end

Public Instance Methods

css_selectors_containing_readable_content() click to toggle source
# File lib/tellmeabout/wikipedia_content_cleaner.rb, line 26
# Lists the CSS selectors whose matched elements are collected as content.
#
# load_from_wikipedia joins these with ', ' into one compound selector.
#
# Returns a new Array of selector Strings on every call.
def css_selectors_containing_readable_content
  first_heading    = '#firstHeading'
  body_paragraphs  = '#bodyContent #mw-content-text p'
  site_sub         = '#bodyContent #siteSub'
  second_level_hdr = 'h2'

  [first_heading, body_paragraphs, site_sub, second_level_hdr]
end
css_selectors_to_be_ignored() click to toggle source
# File lib/tellmeabout/wikipedia_content_cleaner.rb, line 35
# Lists the CSS selectors for elements that should be left out of the
# content. Currently only 'table'.
#
# NOTE(review): nothing in the visible part of this file calls this method;
# confirm whether the exclusion is applied elsewhere.
#
# Returns a new Array of selector Strings on every call.
def css_selectors_to_be_ignored
  %w[table]
end
delete_silent_substrings() click to toggle source
# File lib/tellmeabout/wikipedia_content_cleaner.rb, line 41
# Removes bracketed markers from @content: numeric markers such as "[12]"
# first, then the literal "[edit]".
#
# The two passes run in sequence on purpose: a "[edit]" that only appears
# after a numeric marker has been removed is still deleted by the second pass.
#
# Reassigns @content to the cleaned copy and returns it. Raises
# NoMethodError if @content is nil (i.e. nothing has been loaded yet).
#
# TODO: a substitution turning scientific notation ("<digits>.<digits>×10<digits>")
# into "... times ten to the ..." was sketched here but is disabled.
def delete_silent_substrings
  numeric_marker = /\[\d+\]/
  edit_marker    = /\[edit\]/

  @content = @content.gsub(numeric_marker, '').gsub(edit_marker, '')
end
load_from_wikipedia() click to toggle source
# File lib/tellmeabout/wikipedia_content_cleaner.rb, line 11
def load_from_wikipedia
  url = "http://en.wikipedia.org/wiki/#{@article_name}"
  html = Nokogiri::HTML(open(url))

  content_pieces = []

  compound_selector = css_selectors_containing_readable_content.join(', ')

  html.css(compound_selector).each do |element|
    content_pieces.push element.content
  end

  @content = content_pieces.join(' ')
end